def train_classifer_on_fold(essays_TD, essays_VD, regular_tags, fold):
    projection = lambda x: x
    if STEM:
        projection = stem

    # Start Training
    print("Fold %i Training code" % fold)

    # For training
    td_sents = to_label_powerset_tagged_sentences(essays_TD, regular_tags, projection=projection)
    vd_sents = to_label_powerset_tagged_sentences(essays_VD, regular_tags, projection=projection)

    trainer = HiddenMarkovModelTrainer()
    model = trainer.train_supervised(td_sents)

    td_predictions = model.tag_sents(to_sentences(td_sents))
    vd_predictions = model.tag_sents(to_sentences(vd_sents))

    # for evaluation - binary tags
    # YS (ACTUAL)
    wd_td_ys_bytag = to_flattened_binary_tags_by_code(td_sents, regular_tags)
    wd_vd_ys_bytag = to_flattened_binary_tags_by_code(vd_sents, regular_tags)

    # YS (PREDICTED)
    td_wd_predictions_by_code = to_flattened_binary_tags_by_code(td_predictions, regular_tags)
    vd_wd_predictions_by_code = to_flattened_binary_tags_by_code(vd_predictions, regular_tags)

    return wd_td_ys_bytag, wd_vd_ys_bytag, td_wd_predictions_by_code, vd_wd_predictions_by_code
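The label-powerset step above collapses each word's set of codes into one composite tag, so a standard single-label HMM can be trained on multi-label data. A minimal sketch of that encoding, with a hypothetical join-based scheme (the real to_label_powerset_tagged_sentences is project-specific):

# Minimal sketch of the label-powerset reduction; the join-based encoding
# is a hypothetical stand-in for the project's own helper.
def to_powerset_tag(codes):
    """Map a set of codes to one composite tag, e.g. {'C3', 'C1'} -> 'C1_C3'."""
    return "_".join(sorted(codes)) if codes else "O"

assert to_powerset_tag({"C3", "C1"}) == "C1_C3"
assert to_powerset_tag(set()) == "O"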
def train_classifer_on_fold(essays_TD, essays_VD, regular_tags, fold):
    projection = lambda x: x
    if STEM:
        projection = stem

    # Start Training
    print("Fold %i Training code" % fold)

    # Important - only compute code frequency from training data (NO CHEATING)
    code_freq = tally_code_frequencies(essays_TD)

    # For training
    td_sents = to_most_common_code_tagged_sentences(essays_TD, regular_tags, code_freq, projection=projection)
    vd_sents = to_most_common_code_tagged_sentences(essays_VD, regular_tags, code_freq, projection=projection)

    trainer = HiddenMarkovModelTrainer()
    model = trainer.train_supervised(td_sents)

    td_predictions = model.tag_sents(to_sentences(td_sents))
    vd_predictions = model.tag_sents(to_sentences(vd_sents))

    # for evaluation - binary tags
    # YS (ACTUAL)
    td_sents_pset = to_label_powerset_tagged_sentences(essays_TD, regular_tags)
    vd_sents_pset = to_label_powerset_tagged_sentences(essays_VD, regular_tags)

    wd_td_ys_bytag = to_flattened_binary_tags_by_code(td_sents_pset, regular_tags)
    wd_vd_ys_bytag = to_flattened_binary_tags_by_code(vd_sents_pset, regular_tags)

    # YS (PREDICTED)
    td_wd_predictions_by_code = to_flattened_binary_tags_by_code(td_predictions, regular_tags)
    vd_wd_predictions_by_code = to_flattened_binary_tags_by_code(vd_predictions, regular_tags)

    return wd_td_ys_bytag, wd_vd_ys_bytag, td_wd_predictions_by_code, vd_wd_predictions_by_code
def indivHMM(bambara):
    tag_set = set()
    symbols = set()
    for i in bambara.train_sents:
        for j in i:
            tag_set.add(j[1])
            symbols.add(j[0])
    trainer = HiddenMarkovModelTrainer(list(tag_set), list(symbols))
    hmm = trainer.train_supervised(bambara.train_sents,
                                   estimator=lambda fd, bins: LidstoneProbDist(fd, 0.1, bins))
    print("HMM accuracy:", hmm.evaluate(bambara.test_sents))
    return hmm
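This and several later snippets pass estimator=lambda fd, bins: LidstoneProbDist(fd, 0.1, bins) to train_supervised. A self-contained check of what that smoothing does to unseen events:

# Lidstone smoothing adds gamma (here 0.1) to every count, so unseen
# symbols get a small nonzero probability instead of zero.
from nltk import FreqDist
from nltk.probability import LidstoneProbDist

fd = FreqDist({"a": 3, "b": 1})
dist = LidstoneProbDist(fd, 0.1, bins=3)  # 3 bins: "a", "b", and one unseen
print(dist.prob("c"))  # (0 + 0.1) / (4 + 3 * 0.1) ≈ 0.023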
def train(train_set, word_types, tag_set):
    """
    Training...
    Called this way, the HMM knows the whole set of tags and the whole set
    of words (no "unknown" word and/or tag during test)
    """
    # tag_set and word_types are sets: I need to create lists
    trainer = HiddenMarkovModelTrainer(list(tag_set), list(word_types))
    # GoodTuring smoothing
    # see: https://nltk.googlecode.com/svn/trunk/doc/api/nltk.probability.SimpleGoodTuringProbDist-class.html
    # http://en.wikipedia.org/wiki/Additive_smoothing
    hmm = trainer.train_supervised(train_set,
                                   estimator=lambda fd, bins: SimpleGoodTuringProbDist(fd, bins))
    return hmm
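A minimal usage sketch for this train() helper, assuming NLTK's treebank sample is available (nltk.download('treebank')); the slice size and test sentence are arbitrary:

from nltk.corpus import treebank

tagged = treebank.tagged_sents()[:500]
word_types = {w for sent in tagged for w, _ in sent}
tag_set = {t for sent in tagged for _, t in sent}

hmm = train(tagged, word_types, tag_set)
print(hmm.tag(["The", "market", "rallied", "."]))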
def train(self, labeled_sequence):
    def estimator(fd, bins):
        return LidstoneProbDist(fd, 0.1, bins)

    labeled_sequence = LazyMap(_identity, labeled_sequence)
    symbols = unique_list(word for sent in labeled_sequence for word, tag in sent)
    tag_set = unique_list(tag for sent in labeled_sequence for word, tag in sent)

    trainer = HiddenMarkovModelTrainer(tag_set, symbols)
    hmm = trainer.train_supervised(labeled_sequence, estimator=estimator)

    # Rebuild the tagger with a plain module-level transform, presumably so
    # the trained model can be serialized
    hmm = HiddenMarkovModelTagger(
        hmm._symbols,
        hmm._states,
        hmm._transitions,
        hmm._outputs,
        hmm._priors,
        transform=_identity,
    )
    self.tagger = hmm
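The method above relies on two helpers defined outside the excerpt; plausible reconstructions are shown below (the names match the snippet, but these bodies are not from the original source):

# Plausible reconstructions of the assumed helpers.
from nltk.util import LazyMap  # the LazyMap used above comes from nltk.util

def _identity(x):
    return x

def unique_list(items):
    """Drop duplicates while preserving first-seen order."""
    seen = set()
    return [x for x in items if not (x in seen or seen.add(x))]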
vd_sents_by_code = to_tagged_sentences_by_code(essays_VD, regular_tags, projection=projection)

code2model = dict()
fold_models.append(code2model)

wd_td_ys_bytag = dict()
wd_vd_ys_bytag = dict()
td_wd_predictions_by_code = dict()
vd_wd_predictions_by_code = dict()

for code in sorted(regular_tags):
    print("Fold %i Training code: %s" % (fold, code))
    td, vd = td_sents_by_code[code], vd_sents_by_code[code]

    trainer = HiddenMarkovModelTrainer()
    model = trainer.train_supervised(td)
    code2model[code] = model

    wd_td_ys_bytag[code] = to_flattened_binary_tags(td)
    wd_vd_ys_bytag[code] = to_flattened_binary_tags(vd)

    td_predictions = model.tag_sents(to_sentences(td))
    vd_predictions = model.tag_sents(to_sentences(vd))

    td_wd_predictions_by_code[code] = to_flattened_binary_tags(td_predictions)
    vd_wd_predictions_by_code[code] = to_flattened_binary_tags(vd_predictions)

merge_dictionaries(wd_td_ys_bytag, cv_wd_td_ys_by_tag)
merge_dictionaries(wd_vd_ys_bytag, cv_wd_vd_ys_by_tag)
merge_dictionaries(td_wd_predictions_by_code, cv_wd_td_predictions_by_tag)
merge_dictionaries(vd_wd_predictions_by_code, cv_wd_vd_predictions_by_tag)
# list(zip(...)) so each sequence survives repeated iteration under Python 3
for i in range(len(C1_train_cipher)):
    C1_sequences.append(list(zip(C1_train_cipher[i], C1_train_plain[i])))
for i in range(len(C2_train_cipher)):
    C2_sequences.append(list(zip(C2_train_cipher[i], C2_train_plain[i])))
for i in range(len(C3_train_cipher)):
    C3_sequences.append(list(zip(C3_train_cipher[i], C3_train_plain[i])))

trainer = HiddenMarkovModelTrainer(symbols, states)

print("################## Analysis of Ciphers without improved Plaintext modelling ####################### \n")
if laplace_mode:
    print("################## Laplace ####################### \n")
    C1_tagger = trainer.train_supervised(C1_sequences, estimator=nltk.probability.LaplaceProbDist)
    C2_tagger = trainer.train_supervised(C2_sequences, estimator=nltk.probability.LaplaceProbDist)
    C3_tagger = trainer.train_supervised(C3_sequences, estimator=nltk.probability.LaplaceProbDist)
else:
    C1_tagger = trainer.train_supervised(C1_sequences)
    C2_tagger = trainer.train_supervised(C2_sequences)
    C3_tagger = trainer.train_supervised(C3_sequences)

C1_tester = []
C2_tester = []
C3_tester = []
for i in range(len(C1_test_cipher)):
    C1_tester.append(list(zip(C1_test_cipher[i], C1_test_plain[i])))
for i in range(len(C2_test_cipher)):
    C2_tester.append(list(zip(C2_test_cipher[i], C2_test_plain[i])))
for i in range(len(C3_test_cipher)):
    C3_tester.append(list(zip(C3_test_cipher[i], C3_test_plain[i])))
wd_td_ys_bytag = dict()
wd_vd_ys_bytag = dict()
td_wd_predictions_by_code = dict()
vd_wd_predictions_by_code = dict()

for code in sorted(regular_tags):
    print("Fold %i Training code: %s" % (fold, code))
    td, vd = td_sents_by_code[code], vd_sents_by_code[code]

    hmm_fname = "%s_cv-%i_fold-%i_code-%s_stemed-%s.dill" % (hmm_model_prefix, CV_FOLDS, fold, code, str(STEM))
    if os.path.exists(hmm_fname):
        with open(hmm_fname, "rb") as f:
            base_tagger = dill.load(f)
    else:
        hmm_trainer = HiddenMarkovModelTrainer()
        base_tagger = hmm_trainer.train_supervised(td)
        with open(hmm_fname, "wb") as f:
            dill.dump(base_tagger, f)

    # See: http://streamhacker.com/2008/12/03/part-of-speech-tagging-with-nltk-part-3/
    # and http://streamhacker.com/2014/12/02/nltk-3/ for changes to interface
    trainer = BrillTaggerTrainer(base_tagger, templates, deterministic=True)
    model = trainer.train(td, max_rules=MAX_RULES, min_score=MIN_SCORE)
    code2model[code] = model

    wd_td_ys_bytag[code] = to_flattened_binary_tags(td)
    wd_vd_ys_bytag[code] = to_flattened_binary_tags(vd)

    td_predictions = model.tag_sents(to_sentences(td))
    vd_predictions = model.tag_sents(to_sentences(vd))
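The `templates` variable is defined outside this excerpt; a plausible stand-in is NLTK's built-in Brill template set:

# Hypothetical definition of `templates`; brill24() is NLTK's standard
# 24-template set for Brill training (nltkdemo18() is a smaller option).
from nltk.tag.brill import brill24

templates = brill24()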
        print(i, j)
        if HMM._tag_to_index[pred_ys[j]] == HMM._tag_to_index[Ys[i][j]]:
            c += 1
        total += 1

# print(confuse.shape)
# df = pd.DataFrame(confuse)
# # columns=list(self._tag_to_index.keys()), index=list(self._tag_to_index.keys())
# sn.heatmap(df)
# plt.show()
# print(confuse)

A = sum([confuse[i][i] for i in range(len(HMM._index_to_tag))]) / total

from nltk.tag.hmm import HiddenMarkovModelTrainer

trainer = HiddenMarkovModelTrainer()
tagger = trainer.train_supervised(data)

c = 0
total = 0
# print(len(tagger._symbols))
# print(len(HMM._index_to_word))
# print(len(tagger._states))
# print(len(HMM._index_to_tag))

print([k[1] for k in HMM.predict(Xs[27])])
print([k[1] for k in tagger.tag(Xs[27])])
print(Ys[27])

for i in range(len(Xs)):
    pred_ys = [k[1] for k in tagger.tag(Xs[i])]
    for j in range(len(Xs[i])):
        # confuse[tagger._tag_to_index[pred_ys[j]]][tagger._tag_to_index[Ys[i][j]]] += 1
        if pred_ys[j] == Ys[i][j]:
def trainALL(self, last):
    self.split_into_folds()
    for k in range(1, (self.folds + 1)):
        train_sents = sum(self.foldlist[: (self.folds - 1)], [])

        crf = CRFTagger(training_opt={"max_iterations": 100, "max_linesearch": 10, "c1": 0.0001, "c2": 1.0})
        crf_trained = crf.train(
            train_sents,
            "Models/model.crfCrossValidation1" + str(k) + self.option_tone + self.option_tag + ".tagger",
        )
        print(str(k) + " fold: crf")

        tnt_tagger = tnt.TnT(unk=DefaultTagger("n"), Trained=True, N=100)
        tnt_tagger.train(train_sents)
        print(str(k) + " fold: tnt")

        tag_set = set()
        symbols = set()
        for i in train_sents:
            for j in i:
                tag_set.add(j[1])
                symbols.add(j[0])
        trainer = HiddenMarkovModelTrainer(list(tag_set), list(symbols))
        hmm = trainer.train_supervised(train_sents, estimator=lambda fd, bins: LidstoneProbDist(fd, 0.1, bins))
        print(str(k) + " fold: hmm")

        if last == "U":
            lasttagger = UnigramTagger(train_sents, backoff=DefaultTagger("n"))
            print(str(k) + " fold: unigram")
        if last == "B":
            if self.option_tone == "tonal" and self.option_tag == "Affixes":
                regex = RegexpTonalSA(DefaultTagger("n"))
            if self.option_tone == "tonal" and self.option_tag == "POS":
                regex = RegexpTonal(DefaultTagger("n"))
            if self.option_tone == "nontonal" and self.option_tag == "Affixes":
                regex = RegexpSA(DefaultTagger("n"))
            if self.option_tone == "nontonal" and self.option_tag == "POS":
                regex = Regexp(DefaultTagger("n"))
            dic = dictionary_backoff(self.option_tone, regex)
            affix = AffixTagger(train_sents, min_stem_length=0, affix_length=-4, backoff=dic)
            lasttagger = BigramTagger(train_sents, backoff=affix)
            print(str(k) + " fold: bigram")

        to_tag = [untag(i) for i in self.foldlist[self.folds - 1]]
        self.crf_tagged += crf.tag_sents(to_tag)
        self.tnt_tagged += tnt_tagger.tag_sents(to_tag)
        self.hmm_tagged += hmm.tag_sents(to_tag)
        self.lasttagger_tagged += lasttagger.tag_sents(to_tag)
        self.org_tagged += self.foldlist[self.folds - 1]
        self.foldlist = [self.foldlist[self.folds - 1]] + self.foldlist[: (self.folds - 1)]

    self.crf = crf
    self.tnt = tnt_tagger
    self.hmm = hmm
    self.lasttagger = lasttagger

    org_words = sum(self.org_tagged, [])
    self.crf_avg_acc = accuracy(org_words, sum(self.crf_tagged, []))
    self.tnt_avg_acc = accuracy(org_words, sum(self.tnt_tagged, []))
    self.hmm_avg_acc = accuracy(org_words, sum(self.hmm_tagged, []))
    self.lasttagger_avg_acc = accuracy(org_words, sum(self.lasttagger_tagged, []))

    print("Accuracy of concatenated crf-tagged sentences: ", self.crf_avg_acc)
    print("Accuracy of concatenated tnt-tagged sentences: ", self.tnt_avg_acc)
    print("Accuracy of concatenated hmm-tagged sentences: ", self.hmm_avg_acc)
    print("Accuracy of concatenated " + last + "-tagged sentences: ", self.lasttagger_avg_acc)

    (self.crf_tagprecision, self.crf_tagrecall) = self.tagprecision_recall(crf, self.crf_tagged, self.org_tagged)
    (self.tnt_tagprecision, self.tnt_tagrecall) = self.tagprecision_recall(tnt_tagger, self.tnt_tagged, self.org_tagged)
    (self.hmm_tagprecision, self.hmm_tagrecall) = self.tagprecision_recall(hmm, self.hmm_tagged, self.org_tagged)
    (self.lasttagger_tagprecision, self.lasttagger_tagrecall) = self.tagprecision_recall(lasttagger, self.lasttagger_tagged, self.org_tagged)

    self.org_tagged = []
    self.foldlist = []
    for i in range(1, self.folds + 1):
        self.foldlist.append(self.create_fold(i))
            sentences.append(sentence)
            sentence = []
        word = [conllu_array[1], conllu_array[3]]
        sentence.append((conllu_array[1], conllu_array[3]))
    return sentences

sentences = tokenize_conllu_file('../en-ud-dev.conllu')
cutoff = int(.9 * len(sentences))
training_sentences = sentences[:cutoff]
test_sentences = sentences[cutoff:]

print('Training Sentences : %d ' % (len(training_sentences)))
print('Testing Sentences : %d ' % (len(test_sentences)))

print('Training Start')
trainer = HiddenMarkovModelTrainer()
tagger = trainer.train_supervised(training_sentences,
                                  estimator=lambda fd, bins: LidstoneProbDist(fd, 0.1, bins))
print('Training Completed')

print('Testing Start')
tagger.test(test_sentences, verbose=False)
print('Testing Completed')

import dill
with open('my_tagger.dill', 'wb') as f:
    dill.dump(tagger, f)
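Restoring the tagger in a later session mirrors the dump; dill is used rather than pickle because it can serialize the function objects (such as the estimator lambda) that the trained model may hold:

# Reload the serialized tagger (mirror of the dump above).
import dill

with open('my_tagger.dill', 'rb') as f:
    tagger = dill.load(f)
print(tagger.tag("This is a test".split()))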
def train_hmm_model(labeled_names):
    states = ["O", "C"]
    symbols = list(set(ss[0] for sss in labeled_names for ss in sss))
    hmm_trainer = HiddenMarkovModelTrainer(states=states, symbols=symbols)
    # labeled_names is already a list of tagged sequences (see the symbols
    # comprehension above), so pass it directly; wrapping it in another list
    # would train on a single malformed "sentence"
    hmm = hmm_trainer.train_supervised(labeled_names)
    return hmm
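A hypothetical call, assuming each element of labeled_names is a character-level sequence of (symbol, state) pairs over the two states "O" and "C":

# Hypothetical toy input; the real labeling scheme behind "O"/"C" is an
# assumption here, not taken from the original source.
labeled_names = [
    [("a", "O"), ("n", "O"), ("n", "C")],
    [("b", "O"), ("e", "O"), ("n", "C")],
]
hmm = train_hmm_model(labeled_names)
print(hmm.tag(list("ann")))  # -> [('a', tag), ('n', tag), ('n', tag)]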
def backoff_tagger(num_tagger, bambara, option_tones="tonal", option_tag="POS", backoff=defaultTagger):
    """backoff_tagger of the NLTK cookbook [adapted]"""
    taggers = []
    for i in num_tagger:
        if i == 0:
            taggers = taggers + [UnigramTagger]
        if i == 1:
            taggers = taggers + [BigramTagger]
        if i == 2:
            taggers = taggers + [TrigramTagger]
        if i == 3:
            taggers = taggers + [QuadgramTagger]
        if i == 4:
            taggers += ["crf"]
        if i == 5:
            taggers += ["regexp"]
        if i == 6:
            taggers += ["dic"]
        if i == 8:
            taggers += ["affix"]
        if i == 9:
            taggers += ["tnt"]
        if i == 10:
            taggers += ["hmm"]

    # CRF and HMM both do not accept a backoff and therefore can only be the
    # last tagger in a backoff chain -> DefaultTagger has to be substituted
    if "hmm" in taggers:
        tag_set = set()
        symbols = set()
        for i in bambara.train_sents:
            for j in i:
                tag_set.add(j[1])
                symbols.add(j[0])
        trainer = HiddenMarkovModelTrainer(list(tag_set), list(symbols))
        hmm = trainer.train_supervised(bambara.train_sents,
                                       estimator=lambda fd, bins: LidstoneProbDist(fd, 0.1, bins))
        backoff = hmm
        taggers.remove("hmm")
    if "crf" in taggers:
        backoff = indivCRF(bambara, tone, tag)
        backoff.train(bambara.train_sents, "model.crfbackoff" + option_tag + option_tones + ".tagger")
        backoff.set_model_file("model.crfbackoff" + option_tag + option_tones + ".tagger")
        taggers.remove("crf")

    for cls in taggers:
        if cls != "tnt" and cls != "affix" and cls != "regexp" and cls != "dic":
            backoff1 = backoff
            backoff = cls(bambara.train_sents, backoff=backoff1)
            # print(backoff._taggers)
        else:
            if cls == "dic":
                backoff = dictionary_backoff(option_tones, backoff=backoff)
            if cls == "regexp":
                if option_tones == "tonal" and option_tag == "Affixes":
                    backoff = RegexpTonalSA(backoff=backoff)
                if option_tones == "tonal" and option_tag == "POS":
                    backoff = RegexpTonal(backoff=backoff)
                if option_tones == "nontonal" and option_tag == "Affixes":
                    backoff = RegexpSA(backoff=backoff)
                if option_tones == "nontonal" and option_tag == "POS":
                    backoff = Regexp(backoff=backoff)
            if cls == "affix":
                backoff = AffixTagger(bambara.train_sents, min_stem_length=0, affix_length=-4, backoff=backoff)
            if cls == "tnt":
                backoff = tnt.TnT(unk=backoff, Trained=True, N=100)
                backoff.train(bambara.train_sents)
    return backoff
# coding: utf-8
from nltk.tag.hmm import HiddenMarkovModelTrainer

line_count = 0
pairs = []
with open("../data/may_norm_sentences.txt", "r", encoding="utf-8") as f:
    for line in f:
        if line_count % 1000 == 0:
            print(line_count)
        line_count += 1
        splitted = line.strip().split(';')
        pairs.append(list(zip(splitted[1].split(" "), splitted[0].split(" "))))

trainer = HiddenMarkovModelTrainer()
tagger = trainer.train_supervised(pairs)

print("-".join(tagger.best_path("* я везти сегодня **".split())))
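Note that best_path() returns only the most likely (Viterbi) tag sequence, while tag() pairs each input token with its tag:

# Continuing from the trained tagger above; outputs shown are illustrative.
tokens = "* я везти сегодня **".split()
print(tagger.best_path(tokens))  # tags only, e.g. ['t1', 't2', ...]
print(tagger.tag(tokens))        # [(token, tag), ...]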
test_tagged_corpus = []
for s, st in zip(test_cipher, test_plain):
    sample = list(zip(s, st))
    test_tagged_corpus.append(sample)

if args['laplace']:
    Estimator = LaplaceProbDist
    print_estimator = 'Laplace'  # just for printing
else:
    Estimator = MLEProbDist
    print_estimator = 'MLE'  # just for printing

# /////////////// Train and test the MLE and Laplace estimators /////////////////
# training
HMM_tagger = HiddenMarkovModelTrainer(states=States, symbols=Symbols)
HMM_tagger = HMM_tagger.train_supervised(train_tagged_corpus, estimator=Estimator)
print(HMM_tagger)

# /////////////////////// TEXT IMPROVEMENT /////////////////////////////
if args['lm']:
    # get additional text
    # Text number 2554: English translation of Crime and Punishment
    bigrams = get_bigram(train_plain, url='http://www.gutenberg.org/files/2554/2554-0.txt')
    # conditional freq dist
    cfd = ConditionalFreqDist(bigrams)
    # Conditional probability distribution
    cpd = nltk.ConditionalProbDist(cfd, Estimator)
    Trainer = nltk.tag.hmm.HiddenMarkovModelTagger(states=States,
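The effect of the args['laplace'] switch is easy to see on a toy frequency distribution; train_supervised calls the estimator as estimator(fd, bins) internally:

# MLE gives unseen symbols zero probability; Laplace reserves mass for them.
from nltk import FreqDist
from nltk.probability import LaplaceProbDist, MLEProbDist

fd = FreqDist("abracadabra")  # 11 observations
print(MLEProbDist(fd).prob("z"))               # 0.0
print(LaplaceProbDist(fd, bins=26).prob("z"))  # (0 + 1) / (11 + 26) ≈ 0.027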
# # possible_weigths[random.randint(0,12)], possible_weigths[random.randint(0,12)],
# # possible_weigths[random.randint(0,12)], possible_weigths[random.randint(0,12)],
# # possible_weigths[random.randint(0,12)]]
weights = [4, 15, 3, 0.5, 1, 0.1, 3, 2, 5, 0]
kernel = possible_kernels[a1]
degree = possible_degree[a2]
epsilon = possible_epsilon[a3]
C = possible_C[a4]
n_components_LDA = 3

# HMM for feature 9
trainer = HiddenMarkovModelTrainer()
st = 3000
train_data = treebank.tagged_sents()[:st]
HMM = trainer.train_supervised(train_data)

# Read training examples and training labels
N_features = len(weights)
frases1_train, frases2_train, Y_train = read_training_datasets()
N_instances_train = len(Y_train)

# Compute features (X_train)
X_train = np.zeros((N_instances_train, 1))
X_train = compute_feature1(frases1_train, frases2_train, X_train)
X_train = compute_feature2(frases1_train, frases2_train, X_train)
X_train = compute_feature3(frases1_train, frases2_train, X_train)
X_train = compute_feature4(frases1_train, frases2_train, X_train)
X_train = compute_feature5(frases1_train, frases2_train, X_train)
X_train = compute_feature6(frases1_train, frases2_train, X_train)