def main(train_data, test_data):
    """Train an HMM tagger, tag the test sentences and print the scores.

    Prints progress messages, the sklearn classification report and the
    fraction of items whose predicted tag equals the reference tag.

    :param train_data: tagged sentences handed to
        ``HiddenMarkovModelTagger.train``.
    :param test_data: sentences made of ``(word, tag)`` pairs; iterated
        twice and measured with ``len()``.
    """
    # Every print passes one parenthesised value, which produces the same
    # text as the equivalent Python 2 print statement.
    print("Training")
    tagger = HiddenMarkovModelTagger.train(train_data)

    print("Predicting")
    predicted_labels = []
    for index, sentence in enumerate(test_data):
        # Progress line every 500 sentences.
        if index % 500 == 0:
            print("%d / %d" % (index, len(test_data)))
        words = [word for word, _ in sentence]
        predicted_labels.extend(tag for _, tag in tagger.tag(words))

    correct_labels = [tag for sentence in test_data for _, tag in sentence]

    from sklearn.metrics import classification_report
    print(classification_report(correct_labels, predicted_labels))

    matches = sum(
        1
        for predicted, gold in zip(predicted_labels, correct_labels)
        if predicted == gold
    )
    accuracy = float(matches) / len(correct_labels)
    print(" ".join(["Item accuracy:", str(accuracy)]))
def compare_taggers(train_data_Brown, train_data_Universal,
                    test_data_Brown, test_data_Universal):
    """Train one HMM tagger per tagset and evaluate each on its own test set.

    :param train_data_Brown: training sentences for the first tagger.
    :param train_data_Universal: training sentences for the second tagger.
    :param test_data_Brown: data passed to the first tagger's ``evaluate``.
    :param test_data_Universal: data passed to the second tagger's
        ``evaluate``.
    :return: ``(eval_Brown, eval_Universal, answer1, answer2)`` -- the two
        ``evaluate`` results followed by two free-text answers.
    """
    # Both taggers are trained before either one is evaluated.
    brown_tagger = HiddenMarkovModelTagger.train(train_data_Brown)
    universal_tagger = HiddenMarkovModelTagger.train(train_data_Universal)

    brown_score = brown_tagger.evaluate(test_data_Brown)
    universal_score = universal_tagger.evaluate(test_data_Universal)

    # The answer texts are returned to the caller verbatim.
    answer1 = (
        "Brown and Universal are training the same data size. "
        "Considering the brown tagset is larger than the universal tagset, "
        "they both train the same data size. "
        "As a result, universal produces more well trained set than the brown "
        "tagset, eventhough the increase in the states. "
        "Universal tagset contains more transitions and tags per possible "
        "state which creates a better observation set compares to the brown "
        "set."
    )
    answer2 = "..."
    return brown_score, universal_score, answer1, answer2
def __init__(self, mode, train_sents):
    """Build the tagger selected by ``mode`` and store it in ``self.tagger``.

    :param mode: ``TRIGRAM`` for a trigram tagger that backs off to a
        bigram and then a unigram tagger, or ``HDM`` for a hidden Markov
        model tagger.
    :param train_sents: tagged training sentences given to the taggers.
    :raises ValueError: if ``mode`` is neither ``TRIGRAM`` nor ``HDM``.
    """
    if mode == TRIGRAM:
        unigram = UnigramTagger(train_sents)
        bigram = BigramTagger(train_sents, backoff=unigram)
        self.tagger = TrigramTagger(train_sents, backoff=bigram)
    elif mode == HDM:
        # Compare against the requested mode; testing the bare constant
        # would take this branch for every non-TRIGRAM value.
        self.tagger = HiddenMarkovModelTagger.train(train_sents)
    else:
        raise ValueError("Unknown tagger mode: %r" % (mode,))
def main(train_data, test_data):
    """Fit an HMM tagger on ``train_data`` and report results on ``test_data``.

    Output, in order: progress messages, the sklearn classification
    report, and the item accuracy (matching tags / total tags).

    :param train_data: tagged sentences used for training.
    :param test_data: sentences of ``(word, tag)`` pairs; must support
        ``len()`` and repeated iteration.
    """
    # Single-value parenthesised prints emit the same text as the
    # corresponding Python 2 print statements.
    print("Training")
    hmm_tagger = HiddenMarkovModelTagger.train(train_data)

    print("Predicting")
    predicted_labels = []
    for position, tagged_sentence in enumerate(test_data):
        if not position % 500:
            print("%d / %d" % (position, len(test_data)))
        tokens = [token for token, _ in tagged_sentence]
        for _, guessed in hmm_tagger.tag(tokens):
            predicted_labels.append(guessed)

    # Reference tags, flattened in the same sentence order as the predictions.
    correct_labels = []
    for tagged_sentence in test_data:
        correct_labels.extend(gold for _, gold in tagged_sentence)

    from sklearn.metrics import classification_report
    print(classification_report(correct_labels, predicted_labels))

    hits = 0
    for guessed, gold in zip(predicted_labels, correct_labels):
        if guessed == gold:
            hits += 1
    print(" ".join(["Item accuracy:", str(float(hits) / len(correct_labels))]))
def _dict_to_object(dic):
    """Rebuild a ``HiddenMarkovModelTagger`` from its dictionary form.

    :param dic: mapping with the keys ``'states'``, ``'symbols'``,
        ``'priors'``, ``'outputs'`` and ``'transitions'``.  The first two
        are used as stored; the other three are converted with
        ``storage.dict_to_object`` first.
    :return: the reconstructed tagger.
    """
    from .storage import dict_to_object as restore

    state_names = dic['states']
    symbol_names = dic['symbols']
    prior_dist = restore(dic['priors'])
    output_dist = restore(dic['outputs'])
    transition_dist = restore(dic['transitions'])

    # Constructor order is symbols first, then states.
    return HiddenMarkovModelTagger(
        symbol_names, state_names, transition_dist, output_dist, prior_dist)
def build_manual():
    """Train an ``Hmm`` on the module-level example sentences.

    :return: the result of ``Hmm.train`` on the sentence list below.
    """
    # NOTE(review): there is no sen7 in this list -- confirm the gap between
    # sen6 and sen8 is intentional.
    seqs = [sen1, sen2, sen3, sen4, sen5, sen6, sen8, sen9, sen10, sen11]
    return Hmm.train(seqs)
def load(cls, filename):
    """
    Restore a diacritics restorer from a pickle file at the given path.

    The pickled object is read as a dict with the keys ``"n"`` (passed to
    the class constructor) and ``"tagger"``; the latter is a dict holding
    ``"_symbols"``, ``"_states"``, ``"_transitions"``, ``"_outputs"`` and
    ``"_priors"``, from which the HMM tagger is rebuilt.

    NOTE: unpickling can execute arbitrary code, so only load files that
    come from a trusted source.

    :param filename: load path
    :return: The loaded diacritics restorer object
    :rtype: HmmNgramRestorer
    """
    with open(filename, 'rb') as handle:
        payload = pickle.load(handle)

    restorer = cls(payload["n"])
    tagger_state = payload["tagger"]
    restorer.tagger = HiddenMarkovModelTagger(
        tagger_state["_symbols"],
        tagger_state["_states"],
        tagger_state["_transitions"],
        tagger_state["_outputs"],
        tagger_state["_priors"],
    )
    return restorer
def train(self, labeled_sequence):
    """Train a supervised HMM on ``labeled_sequence`` and keep it as ``self.tagger``.

    :param labeled_sequence: sentences made of ``(word, tag)`` pairs.
    """
    def estimator(fd, bins):
        # 0.1 is handed to LidstoneProbDist as its second argument.
        return LidstoneProbDist(fd, 0.1, bins)

    sentences = LazyMap(_identity, labeled_sequence)

    # Distinct words and distinct tags seen in the training sentences.
    vocabulary = unique_list(
        word for sentence in sentences for word, tag in sentence)
    tags = unique_list(
        tag for sentence in sentences for word, tag in sentence)

    trained = HiddenMarkovModelTrainer(tags, vocabulary).train_supervised(
        sentences, estimator=estimator)

    # Re-wrap the trained parameters in a tagger that uses the identity
    # transform.
    self.tagger = HiddenMarkovModelTagger(
        trained._symbols,
        trained._states,
        trained._transitions,
        trained._outputs,
        trained._priors,
        transform=_identity,
    )
def hmm(train_path, test_path):
    """Train an HMM tagger on one corpus, tag another, and print the results.

    Prints the training time, the testing time (both in whole
    milliseconds) and the classification report line by line.

    :param train_path: path given to ``gen_corpus`` for the training data.
    :param test_path: path given to ``gen_corpus`` for the test data; each
        item is unpacked as ``(words, tags)``.
    """
    training_sentences = list(gen_corpus(train_path))
    test_sentences = list(gen_corpus(test_path))

    # Training (the conversion of the sentences is part of the timed span).
    train_started = perf_counter()
    zipped_training = list(convert_sents_to_zipped(training_sentences))
    hmm_model = HiddenMarkovModelTagger.train(zipped_training)
    train_elapsed = perf_counter() - train_started
    print('Training took {} ms.'.format(int(train_elapsed * 1000)))

    # Evaluation
    test_started = perf_counter()
    y_pred = []
    y_true = []
    for words, tags in test_sentences:
        y_pred.extend(tag for _, tag in hmm_model.tag(words))
        y_true.extend(tags)
    test_elapsed = perf_counter() - test_started
    print('Testing took {} ms.'.format(int(test_elapsed * 1000)))

    report = classification_report(y_true, y_pred)
    for line in report.split('\n'):
        print(line)
def construct_hmm(japanese, english):
    """
    INPUT:  *Aligned* parallel arrays
    OUTPUT: An HMM

    The inputs are two parallel arrays of arrays, e.g.
    ``[['j1', 'j2'], ['j1', 'j2']]`` and ``[['e1', 'e2'], ['e1', 'e2']]``.
    They are combined position by position into one sequence per entry,
    ``[[(e1, j1), (e2, j2)], [(e1, j1), (e2, j2)]]`` -- the English item
    comes first in every pair -- and the result is passed to ``Hmm.train``.
    """
    print("[Start] Training HMM")

    training_data = []
    for index, j_word in enumerate(japanese):
        e_word = english[index]
        # Walk the positions of the Japanese entry; the English entry is
        # indexed with the same positions.
        pairs = [(e_word[pos], j_word[pos]) for pos in range(len(j_word))]
        training_data.append(pairs)

    model = Hmm.train(training_data)
    print("[ End ] Training HMM")
    return model
def procesado_bigram(texto_entrada):
    """Stub: ignores ``texto_entrada`` and returns 0."""
    return 0


def procesado_naive(texto_entrada):
    """Stub: ignores ``texto_entrada`` and returns 0."""
    return 0


##############################################################################
# Tagger training
if path.exists('spanish_hmm.plk'):
    # The cache file is written with dill.dump below, so it is read back
    # with the matching dill.load.
    with open('spanish_hmm.plk', 'rb') as pickle_file:
        hmm_tagger = dill.load(pickle_file)
else:
    # Train the HMM tagger and save it to a file for later runs.
    hmm_tagger = HiddenMarkovModelTagger.train(cess_sents)
    with open('spanish_hmm.plk', 'wb') as pickle_file:
        dill.dump(hmm_tagger, pickle_file)

# TODO: change this -> run the regex step first, then the rest in order...

# Main menu
print("Selección una Opción:")
print("1.Entrenamiento RegexParser.")
print("2.Test.")
print("3.Salir.")

opcion = input()

if int(opcion) == 1:
    print("Entrenando RegexParser...")
    train_regex(corpus_ejemplo)
def main():
    """Train an HMM decoder for a cipher folder and print the decoded test text.

    Command-line arguments:

    * ``cipher_folder`` -- cipher data folder.
    * ``--laplace`` / ``-laplace`` -- use ``LaplaceProbDist`` instead of
      ``MLEProbDist`` as the estimator.
    * ``--langmod`` / ``-lm`` -- re-estimate the priors and transitions from
      the supplemental data and rebuild the decoder with them, keeping the
      outputs of the supervised model.

    Prints the value returned by ``decoder.test(test_data)`` and then one
    decoded line per test sentence.
    """
    parser = argparse.ArgumentParser(description='Text decipher options')
    parser.add_argument('cipher_folder', help='cipher data folder')
    parser.add_argument('--laplace', '-laplace', action='store_true',
                        default=False, help='Laplace Smoothing')
    parser.add_argument('--langmod', '-lm', action='store_true',
                        default=False, help='Improved decoder')
    args = parser.parse_args()

    cipher_folder = args.cipher_folder
    laplace = args.laplace
    langmod = args.langmod

    number_of_supp_lines = 100  # the more lines, the slower the code

    train_data, test_data, train_plain = get_data(cipher_folder)
    preprocess_supp_data()
    supp_data = read_preprocessed_supp_data(number_of_supp_lines)

    # Add the training plaintext so that all transition evidence lives in
    # one place.
    # NOTE(review): extend() adds every element of the line as its own entry
    # of supp_data, and each entry is treated as a sequence by the counting
    # loop below -- confirm that append() was not intended.
    for line in train_plain:
        supp_data.extend(list(line))

    if laplace:
        smoothing = LaplaceProbDist
    else:
        smoothing = MLEProbDist

    trainer = hmm.HiddenMarkovModelTrainer()
    decoder = trainer.train_supervised(train_data, smoothing)

    # train_unsupervised is not used (the original author reports a bug in
    # it), so the transition data is updated manually below, following
    # NLTK's train_supervised.
    if langmod:
        states = decoder._states
        symbols = decoder._symbols
        outputs = decoder._outputs

        starting = FreqDist()
        transitions = ConditionalFreqDist()

        # Count each sequence exactly once.  Walking the corpus once per
        # entry would multiply every count by len(supp_data), which costs
        # quadratic time and waters down Laplace smoothing.
        for sequence in supp_data:
            lasts = None
            for state in sequence:
                if lasts is None:
                    starting[state] += 1
                else:
                    transitions[lasts][state] += 1
                lasts = state

        if laplace:
            estimator = LaplaceProbDist
        else:
            # MLE takes no bin count, hence the wrapper.
            estimator = lambda fdist, bins: MLEProbDist(fdist)

        N = len(states)
        pi = estimator(starting, N)
        A = ConditionalProbDist(transitions, estimator, N)

        # No new emission data: the outputs of the supervised model are
        # reused unchanged.
        decoder = HiddenMarkovModelTagger(symbols, states, A, outputs, pi)

    print(decoder.test(test_data))
    for sent in test_data:
        print("".join([y[1] for y in decoder.tag([x[0] for x in sent])]))
from nltk.tag.hmm import HiddenMarkovModelTagger, DictionaryConditionalProbDist, DictionaryProbDist


def prob(**kwargs):
    """Wrap the keyword arguments in a ``DictionaryProbDist``."""
    return DictionaryProbDist(kwargs)


def condprob(**kwargs):
    """Wrap the keyword arguments in a ``DictionaryConditionalProbDist``."""
    return DictionaryConditionalProbDist(kwargs)


# Hand-specified toy model: four words, four states.
hmm = HiddenMarkovModelTagger(
    symbols=["sound", "sounds", "nice", "dot"],
    states=["ADJ", "N", "V", "END"],
    transitions=condprob(
        ADJ=prob(N=0.4, V=0.4, END=0.2),
        N=prob(ADJ=0.2, V=0.7, END=0.1),
        V=prob(N=0.5, END=0.5),
    ),
    outputs=condprob(
        ADJ=prob(sound=0.3, nice=0.7),
        N=prob(sound=0.5, nice=0.5),
        V=prob(sound=0.8, sounds=0.2),
        END=prob(dot=1.0),
    ),
    priors=prob(ADJ=0.3, N=0.4, V=0.3, END=0),
)

# For each example sentence print the tags chosen by the model, then the
# model's probability once with every tag left as None and once with the
# chosen tags.  Each print passes a single value, which yields the same
# text as the equivalent Python 2 print statement.
for words in ["nice sound dot", "sound sounds nice dot", "sound sound sound dot"]:
    tokens = words.split()
    tagged = hmm.tag(tokens)
    print(" ".join(["Best tags:", str(tagged)]))
    untagged = [(w, None) for w in tokens]
    print(" ".join(["Forward probability:", str(hmm.probability(untagged))]))
    print(" ".join(["Sequence probability:", str(hmm.probability(tagged))]))
    print("")
# MODIFY: Comment/uncomment to modify features
# first_letters_counter = Counter([transform_states(list(n),transforms=TRANSFORM_METHOD)[0] for n in names])
first_letters_counter = Counter([
    transform_states(
        list(n), transforms=TRANSFORM_METHOD, ngram_length=NGRAM_LENGTH)[0]
    for n in names
])

# Prior of a state = share of names whose first transformed element is
# that state.
first_letters_total = sum(first_letters_counter.values())
for letter in first_letters_counter:
    priors[letter] = first_letters_counter[letter] / float(first_letters_total)

print(priors)
print(states)
priors = DictionaryProbDist(priors)

print("TRAINING")
tagger = HiddenMarkovModelTagger(symbols, states, transitions, outputs, priors)

# Both input files are read whole and split on newlines.
with open('data/input-data/leakedBits.txt', 'r') as inf:
    observations = inf.read().split('\n')

with open('data/input-data/names.txt', 'r') as inf:
    labels = inf.read().split('\n')
# Transition counts: every pair of consecutive items within a training row.
for row in X_train:
    previous = None
    for ch in row:
        if previous is not None:
            transitional[previous][ch] += 1
        previous = ch

# Emission counts: keyed by the second element of each pair, counting the
# first element.
for row in sequences:
    for pair in row:
        emissional[pair[1]][pair[0]] += 1

if improved_laplace:
    print("################## Laplace ####################### \n")
    estimator = nltk.probability.LaplaceProbDist
else:
    # MLE takes no bin count, hence the wrapper.
    estimator = lambda fdist, bins: MLEProbDist(fdist)

# The same bin count is used for the priors, transitions and emissions.
N = len(symbols)
PI = estimator(Pi, N)
A = ConditionalProbDist(transitional, estimator, N)
B = ConditionalProbDist(emissional, estimator, N)

# NOTE(review): NLTK's HiddenMarkovModelTagger takes (symbols, states, ...)
# in that order, while `states` is passed first here -- confirm the two
# names are not swapped.
tagger = HiddenMarkovModelTagger(states, symbols, A, B, PI)

print("\n ################## C{} Decryption Results #######################".format(int(i)))
for row in test_cipher:
    print(tagger.best_path(row))

print("\n ################## C{} Accuracy Results #######################".format(int(i)))
print(tagger.test(tester))
from celery.signals import celeryd_init
from multiprocessing import Pool

DIFF_THRESHOLD = 0.75
#twitter_stream = None

app = Celery('tasks', broker='redis://localhost:6379/0')

# Pass the credentials as keyword arguments so psycopg2 quotes each value;
# a hand-formatted DSN string breaks when a value contains spaces or quotes.
conn = psycopg2.connect(
    dbname=postgres_db, user=postgres_user, password=postgres_pass)
cur = conn.cursor()
# redis = redis.StrictRedis(host='localhost', port=6379, db=0)

# The tagger is trained once, when the module is loaded.
sents = conll2002.tagged_sents()
hmm_tagger = HiddenMarkovModelTagger.train(sents)
print('Tagger ready')


def analyze(text, track_list):
    """Report whether ``text`` contains a tracked token with a short 'N' tag.

    The text is tokenized and tagged with the module-level HMM tagger.  On
    the first token that is in ``track_list`` and whose tag starts with
    ``'N'`` and is at most two characters long, the text and the
    ``(token, tag)`` pair are printed and ``True`` is returned.

    :param text: text to analyze.
    :param track_list: container of tokens to look for.
    :return: ``True`` if such a token is found, ``False`` otherwise.
    """
    tokens = word_tokenize(text)
    tags = hmm_tagger.tag(tokens)
    for tag in tags:
        if tag[0] in track_list:
            if tag[1].startswith('N') and len(tag[1]) <= 2:
                print(text)
                print(tag)
                return True
                # NOTE(review): this break cannot run after the return
                # above -- confirm where it was meant to sit.
                break
    return False