def _train(cls, labeled_sequence, test_sequence=None, unlabeled_sequence=None,
           transform=_identity, estimator=None, **kwargs):
    if estimator is None:
        def estimator(fd, bins):
            return LidstoneProbDist(fd, 0.1, bins)
    labeled_sequence = LazyMap(transform, labeled_sequence)
    symbols = unique_list(word for sent in labeled_sequence
                          for word, tag in sent)
    tag_set = unique_list(tag for sent in labeled_sequence
                          for word, tag in sent)
    trainer = HiddenMarkovModelTrainer(tag_set, symbols)
    hmm = trainer.train_supervised(labeled_sequence, estimator=estimator)
    hmm = cls(hmm._symbols, hmm._states, hmm._transitions, hmm._outputs,
              hmm._priors, transform=transform)
    if test_sequence:
        hmm.test(test_sequence, verbose=kwargs.get('verbose', False))
    if unlabeled_sequence:
        max_iterations = kwargs.get('max_iterations', 5)
        hmm = trainer.train_unsupervised(unlabeled_sequence, model=hmm,
                                         max_iterations=max_iterations)
        if test_sequence:
            hmm.test(test_sequence, verbose=kwargs.get('verbose', False))
    return hmm
def train_unsupervised(labeled_sents, unlabeled_sents, max_iterations=3):
    symbols = unique_list(word for sent in labeled_sents for word, tag in sent)
    # Extend symbols with those in the unlabelled set
    symbols = unique_list(symbols + unique_list(
        word for sent in unlabeled_sents for word in sent))
    tag_set = unique_list(tag for sent in labeled_sents for word, tag in sent)
    trainer = HiddenMarkovModelTrainer(tag_set, symbols)
    print("Supervised training for initialization ({} sentences)".format(
        len(labeled_sents)))
    hmm = trainer.train_supervised(
        labeled_sents,
        estimator=lambda fd, bins: LidstoneProbDist(fd, 0.1, bins))
    # The unlabeled sentences are expected to have tags, which are ignored
    unlabeled_sents = [[(word, None) for word in sent] for sent in unlabeled_sents]
    print("Unsupervised training ({} sentences) for up to {} iterations".format(
        len(unlabeled_sents), max_iterations))
    hmm = trainer.train_unsupervised(unlabeled_sents, model=hmm,
                                     max_iterations=max_iterations, verbose=True)
    return hmm
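# A minimal usage sketch for the helper above (not part of the original function).
# It assumes NLTK with the Brown corpus is installed and that unique_list,
# HiddenMarkovModelTrainer and LidstoneProbDist are imported as the function expects;
# the corpus slice and 200/100 split are illustrative only.
import nltk

tagged = nltk.corpus.brown.tagged_sents(categories='news')[:300]
labeled_sents = tagged[:200]
# Drop the gold tags so the remaining sentences act as unlabeled data.
unlabeled_sents = [[word for (word, tag) in sent] for sent in tagged[200:]]

model = train_unsupervised(labeled_sents, unlabeled_sents, max_iterations=3)
print(model.tag("the jury praised the administration".split()))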
def __init__(self, symbols, states, transitions, outputs, priors,
             transform=_identity):
    self._symbols = unique_list(symbols)
    self._states = unique_list(states)
    self._transitions = transitions
    self._outputs = outputs
    self._priors = priors
    self._cache = None
    self._transform = transform
def hmmFactory(trainType, string):
    """
    Creates an HMM according to its training type and the given corpus.

    :type trainType: str
    :type string: str
    :rtype: HmmWrapper
    """
    if trainType == 'super':  # supervised
        logging.info('Supervised')
        logging.info('Filtering corpus...')
        corpus = tag(string)
        states = unique_list(tag for sent in corpus for (_, tag) in sent)
        symbols = unique_list(word for sent in corpus for (word, _) in sent)
        logging.info('Training hmm...')
        trainer = HmmWrapper(states, symbols)
        trainer.trainSupervised(corpus)
        return trainer
    elif trainType == 'unsuper':  # unsupervised
        logging.info('Unsupervised')
        logging.info('Filtering corpus')
        corpus = tagEmpty(string)
        states = range(5)
        symbols = unique_list(word for sent in corpus for (word, _) in sent)
        logging.info('Training hmm...')
        trainer = HmmWrapper(states, symbols)
        trainer.trainUnsupervised(corpus)
        return trainer
    else:  # chunked supervised
        logging.info('Chunked supervised')
        logging.info('Training chunk parser...')
        chunker = ChunkWrapper()
        logging.info('Chunking corpus...')
        corpus = tagChunk(string, chunker)
        states = unique_list(tag for sent in corpus for (_, tag) in sent)
        symbols = unique_list(word for sent in corpus for (word, _) in sent)
        logging.info('Training hmm...')
        trainer = HmmWrapper(states, symbols)
        trainer.trainSupervised(corpus)
        return trainer
def index(request):
    if request.method == "POST":
        if request.POST.get("tokens"):
            with open(settings.BASE_DIR+"/data/corpus.pkl", 'rb') as handle:
                corpus = pickle.load(handle)
            tokens = ast.literal_eval(request.POST.get("tokens"))
            tagged = []
            i = 1
            for item in tokens:
                tagged.append((item, request.POST.get("token_"+str(i))))
                i += 1
            if tagged not in corpus:
                corpus.append(tagged)
                with open(settings.BASE_DIR+"/data/corpus.pkl", 'wb') as handle:
                    pickle.dump(corpus, handle)
                tag_set = unique_list(tag for sent in corpus for (word, tag) in sent)
                symbols = unique_list(word for sent in corpus for (word, tag) in sent)
                trainer = HiddenMarkovModelTrainer(tag_set, symbols)
                hmm = trainer.train_supervised(corpus, estimator=LaplaceProbDist)
                with open(settings.BASE_DIR+"/data/hmm.pkl", 'wb') as handle:
                    pickle.dump(hmm, handle)
            return render(request, 'tagger/index.html', {'corpus': corpus})
        else:
            if request.POST.get("random") == 'true':
                address = get_random_address()
                if not address:
                    return render(request, 'tagger/index.html',
                                  {'error_message': 'No random addresses left'})
            else:
                address = request.POST.get("address")
            tokens = regexp_tokenize(address, pattern=r'\d+|[^\r\n\t\f 0-9,]+|,')
            if tokens:
                pkl_file = open(settings.BASE_DIR+"/data/hmm.pkl", 'rb')
                hmm = pickle.load(pkl_file)
                pkl_file.close()
                tagged = hmm.tag(tokens)
                tags_file = open(settings.BASE_DIR+"/data/tags.json", 'rb')
                reader = codecs.getreader("utf-8")
                tags = json.load(reader(tags_file))
                tags_file.close()
                return render(request, 'tagger/index.html',
                              {'address': address,
                               'tokens': tokens,
                               'tagged': tagged,
                               'tags': sorted(tags.items(), key=operator.itemgetter(1))})
    return render(request, 'tagger/index.html', {})
def bigramTags(a):
    # create list of all tags
    tags = [x[1] for x in a]
    # create list of tag bigrams
    btags = [(tags[i], tags[i + 1]) for i in range(len(tags) - 1)]
    # create frequency distribution of bigram tags
    btagsf = FreqDist(btags)
    # create list of unique bigram tags
    btagscombo = [(x, y) for x in unique_list(tags) for y in unique_list(tags)]
    out = []
    # loop through unique bigram tags
    for i in range(len(btagscombo)):
        # add bigram tag with frequency probability to list
        out.append((btagscombo[i], btagsf.freq(btagscombo[i])))
    return out
def unigramtrainingset(a):
    # create frequency distribution of word, tag pairs in the training set
    fd = FreqDist(a)
    # separate words from tags
    x = [y[0] for y in a]
    # create frequency distribution of words in the training set
    fd2 = FreqDist(x)
    # create list of unique words
    words = unique_list([x[0] for x in fd])
    # create list of unique tags (all possible tags)
    tags = ['NOUN', 'ADP', 'ADV', 'NUM', 'VERB', '.', 'PRON', 'DET', 'ADJ',
            'PRT', 'CONJ']
    # initialise output list
    out = []
    # loop through each unique word
    for word in words:
        # reinitialise tagso list
        tagso = []
        # store frequency of current word
        denom = fd2.freq(word)
        # loop through each tag
        for tag in tags:
            # compute probability of current tag being paired with current word
            prob = fd.freq((word, tag)) / denom
            # create list of tag, probability pairs
            tagso.append((tag, prob))
        # append word, tag-probabilities to out list
        out.append((word, tagso))
    return out
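# Hedged illustration of the two helpers above; the tiny tagged sample is made up
# for demonstration and assumes FreqDist and unique_list are imported as the
# functions themselves expect.
sample = [('the', 'DET'), ('dog', 'NOUN'), ('barks', 'VERB'),
          ('the', 'DET'), ('cat', 'NOUN')]
print(bigramTags(sample))          # e.g. (('DET', 'NOUN'), 0.5) appears in the output
print(unigramtrainingset(sample))  # per-word lists of (tag, probability) pairs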
def train_hmm_word(sent, train_data):
    symbols = unique_list(word for sent in train_data for word, tag in sent)
    # List of tags: gives back ["B", "I", "O"]
    tag_set = unique_list(tag for sent in train_data for word, tag in sent)
    trainer = hmm.HiddenMarkovModelTrainer(tag_set)
    tagger = trainer.train_supervised(train_data)
    sent_lst = sent.split()
    i = 0
    test_tag = []
    # tag the sentence in windows of at most 5 tokens
    while i < len(sent_lst):
        end = min(i + 5, len(sent_lst))
        test_tag += tagger.tag(sent_lst[i:end])
        i += 5
    # print(test_tag)
    return test_tag
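# Illustrative call of train_hmm_word, assuming `from nltk.tag import hmm` and
# unique_list are in scope; the BIO-style training data below is made up.
train_data = [[('Barack', 'B'), ('Obama', 'I'), ('visited', 'O'), ('Paris', 'B')],
              [('Angela', 'B'), ('Merkel', 'I'), ('spoke', 'O')]]
print(train_hmm_word("Barack Obama visited Paris", train_data))
# should yield a list of (word, tag) pairs, here [('Barack', 'B'), ('Obama', 'I'),
# ('visited', 'O'), ('Paris', 'B')]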
def corpus(request):
    with open(settings.BASE_DIR+"/data/corpus.pkl", 'rb') as handle:
        corpus = pickle.load(handle)
    if request.method == "POST":
        #import ipdb; ipdb.set_trace()
        if request.POST.get("tokens") and request.POST.get("true_index"):
            true_index = int(request.POST.get("true_index"))
            tokens = ast.literal_eval(request.POST.get("tokens"))
            tagged = []
            i = 1
            for item in tokens:
                tagged.append((item, request.POST.get("token_"+str(i))))
                i += 1
            del(corpus[true_index])
            corpus.append(tagged)
            with open(settings.BASE_DIR+"/data/corpus.pkl", 'wb') as handle:
                pickle.dump(corpus, handle)
            tag_set = unique_list(tag for sent in corpus for (word, tag) in sent)
            symbols = unique_list(word for sent in corpus for (word, tag) in sent)
            trainer = HiddenMarkovModelTrainer(tag_set, symbols)
            hmm = trainer.train_supervised(corpus, estimator=LaplaceProbDist)
            with open(settings.BASE_DIR+"/data/hmm.pkl", 'wb') as handle:
                pickle.dump(hmm, handle)
            return render(request, 'tagger/index.html', {'corpus': corpus})
        elif request.POST.get("index"):
            tags_file = open(settings.BASE_DIR+"/data/tags.json", 'rb')
            reader = codecs.getreader("utf-8")
            tags = json.load(reader(tags_file))
            true_index = len(corpus) - int(request.POST.get("index"))
            corpus_item = corpus[true_index]
            return render(request, 'tagger/corpus.html', {
                'tokens': [t[0] for t in corpus_item],
                'corpus_item': corpus_item,
                'true_index': true_index,
                'tags': sorted(tags.items(), key=operator.itemgetter(1))})
    else:
        return render(request, 'tagger/corpus.html', {'corpus': corpus})
def emission_model(self, train_data):
    """
    Compute an emission model using a ConditionalProbDist.

    :param train_data: The training dataset, a list of sentences with tags
    :type train_data: list(list(tuple(str,str)))
    :return: The emission probability distribution and a list of the states
    :rtype: Tuple[ConditionalProbDist, list(str)]
    """
    def lidstone_est(f):
        return nltk.probability.LidstoneProbDist(f, 0.01, f.B() + 1)

    # Don't forget to lowercase the observation otherwise it mismatches the test data
    # Do NOT add <s> or </s> to the input sentences
    data = [(tag, word.lower())
            for (word, tag) in list(itertools.chain.from_iterable(train_data))]
    emission_FD = nltk.probability.ConditionalFreqDist(data)
    self.emission_PD = nltk.probability.ConditionalProbDist(
        emission_FD, lidstone_est)
    self.states = unique_list(tag for sent in train_data for (word, tag) in sent)
    return self.emission_PD, self.states
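# A standalone sketch of the same Lidstone-smoothed emission estimate outside the
# class; it assumes the Brown corpus and the universal tagset mapping are installed,
# and the corpus slice and queried tag/word are illustrative only.
import itertools
import nltk

train_data = nltk.corpus.brown.tagged_sents(categories='news', tagset='universal')[:500]
data = [(tag, word.lower()) for (word, tag) in itertools.chain.from_iterable(train_data)]
emission_FD = nltk.probability.ConditionalFreqDist(data)
emission_PD = nltk.probability.ConditionalProbDist(
    emission_FD, lambda f: nltk.probability.LidstoneProbDist(f, 0.01, f.B() + 1))
print(emission_PD['NOUN'].prob('year'))  # estimated P(word='year' | tag='NOUN')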
import nltk
from nltk.util import unique_list

cor = nltk.corpus.brown.tagged_sents(categories='adventure')[:500]
print(len(cor))
tag_set = unique_list(tag for sent in cor for (word, tag) in sent)
print(len(tag_set))
symbols = unique_list(word for sent in cor for (word, tag) in sent)
print(len(symbols))
trainer = nltk.tag.HiddenMarkovModelTrainer(tag_set, symbols)
print(tag_set)
print(symbols)
train_corpus = []
test_corpus = []
for i in range(len(cor)):
    if i % 10:
        train_corpus += [cor[i]]
    else:
        test_corpus += [cor[i]]
print(len(train_corpus))
print(len(test_corpus))
def test(request):
    if request.method == "POST":
        session_id = request.session['session_id']
        order = request.session['order']
        from_corpus = request.session['from_corpus']
        from_raw = request.session['from_raw']
        tagged_valid = True
        i = 1
        for item in from_corpus:
            if order == 1:
                tag = request.POST.get("token_first_"+str(i))
            else:
                tag = request.POST.get("token_second_"+str(i))
            if tag != from_corpus[i-1][1]:
                tagged_valid = False
            i += 1
        if tagged_valid:
            with open(settings.BASE_DIR+"/data/corpus.pkl", 'rb') as handle:
                corpus = pickle.load(handle)
            #import ipdb; ipdb.set_trace();
            new_tagged = []
            i = 1
            for item in from_raw:
                if order == 1:
                    new_tagged.append((item[0], request.POST.get("token_second_"+str(i))))
                else:
                    new_tagged.append((item[0], request.POST.get("token_first_"+str(i))))
                i += 1
            if new_tagged not in corpus:
                corpus.append(new_tagged)
                with open(settings.BASE_DIR+"/data/corpus.pkl", 'wb') as handle:
                    pickle.dump(corpus, handle)
                tag_set = unique_list(tag for sent in corpus for (word, tag) in sent)
                symbols = unique_list(word for sent in corpus for (word, tag) in sent)
                trainer = HiddenMarkovModelTrainer(tag_set, symbols)
                hmm = trainer.train_supervised(corpus, estimator=LaplaceProbDist)
                with open(settings.BASE_DIR+"/data/hmm.pkl", 'wb') as handle:
                    pickle.dump(hmm, handle)
            return render(request, 'tagger/index.html', {'corpus': corpus})
    request.session['order'] = order = int(random.getrandbits(1))
    with open(settings.BASE_DIR+"/data/corpus.pkl", 'rb') as handle:
        corpus = pickle.load(handle)
    from_corpus = corpus.pop(random.randrange(len(corpus)))
    from_corpus_str = ' '.join([t[0] for t in from_corpus])
    from_corpus_str = from_corpus_str.replace(' , ', ', ')
    from_raw_str = get_random_address()
    if not from_raw_str:
        from_raw_str = corpus.pop(random.randrange(len(corpus)))
    pkl_file = open(settings.BASE_DIR+"/data/hmm.pkl", 'rb')
    hmm = pickle.load(pkl_file)
    pkl_file.close()
    tokens = regexp_tokenize(from_raw_str, pattern=r'\d+|[^\r\n\t\f 0-9,]+|,')
    from_raw = hmm.tag(tokens)
    tags_file = open(settings.BASE_DIR+"/data/tags.json", 'rb')
    reader = codecs.getreader("utf-8")
    tags = json.load(reader(tags_file))
    tags_file.close()
    session_id = uuid.uuid1()
    request.session['session_id'] = str(session_id)
    request.session['from_corpus'] = from_corpus
    request.session['from_raw'] = from_raw
    from_corpus_distorted = distort(from_corpus)
    data = [from_corpus_distorted, from_corpus_str, from_raw, from_raw_str]
    return render(request, 'tagger/test.html', {
        'data': data,
        'session_id': session_id,
        'order': order,
        'tags': sorted(tags.items(), key=operator.itemgetter(1))
    })
import nltk
from nltk.util import unique_list

cor = nltk.corpus.brown.tagged_sents(categories='adventure')[:500]
print(len(cor))
tag_set = unique_list(tag for sent in cor for (word, tag) in sent)
print(len(tag_set))
symbols = unique_list(word for sent in cor for (word, tag) in sent)
print(len(symbols))
trainer = nltk.tag.HiddenMarkovModelTrainer(tag_set, symbols)
train_corpus = []
test_corpus = []
for i in range(len(cor)):
    if i % 10:
        train_corpus += [cor[i]]
    else:
        test_corpus += [cor[i]]
print(len(train_corpus))
print(len(test_corpus))
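# Hedged continuation of the split above: train the HMM tagger on train_corpus and
# score it on test_corpus. Lidstone smoothing is one reasonable estimator choice,
# not the only one.
from nltk.probability import LidstoneProbDist

tagger = trainer.train_supervised(
    train_corpus, estimator=lambda fd, bins: LidstoneProbDist(fd, 0.1, bins))
print(tagger.evaluate(test_corpus))  # evaluate() is named accuracy() in newer NLTK releases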