def probAandBandC():
    """Print bigram POS-tag probabilities under three estimators.

    Collects (history_tag, current_tag) pairs from the tagged sentences of
    the 'journal' category of ``masc_tagged``.  Every sentence is padded with
    a ``(None, None)`` token/tag pair at both ends, so the tag ``None`` stands
    for the sentence boundary and start/end transitions are counted too.

    From those pairs a ConditionalFreqDist is built (condition = history tag,
    sample = current tag), and three ConditionalProbDists are derived from
    it: maximum likelihood, Laplace smoothed and Simple Good-Turing smoothed.

    For every ordered pair ``(x, y)`` of observed history tags the function
    prints P(y|x), i.e. the probability of tag ``y`` given that the previous
    tag was ``x``, under each estimator.

    :returns: None; the results are written to standard output
    """
    # Empty token/tag pair used as sentence-boundary padding.
    boundary = [(None, None)]

    bigram_list = []
    for sent in masc_tagged.tagged_sents(categories='journal'):
        # Pair every token with its successor; only the tags are kept.
        bigram_list.extend(
            (previous[1], current[1])
            for previous, current in zip(boundary + sent, sent + boundary)
        )

    # Counts of current tags, conditioned on the history tag.
    cfd = ConditionalFreqDist(bigram_list)
    # Unsmoothed maximum-likelihood estimate.
    cpd = ConditionalProbDist(cfd, MLEProbDist)
    # Laplace smoothing.
    cpd_laplace = ConditionalProbDist(cfd, LaplaceProbDist)
    # Simple Good-Turing smoothing.
    cpd_turing = ConditionalProbDist(cfd, SimpleGoodTuringProbDist)

    # Print every combination of history tags.  The distributions are keyed
    # by the history tag, so P(y|x) must be read from dist[x].prob(y);
    # indexing the other way round would report P(x|y) under a P(y|x) label.
    tags = cpd.conditions()
    for x in tags:
        for y in tags:
            print("""For P %s ---> P_MLE(%s|%s) = %s\n P_laplace(%s|%s) = %s\n P_turing(%s|%s) = %s\n""" % (
                (x, y),
                y, x, cpd[x].prob(y),
                y, x, cpd_laplace[x].prob(y),
                y, x, cpd_turing[x].prob(y),
            ))
def get_tagger():
    """
    Load the cached POS tagger, training and caching a new one if needed

    The tagger is read from ``tagger.pkl``. When that file is missing or
    empty, a BigramTagger is trained on the brown and masc_tagged sentences,
    backing off to a UnigramTagger and finally to a DefaultTagger that
    answers "NN". The freshly trained tagger is pickled to ``tagger.pkl``.

    :returns: bigram tagger with unigram and default backoff
    :rtype: nltk.BigramTagger
    """
    try:
        # NOTE: pickle data is executable; tagger.pkl must be a trusted file.
        with open("tagger.pkl", "rb") as cache:
            cached_tagger = pickle.load(cache)
    except (FileNotFoundError, EOFError):
        # No usable cache: fall through and train from the corpora.
        pass
    else:
        return cached_tagger

    training_sents = brown.tagged_sents() + masc_tagged.tagged_sents()
    default_tagger = nltk.DefaultTagger("NN")
    unigram_tagger = nltk.UnigramTagger(training_sents, backoff=default_tagger)
    bigram_tagger = nltk.BigramTagger(training_sents, backoff=unigram_tagger)

    with open("tagger.pkl", "wb") as cache:
        pickle.dump(bigram_tagger, cache, pickle.HIGHEST_PROTOCOL)
    return bigram_tagger
def get_counts():
    """Gather per-tag word data and the tags that follow 'VB'.

    Walks every tagged sentence of ``masc_tagged``.  For each (word, tag)
    token the word is handed to ``insert_word`` together with the dict kept
    for that tag (what exactly is stored is up to ``insert_word``, which is
    defined elsewhere).  For each 'VB' token that is not the last token of
    its sentence, the tag of the following token is counted.

    :returns: tuple ``(words, vb_transitions)``; ``words`` maps a tag to its
        dict, ``vb_transitions`` maps a tag to the number of times it
        directly followed a 'VB' token
    """
    words = {}
    vb_transitions = {}

    for sent in masc_tagged.tagged_sents():
        final_idx = len(sent) - 1
        for idx, (word, tag) in enumerate(sent):
            # A missing (or still empty) entry is replaced by a fresh dict.
            per_tag = words.get(tag)
            if not per_tag:
                per_tag = words[tag] = {}
            insert_word(per_tag, word)

            # Only verbs that have a successor contribute a transition.
            if tag != 'VB' or idx >= final_idx:
                continue
            next_tag = sent[idx + 1][1]
            vb_transitions[next_tag] = (vb_transitions.get(next_tag) or 0) + 1

    return words, vb_transitions
def probAandBandC():
    """Print bigram POS-tag probabilities under three estimators.

    Collects (history_tag, current_tag) pairs from the tagged sentences of
    the 'journal' category of ``masc_tagged``.  Every sentence is padded with
    a ``(None, None)`` token/tag pair at both ends, so the tag ``None`` stands
    for the sentence boundary and start/end transitions are counted too.

    From those pairs a ConditionalFreqDist is built (condition = history tag,
    sample = current tag), and three ConditionalProbDists are derived from
    it: maximum likelihood, Laplace smoothed and Simple Good-Turing smoothed.

    For every ordered pair ``(x, y)`` of observed history tags the function
    prints P(y|x), i.e. the probability of tag ``y`` given that the previous
    tag was ``x``, under each estimator.

    :returns: None; the results are written to standard output
    """
    # Empty token/tag pair used as sentence-boundary padding.
    boundary = [(None, None)]

    bigram_list = []
    for sent in masc_tagged.tagged_sents(categories='journal'):
        # Pair every token with its successor; only the tags are kept.
        bigram_list.extend(
            (previous[1], current[1])
            for previous, current in zip(boundary + sent, sent + boundary)
        )

    # Counts of current tags, conditioned on the history tag.
    cfd = ConditionalFreqDist(bigram_list)
    # Unsmoothed maximum-likelihood estimate.
    cpd = ConditionalProbDist(cfd, MLEProbDist)
    # Laplace smoothing.
    cpd_laplace = ConditionalProbDist(cfd, LaplaceProbDist)
    # Simple Good-Turing smoothing.
    cpd_turing = ConditionalProbDist(cfd, SimpleGoodTuringProbDist)

    # Print every combination of history tags.  The distributions are keyed
    # by the history tag, so P(y|x) must be read from dist[x].prob(y);
    # indexing the other way round would report P(x|y) under a P(y|x) label.
    tags = cpd.conditions()
    for x in tags:
        for y in tags:
            print("""For P %s ---> P_MLE(%s|%s) = %s\n P_laplace(%s|%s) = %s\n P_turing(%s|%s) = %s\n""" % (
                (x, y),
                y, x, cpd[x].prob(y),
                y, x, cpd_laplace[x].prob(y),
                y, x, cpd_turing[x].prob(y),
            ))
def get_tagger():
    """
    Return a POS tagger, loading it from disk or training it on first use

    The tagger is a backoff chain: TrigramTagger -> BigramTagger ->
    UnigramTagger -> DefaultTagger. The default tag is "NN" on the
    assumption that words unknown to the tagger are mostly medical terms,
    and that medical terms are mostly nouns, so guessing a noun is the most
    likely way to be right.

    A previously trained tagger is read from ``tagger.pkl``. When that file
    is missing or empty, the chain is trained on the brown and masc_tagged
    sentences and then written to ``tagger.pkl``.

    :returns: trigram tagger with bigram, unigram and default backoff
    :rtype: nltk.TrigramTagger
    """
    try:
        # NOTE: pickle data is executable; tagger.pkl must be a trusted file.
        with open("tagger.pkl", "rb") as pickled:
            return pickle.load(pickled)
    except (FileNotFoundError, EOFError):
        corpus_sents = brown.tagged_sents() + masc_tagged.tagged_sents()

        # Build the chain from the least to the most specific tagger, each
        # one backing off to the tagger built just before it.
        tagger = nltk.DefaultTagger("NN")
        for ngram_tagger in (nltk.UnigramTagger,
                             nltk.BigramTagger,
                             nltk.TrigramTagger):
            tagger = ngram_tagger(corpus_sents, backoff=tagger)

        with open("tagger.pkl", "wb") as pickled:
            pickle.dump(tagger, pickled, -1)
        return tagger
# Split the 'blog' category of masc_tagged into a 90% training part and a
# 10% test part, keeping the words and the POS tags in parallel lists.
# Every sentence is stored as its own sublist, so the sublists themselves
# act as sentence delimiters.
test_set = []  # not filled here; kept because other code may reference it
word_sents = []
pos_sents = []
word_training_set = []
pos_training_set = []
word_test_set = []
pos_test_set = []

# Number of sentences in the category, looked up once.
blog_sentence_count = len(masc_tagged.sents(categories='blog'))
# Calculates training set len
training_len = int(.90 * blog_sentence_count)
# Calculates test set len
test_len = int(blog_sentence_count - training_len)

counter = 0
for sent in masc_tagged.tagged_sents(categories='blog'):
    # Separate the (word, tag) pairs of this sentence into two lists.
    word_sents = [token[0] for token in sent]
    pos_sents = [token[1] for token in sent]
    # Strict '<': sentences 0 .. training_len - 1 are training data, which
    # gives exactly training_len training and test_len test sentences.
    # ('<=' would move one extra sentence into the training set.)
    if counter < training_len:
        word_training_set.append(word_sents)
        pos_training_set.append(pos_sents)
    else:
        word_test_set.append(word_sents)
        pos_test_set.append(pos_sents)
    counter += 1

# All sentences again, training part first.
total_word_sents = word_training_set + word_test_set
total_pos_sents = pos_training_set + pos_test_set
list(zip('Hi I am dog'.split(), [None] * 4)), list(zip('Try using your models as LMs'.split(), [None] * 6)), list(zip('Submit your answers'.split(), [None] * 3)), list(zip('Is you are we you they us them porridge'.split(), [None] * 9)), list(zip('Live computer eat slightly manic bag'.split(), [None] * 6)), list(zip('I am outputting a rather probable sentence but this one is still quite long one'.split(), [None] * 15)), list(zip('The the the the'.split(), [None] * 4)), ] for sent in test_sents: print(sent, 'Probability: ', model.log_probability(sent)) def sample_model(model): print(model.random_sample(random, 15)) train = hmm.HiddenMarkovModelTagger model = train.train(masc_tagged.tagged_sents()) with open('radio_planet_tokens.txt') as radio: lines = radio.readlines() lines = list(map(lambda x: x.rstrip('\n').split(), lines)) u_model = train_unsupervised(masc_tagged.tagged_sents(), lines, 10) models = [model, u_model] for m in models: tag_sents(m) log_prob(m) sample_model(m) print('W-W' * 100)
# Split the 'blog' category of masc_tagged into a 90% training part and a
# 10% test part, keeping the words and the POS tags in parallel lists.
# Every sentence is stored as its own sublist, so the sublists themselves
# act as sentence delimiters.
test_set = []  # not filled here; kept because other code may reference it
word_sents = []
pos_sents = []
word_training_set = []
pos_training_set = []
word_test_set = []
pos_test_set = []

# Number of sentences in the category, looked up once.
blog_sentence_count = len(masc_tagged.sents(categories='blog'))
# Calculates training set len
training_len = int(.90 * blog_sentence_count)
# Calculates test set len
test_len = int(blog_sentence_count - training_len)

counter = 0
for sent in masc_tagged.tagged_sents(categories='blog'):
    # Separate the (word, tag) pairs of this sentence into two lists.
    word_sents = [token[0] for token in sent]
    pos_sents = [token[1] for token in sent]
    # Strict '<': sentences 0 .. training_len - 1 are training data, which
    # gives exactly training_len training and test_len test sentences.
    # ('<=' would move one extra sentence into the training set.)
    if counter < training_len:
        word_training_set.append(word_sents)
        pos_training_set.append(pos_sents)
    else:
        word_test_set.append(word_sents)
        pos_test_set.append(pos_sents)
    counter += 1

# All sentences again, training part first.
total_word_sents = word_training_set + word_test_set
total_pos_sents = pos_training_set + pos_test_set
print "3. masc_tagged" print "4. conll2000" exit(0) else: index = int(sys.argv[1]) tagset = int(sys.argv[2]) if index == 1 and tagset == 1: sents = brown.tagged_sents() elif index == 1 and tagset == 2: sents = brown.tagged_sents(tagset='universal') elif index == 2 and tagset == 1: sents = treebank.tagged_sents() elif index == 2 and tagset == 2: sents = treebank.tagged_sents(tagset='universal') elif index == 3 and tagset == 1: sents = masc_tagged.tagged_sents() elif index == 3 and tagset == 2: sents = masc_tagged.tagged_sents(tagset='universal') elif index == 4 and tagset == 1: sents = conll2000.tagged_sents() elif index == 4 and tagset == 2: sents = conll2000.tagged_sents(tagset='universal') else: print "Usage: python HMM.py <corpus_index> <tagset_index>" print "Corpus: Tagset: " print "1. brown 1. Default" print "2. treebank 2. Universal" print "3. masc_tagged" print "4. conll2000" exit(0)