Example #1
from nltk.corpus import masc_tagged
from nltk.probability import (ConditionalFreqDist, ConditionalProbDist,
                              MLEProbDist, LaplaceProbDist,
                              SimpleGoodTuringProbDist)


def probAandBandC():
    bigram_list = []
    for sent in masc_tagged.tagged_sents(categories='journal'):
        p = [(None, None)]  # sentence-boundary token/tag pair
        # Pair each tag with its predecessor, padding both ends of the sentence
        bigrams = zip(p + sent, sent + p)
        for (a, b) in bigrams:
            history = a[1]
            current_tag = b[1]
            bigram_list.append((history, current_tag))
    # Create a CFD from the bigram list
    cfd = ConditionalFreqDist(bigram_list)
    # Create a CPD from the CFD above using maximum-likelihood estimation
    cpd = ConditionalProbDist(cfd, MLEProbDist)
    # Create a CPD from the CFD above using Laplace smoothing
    cpd_laplace = ConditionalProbDist(cfd, LaplaceProbDist)
    # Create a CPD from the CFD above using Good-Turing smoothing
    cpd_turing = ConditionalProbDist(cfd, SimpleGoodTuringProbDist)
    # Print every possible combination of POS tag bigram probabilities;
    # the condition is the history tag x, so P(y|x) is cpd[x].prob(y)
    for x in cpd.conditions():
        for y in cpd.conditions():
            print("""For P %s ---> P_MLE(%s|%s) = %s\n
                              P_laplace(%s|%s) = %s\n
                              P_turing(%s|%s) = %s\n""" %
                  ((x, y), y, x, cpd[x].prob(y), y, x, cpd_laplace[x].prob(y),
                   y, x, cpd_turing[x].prob(y)))
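To make the conditioning direction concrete, here is a minimal sketch on toy data (not part of the original) showing that the first element of each (history, current_tag) pair becomes the condition:

from nltk.probability import ConditionalFreqDist, ConditionalProbDist, MLEProbDist

# Toy (history, current_tag) pairs: after 'DT', 'NN' follows twice and 'JJ' once.
toy = [('DT', 'NN'), ('DT', 'NN'), ('DT', 'JJ')]
toy_cpd = ConditionalProbDist(ConditionalFreqDist(toy), MLEProbDist)
print(toy_cpd['DT'].prob('NN'))  # prints 0.666..., i.e. P_MLE(NN|DT)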
Example #2
import pickle

import nltk
from nltk.corpus import brown, masc_tagged


def get_tagger():
    """
    Return a POS tagger; generate one if it doesn't exist.

    :returns: nltk.BigramTagger object trained on the Brown and MASC corpora
    :rtype: nltk.BigramTagger
    """
    try:
        with open("tagger.pkl", "rb") as file:
            return pickle.load(file)
    except (FileNotFoundError, EOFError):
        sents = brown.tagged_sents() + masc_tagged.tagged_sents()
        # Back off from bigram to unigram to a default NN tag
        t0 = nltk.DefaultTagger("NN")
        t1 = nltk.UnigramTagger(sents, backoff=t0)
        t2 = nltk.BigramTagger(sents, backoff=t1)
        with open("tagger.pkl", "wb") as file:
            pickle.dump(t2, file, -1)  # highest pickle protocol
        return t2
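A brief usage sketch (an assumption, not shown in the source): load or build the tagger, then tag a whitespace-tokenized sentence.

# Hypothetical usage: the first call trains and pickles the tagger,
# subsequent calls load it from tagger.pkl.
tagger = get_tagger()
print(tagger.tag("The patient was discharged yesterday".split()))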
Example #3
from nltk.corpus import masc_tagged


def get_counts():
    # insert_word is defined elsewhere in the original module; it records
    # `word` in the per-tag dictionary passed as the first argument.
    words = {}
    vb_transitions = {}

    for sent in masc_tagged.tagged_sents():
        for idx, (word, tag) in enumerate(sent):
            # Count each word under its POS tag
            if words.get(tag):
                insert_word(words[tag], word)
            else:
                words[tag] = {}
                insert_word(words[tag], word)

            # Count which tags follow a base-form verb (VB)
            if tag == 'VB' and idx < (len(sent) - 1):
                next_tag = sent[idx + 1][1]
                if vb_transitions.get(next_tag):
                    vb_transitions[next_tag] += 1
                else:
                    vb_transitions[next_tag] = 1

    return words, vb_transitions
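insert_word is not shown in this snippet; a plausible reconstruction (an assumption, not the original helper) would simply bump a per-word count:

def insert_word(tag_dict, word):
    # Hypothetical helper: increment the count for `word` in `tag_dict`.
    tag_dict[word] = tag_dict.get(word, 0) + 1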
Example #4
import pickle

import nltk
from nltk.corpus import brown, masc_tagged


def get_tagger():
    """
    Return a POS tagger; generate one if it doesn't exist.

    Use a TrigramTagger whose backoff is a BigramTagger, which in turn backs
    off to a UnigramTagger and finally to a DefaultTagger. Assuming that the
    words unfamiliar to the tagger are mostly medical terms, and that those
    terms are mostly nouns, we set the default POS to NN, giving us a higher
    probability of getting unknown words right.

    :returns: nltk.TrigramTagger object trained on the Brown and MASC corpora
    :rtype: nltk.TrigramTagger
    """
    try:
        with open("tagger.pkl", "rb") as file:
            return pickle.load(file)
    except (FileNotFoundError, EOFError):
        sents = brown.tagged_sents() + masc_tagged.tagged_sents()
        t0 = nltk.DefaultTagger("NN")
        t1 = nltk.UnigramTagger(sents, backoff=t0)
        t2 = nltk.BigramTagger(sents, backoff=t1)
        t3 = nltk.TrigramTagger(sents, backoff=t2)
        with open("tagger.pkl", "wb") as file:
            pickle.dump(t3, file, -1)  # highest pickle protocol
        return t3
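A usage sketch (an assumption, not shown in the source) illustrating the NN fallback the docstring describes:

# Hypothetical usage: an out-of-vocabulary medical term falls through the
# trigram, bigram, and unigram taggers to the DefaultTagger and comes
# back tagged NN.
tagger = get_tagger()
print(tagger.tag("The myocarditis worsened overnight".split()))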
Example #5
from nltk.corpus import masc_tagged

word_training_set = []
pos_training_set = []
word_test_set = []
pos_test_set = []
# Calculate the training set length (90% of the sentences)
training_len = int(.90 * len(masc_tagged.sents(categories='blog')))
# Calculate the test set length (the remaining 10%)
test_len = len(masc_tagged.sents(categories='blog')) - training_len
counter = 0
# Loop through each sentence in the text and build training and test lists
# for word and POS sentences. Each sentence is a sublist in these lists,
# so the sublists themselves serve as sentence delimiters.
for sent in masc_tagged.tagged_sents(categories='blog'):
    word_sents = []
    pos_sents = []
    for word, pos in sent:
        word_sents.append(word)
        pos_sents.append(pos)
    if counter < training_len:
        word_training_set.append(word_sents)
        pos_training_set.append(pos_sents)
    else:
        word_test_set.append(word_sents)
        pos_test_set.append(pos_sents)
    counter += 1

total_word_sents = word_training_set + word_test_set
total_pos_sents = pos_training_set + pos_test_set
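A hypothetical sanity check (not part of the original) to confirm the 90/10 split adds back up to the full corpus slice:

assert len(total_word_sents) == len(masc_tagged.tagged_sents(categories='blog'))
print(len(word_training_set), "training /", len(word_test_set), "test sentences")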
Example #6
import random

from nltk.corpus import masc_tagged
from nltk.tag import hmm


def log_prob(model):
    # Test sentences are (word, None) pairs: untagged input for the HMM
    test_sents = [
        list(zip('Hi I am dog'.split(), [None] * 4)),
        list(zip('Try using your models as LMs'.split(), [None] * 6)),
        list(zip('Submit your answers'.split(), [None] * 3)),
        list(zip('Is you are we you they us them porridge'.split(), [None] * 9)),
        list(zip('Live computer eat slightly manic bag'.split(), [None] * 6)),
        list(zip('I am outputting a rather probable sentence but this one is still quite long one'.split(), [None] * 15)),
        list(zip('The the the the'.split(), [None] * 4)),
    ]

    for sent in test_sents:
        print(sent, 'Probability: ', model.log_probability(sent))


def sample_model(model):
    # Draw a random 15-token sample from the model
    print(model.random_sample(random, 15))


train = hmm.HiddenMarkovModelTagger

# Supervised HMM trained on the tagged MASC sentences
model = train.train(masc_tagged.tagged_sents())

with open('radio_planet_tokens.txt') as radio:
    lines = radio.readlines()
    lines = list(map(lambda x: x.rstrip('\n').split(), lines))
    # train_unsupervised and tag_sents are defined elsewhere in the original module
    u_model = train_unsupervised(masc_tagged.tagged_sents(), lines, 10)

models = [model, u_model]

for m in models:
    tag_sents(m)
    log_prob(m)
    sample_model(m)
    print('W-W' * 100)  # separator between the two models' output
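The train_unsupervised helper called above is not shown. A hypothetical reconstruction using NLTK's HiddenMarkovModelTrainer (its states, symbols, and the Baum-Welch seeding are assumptions, not the original code) might look like this:

from nltk.tag import hmm


def train_unsupervised(tagged_sents, unlabeled_sents, iterations):
    # Hypothetical reconstruction: seed an HMM on the tagged data, then
    # refine it with Baum-Welch on the unlabeled sentences.
    states = list({t for sent in tagged_sents for (_, t) in sent})
    # Include unlabeled words so Baum-Welch can look them up as symbols
    symbols = list({w for sent in tagged_sents for (w, _) in sent}
                   | {w for sent in unlabeled_sents for w in sent})
    trainer = hmm.HiddenMarkovModelTrainer(states=states, symbols=symbols)
    seed = trainer.train_supervised(tagged_sents)
    unlabeled = [[(w, None) for w in sent] for sent in unlabeled_sents]
    return trainer.train_unsupervised(unlabeled, model=seed,
                                      max_iterations=iterations)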
Example #7
    print "3. masc_tagged"
    print "4. conll2000"
    exit(0)
else:
    index = int(sys.argv[1])
    tagset = int(sys.argv[2])
    if index == 1 and tagset == 1:
        sents = brown.tagged_sents()
    elif index == 1 and tagset == 2:
        sents = brown.tagged_sents(tagset='universal')
    elif index == 2 and tagset == 1:
        sents = treebank.tagged_sents()
    elif index == 2 and tagset == 2:
        sents = treebank.tagged_sents(tagset='universal')
    elif index == 3 and tagset == 1:
        sents = masc_tagged.tagged_sents()
    elif index == 3 and tagset == 2:
        sents = masc_tagged.tagged_sents(tagset='universal')
    elif index == 4 and tagset == 1:
        sents = conll2000.tagged_sents()
    elif index == 4 and tagset == 2:
        sents = conll2000.tagged_sents(tagset='universal')
    else:
        print "Usage: python HMM.py <corpus_index> <tagset_index>"
        print "Corpus:          Tagset: "
        print "1. brown           1. Default"
        print "2. treebank        2. Universal"
        print "3. masc_tagged"
        print "4. conll2000"
        exit(0)
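The usage string names the script HMM.py, so the selected `sents` presumably feed an HMM tagger later in the file. A minimal continuation (an assumption, not shown in the original) could be:

from nltk.tag import hmm

# Hypothetical continuation: train a supervised HMM tagger on the chosen corpus
tagger = hmm.HiddenMarkovModelTagger.train(sents)
print(tagger.tag("The quick brown fox jumps".split()))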