Python tagged_sents 예제들, nltk.corpus.masc_tagged.tagged_sents Python 예제들

예제 #1

0

파일 보기

파일: problem_2.py 프로젝트: JakeBrawer/misc

def probAandBandC():
    """Print MLE, Laplace and Good-Turing probabilities for POS-tag bigrams.

    Reads the tagged sentences in the ``journal`` category of
    ``masc_tagged``, pads each sentence's tag sequence with ``None`` at both
    ends, and collects every (history, current_tag) pair. From these pairs it
    builds a conditional frequency distribution keyed on the history tag and
    three conditional probability distributions (MLE, Laplace, Simple
    Good-Turing). It then prints, for every ordered pair ``(x, y)`` of
    conditions, the probability of tag ``y`` given history ``x`` under each
    estimator.

    Returns:
        None. All output is written with ``print``.
    """
    bigram_list = []
    for sent in masc_tagged.tagged_sents(categories='journal'):
        tags = [tag for (_token, tag) in sent]
        # None stands in for the missing tag before the first token and
        # after the last one, so sentence boundaries are counted as well.
        bigram_list.extend(zip([None] + tags, tags + [None]))

    # The history tag is the condition, the current tag is the sample.
    cfd = ConditionalFreqDist(bigram_list)
    cpd = ConditionalProbDist(cfd, MLEProbDist)
    cpd_laplace = ConditionalProbDist(cfd, LaplaceProbDist)
    cpd_turing = ConditionalProbDist(cfd, SimpleGoodTuringProbDist)

    template = """For P %s ---> P_MLE(%s|%s) = %s\n
                              P_laplace(%s|%s) = %s\n
                              P_turing(%s|%s) = %s\n"""
    conditions = cpd.conditions()
    for x in conditions:
        for y in conditions:
            # dist[x].prob(y) is P(y | x): index by the history x and ask
            # for the sample y, so the value matches the printed label.
            print(template %
                  ((x, y),
                   y, x, cpd[x].prob(y),
                   y, x, cpd_laplace[x].prob(y),
                   y, x, cpd_turing[x].prob(y)))

예제 #2

0

파일 보기

파일: CS372_HW2_code_20190046.py 프로젝트: pacokwon/cs372-nlp

def get_tagger():
    """
    Load the cached POS tagger, or train and cache one if none is available.

    The cache is the file ``tagger.pkl`` in the current working directory.
    When it is missing or empty, a backoff chain DefaultTagger("NN") ->
    UnigramTagger -> BigramTagger is trained on the brown and masc_tagged
    tagged sentences, pickled to ``tagger.pkl`` and returned.

    :returns: the unpickled cache content, or the freshly trained tagger
    :rtype: nltk.BigramTagger when trained here
    """
    cache_path = "tagger.pkl"
    try:
        with open(cache_path, "rb") as cached:
            return pickle.load(cached)
    except (FileNotFoundError, EOFError):
        # No usable cache: train from scratch.
        training_sents = brown.tagged_sents() + masc_tagged.tagged_sents()
        tagger = nltk.DefaultTagger("NN")
        # Each n-gram tagger falls back to the previously built one.
        for tagger_cls in (nltk.UnigramTagger, nltk.BigramTagger):
            tagger = tagger_cls(training_sents, backoff=tagger)
        with open(cache_path, "wb") as out:
            pickle.dump(tagger, out, pickle.HIGHEST_PROTOCOL)
        return tagger

예제 #3

0

파일 보기

파일: 3.py 프로젝트: Djiffit/nlp-mini

def get_counts():
    """Collect per-tag word data and the tags that follow a 'VB' token.

    Walks every tagged sentence of ``masc_tagged``. For each (word, tag)
    pair the word is passed to ``insert_word`` together with the dict kept
    for that tag. Whenever a token tagged 'VB' is not the last token of its
    sentence, the tag of the next token is counted.

    Returns:
        A tuple ``(words, vb_transitions)``: ``words`` maps each tag to the
        dict filled by ``insert_word``; ``vb_transitions`` maps each tag
        seen directly after a 'VB' token to its number of occurrences.
    """
    words_by_tag = {}
    tags_after_vb = {}

    for sent in masc_tagged.tagged_sents():
        last_index = len(sent) - 1
        for position, (word, tag) in enumerate(sent):
            # Start a fresh bucket when the tag is new (or its bucket is empty).
            if not words_by_tag.get(tag):
                words_by_tag[tag] = {}
            insert_word(words_by_tag[tag], word)

            # Only a 'VB' that has a successor contributes a transition.
            if tag != 'VB' or position >= last_index:
                continue
            following_tag = sent[position + 1][1]
            tags_after_vb[following_tag] = tags_after_vb.get(following_tag, 0) + 1

    return words_by_tag, tags_after_vb

예제 #4

0

파일 보기

파일: problem_2.py 프로젝트: JakeBrawer/org

def probAandBandC():
    """Print MLE, Laplace and Good-Turing probabilities for POS-tag bigrams.

    Reads the tagged sentences in the ``journal`` category of
    ``masc_tagged``, pads each sentence's tag sequence with ``None`` at both
    ends, and collects every (history, current_tag) pair. From these pairs it
    builds a conditional frequency distribution keyed on the history tag and
    three conditional probability distributions (MLE, Laplace, Simple
    Good-Turing). It then prints, for every ordered pair ``(x, y)`` of
    conditions, the probability of tag ``y`` given history ``x`` under each
    estimator.

    Returns:
        None. All output is written with ``print``.
    """
    bigram_list = []
    for sent in masc_tagged.tagged_sents(categories='journal'):
        tags = [tag for (_token, tag) in sent]
        # None stands in for the missing tag before the first token and
        # after the last one, so sentence boundaries are counted as well.
        bigram_list.extend(zip([None] + tags, tags + [None]))

    # The history tag is the condition, the current tag is the sample.
    cfd = ConditionalFreqDist(bigram_list)
    cpd = ConditionalProbDist(cfd, MLEProbDist)
    cpd_laplace = ConditionalProbDist(cfd, LaplaceProbDist)
    cpd_turing = ConditionalProbDist(cfd, SimpleGoodTuringProbDist)

    template = """For P %s ---> P_MLE(%s|%s) = %s\n
                              P_laplace(%s|%s) = %s\n
                              P_turing(%s|%s) = %s\n"""
    conditions = cpd.conditions()
    for x in conditions:
        for y in conditions:
            # dist[x].prob(y) is P(y | x): index by the history x and ask
            # for the sample y, so the value matches the printed label.
            print(template %
                  ((x, y),
                   y, x, cpd[x].prob(y),
                   y, x, cpd_laplace[x].prob(y),
                   y, x, cpd_turing[x].prob(y)))

예제 #5

0

파일 보기

def get_tagger():
    """
    Load the cached POS tagger, or train and cache one if none is available.

    The cache is the file ``tagger.pkl`` in the current working directory.
    When it is missing or empty, a backoff chain is trained on the brown and
    masc_tagged tagged sentences: TrigramTagger backs off to BigramTagger,
    which backs off to UnigramTagger, which backs off to DefaultTagger.

    Author's rationale for the default tag: words unknown to the tagger are
    assumed to be mostly medical terms, and those are assumed to be mostly
    nouns, so defaulting to NN should give a higher chance of a correct tag.

    :returns: the unpickled cache content, or the freshly trained tagger
    :rtype: nltk.TrigramTagger when trained here
    """
    cache_path = "tagger.pkl"
    try:
        with open(cache_path, "rb") as cached:
            return pickle.load(cached)
    except (FileNotFoundError, EOFError):
        # No usable cache: train from scratch.
        training_sents = brown.tagged_sents() + masc_tagged.tagged_sents()
        tagger = nltk.DefaultTagger("NN")
        # Each n-gram tagger falls back to the previously built one.
        for tagger_cls in (nltk.UnigramTagger, nltk.BigramTagger,
                           nltk.TrigramTagger):
            tagger = tagger_cls(training_sents, backoff=tagger)
        with open(cache_path, "wb") as out:
            pickle.dump(tagger, out, pickle.HIGHEST_PROTOCOL)
        return tagger

예제 #6

0

파일 보기

파일: problem_3.py 프로젝트: JakeBrawer/org

# Split the 'blog' sentences of masc_tagged into parallel word / POS-tag
# lists, using the first 90% of the sentences for training and the rest for
# testing.
test_set = []
word_sents = []
pos_sents = []
word_training_set = []
pos_training_set = []
word_test_set = []
pos_test_set = []
# Number of sentences in the category; computed once and reused below.
blog_sentence_count = len(masc_tagged.sents(categories='blog'))
# Training set length: 90% of the sentences, rounded down.
training_len = int(.90 * blog_sentence_count)
# Test set length: whatever is left over.
test_len = blog_sentence_count - training_len
counter = 0
# Loop through the sentences and build the training and test lists for
# words and POS tags. Each sentence is its own sublist, so the sublists
# themselves act as sentence delimiters.
for sent in masc_tagged.tagged_sents(categories='blog'):
    word_sents = [pair[0] for pair in sent]
    pos_sents = [pair[1] for pair in sent]
    # counter is 0-based, so a strict comparison puts exactly training_len
    # sentences in the training set (``<=`` took one sentence too many and
    # left the test set one short of test_len).
    if counter < training_len:
        word_training_set.append(word_sents)
        pos_training_set.append(pos_sents)
    else:
        word_test_set.append(word_sents)
        pos_test_set.append(pos_sents)
    counter += 1

total_word_sents = word_training_set + word_test_set
total_pos_sents = pos_training_set + pos_test_set

예제 #7

0

파일 보기

파일: 3.py 프로젝트: Djiffit/nlp-mini

        list(zip('Hi I am dog'.split(), [None] * 4)),
        list(zip('Try using your models as LMs'.split(), [None] * 6)),
        list(zip('Submit your answers'.split(), [None] * 3)),
        list(zip('Is you are we you they us them porridge'.split(), [None] * 9)),
        list(zip('Live computer eat slightly manic bag'.split(), [None] * 6)),
        list(zip('I am outputting a rather probable sentence but this one is still quite long one'.split(), [None] * 15)),
        list(zip('The the the the'.split(), [None] * 4)),
    ]
    
    for sent in test_sents:
        print(sent, 'Probability: ', model.log_probability(sent))
        
def sample_model(model):
    """Print a random sample of length 15 drawn from ``model``.

    Calls ``model.random_sample`` with the ``random`` module as the random
    number source and 15 as the length, then prints the result.
    """
    sample_length = 15
    sample = model.random_sample(random, sample_length)
    print(sample)

# Supervised model: trained directly on the tagged masc_tagged sentences.
train = hmm.HiddenMarkovModelTagger
model = train.train(masc_tagged.tagged_sents())

# Second model: built by train_unsupervised from the tagged sentences plus
# the whitespace-tokenised lines of the token file (last argument is 10).
with open('radio_planet_tokens.txt') as radio:
    lines = [raw_line.rstrip('\n').split() for raw_line in radio.readlines()]
    u_model = train_unsupervised(masc_tagged.tagged_sents(), lines, 10)

models = [model, u_model]

# Run the same three reports on each model, followed by a separator line.
for m in models:
    tag_sents(m)
    log_prob(m)
    sample_model(m)
    print('W-W' * 100)

예제 #8

0

파일 보기

파일: problem_3.py 프로젝트: JakeBrawer/misc

# Split the 'blog' sentences of masc_tagged into parallel word / POS-tag
# lists, using the first 90% of the sentences for training and the rest for
# testing.
test_set = []
word_sents = []
pos_sents = []
word_training_set = []
pos_training_set = []
word_test_set = []
pos_test_set = []
# Number of sentences in the category; computed once and reused below.
blog_sentence_count = len(masc_tagged.sents(categories='blog'))
# Training set length: 90% of the sentences, rounded down.
training_len = int(.90 * blog_sentence_count)
# Test set length: whatever is left over.
test_len = blog_sentence_count - training_len
counter = 0
# Loop through the sentences and build the training and test lists for
# words and POS tags. Each sentence is its own sublist, so the sublists
# themselves act as sentence delimiters.
for sent in masc_tagged.tagged_sents(categories='blog'):
    word_sents = [pair[0] for pair in sent]
    pos_sents = [pair[1] for pair in sent]
    # counter is 0-based, so a strict comparison puts exactly training_len
    # sentences in the training set (``<=`` took one sentence too many and
    # left the test set one short of test_len).
    if counter < training_len:
        word_training_set.append(word_sents)
        pos_training_set.append(pos_sents)
    else:
        word_test_set.append(word_sents)
        pos_test_set.append(pos_sents)
    counter += 1

total_word_sents = word_training_set + word_test_set
total_pos_sents = pos_training_set + pos_test_set

예제 #9

0

파일 보기

파일: HMM.py 프로젝트: RyanLBWoods/Bin_HMM

    print "3. masc_tagged"
    print "4. conll2000"
    exit(0)
else:
    index = int(sys.argv[1])
    tagset = int(sys.argv[2])
    if index == 1 and tagset == 1:
        sents = brown.tagged_sents()
    elif index == 1 and tagset == 2:
        sents = brown.tagged_sents(tagset='universal')
    elif index == 2 and tagset == 1:
        sents = treebank.tagged_sents()
    elif index == 2 and tagset == 2:
        sents = treebank.tagged_sents(tagset='universal')
    elif index == 3 and tagset == 1:
        sents = masc_tagged.tagged_sents()
    elif index == 3 and tagset == 2:
        sents = masc_tagged.tagged_sents(tagset='universal')
    elif index == 4 and tagset == 1:
        sents = conll2000.tagged_sents()
    elif index == 4 and tagset == 2:
        sents = conll2000.tagged_sents(tagset='universal')
    else:
        print "Usage: python HMM.py <corpus_index> <tagset_index>"
        print "Corpus:          Tagset: "
        print "1. brown           1. Default"
        print "2. treebank        2. Universal"
        print "3. masc_tagged"
        print "4. conll2000"
        exit(0)