Example #1
def get_tagger(lang):
    if lang == "English":
        global eng_tagger
        if eng_tagger:
            return eng_tagger
        else:
            _POS_TAGGER = 'taggers/maxent_treebank_pos_tagger/english.pickle'
            eng_tagger = load(_POS_TAGGER)
            return eng_tagger
    elif lang == "Spanish":
        global spa_tagger
        if spa_tagger:
            return spa_tagger
        else:
            training = cess_esp.tagged_sents()
            default_tagger = nltk.DefaultTagger('NN')
            unigram_tagger = nltk.UnigramTagger(training,
                                                backoff=default_tagger)
            bigram_tagger = nltk.BigramTagger(training, backoff=unigram_tagger)
            spa_tagger = nltk.TrigramTagger(training, backoff=bigram_tagger)
            return spa_tagger
    else:
        global cat_tagger
        if cat_tagger:
            return cat_tagger
        else:
            training = cess_cat.tagged_sents()
            default_tagger = nltk.DefaultTagger('NN')
            unigram_tagger = nltk.UnigramTagger(training,
                                                backoff=default_tagger)
            bigram_tagger = nltk.BigramTagger(training, backoff=unigram_tagger)
            cat_tagger = nltk.TrigramTagger(training, backoff=bigram_tagger)
            return cat_tagger
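The snippet above relies on module-level setup that is not shown. A minimal sketch of the assumed surroundings (the exact import names and the pre-declared globals are assumptions), followed by a possible call:

import nltk
from nltk.data import load
from nltk.corpus import cess_esp, cess_cat

# Module-level caches that get_tagger() checks and fills in.
eng_tagger = None
spa_tagger = None
cat_tagger = None

# Possible usage: build (and cache) the Spanish tagger, then tag a tokenized sentence.
tagger = get_tagger("Spanish")
print(tagger.tag(["El", "perro", "come"]))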
Example #2
def nltk_tagger(brown):
    tagged = []

    training = nltkbrown.tagged_sents(tagset = 'universal')

    #create Unigram, Bigram, Trigram taggers
    unigram_tagger = nltk.UnigramTagger(training)
    bigram_tagger = nltk.BigramTagger(training)
    trigram_tagger = nltk.TrigramTagger(training)

    default_tagger = nltk.DefaultTagger('NOUN')
    bigram_tagger = nltk.BigramTagger(training, backoff=default_tagger)
    trigram_tagger = nltk.TrigramTagger(training, backoff=bigram_tagger)

    # tag sentences    
    tagged_sentence = []
    for sentence in brown:
        tags = trigram_tagger.tag(sentence)
        tagged_sentence.append(tags)

    for sentence in tagged_sentence:
        sentence = sentence[2:-1]
        temp = []
        for tup in sentence:
            wordtag = tup[0] + '/' + tup[1]
            temp.append(wordtag)
        tagged.append(temp)

    return tagged
Example #3
def exercise2():
    news_tagged_sents = brown.tagged_sents(categories='news')
    t0 = nltk.DefaultTagger('NN')
    t1 = nltk.UnigramTagger(news_tagged_sents, backoff=t0)
    t2 = nltk.BigramTagger(news_tagged_sents, backoff=t1)
    t3 = nltk.TrigramTagger(news_tagged_sents, backoff=t2)
    news_test_sents = t3.evaluate(news_tagged_sents)
    print(news_test_sents)

    print("Part a")
    lore_tagged_sents = brown.tagged_sents(categories='lore')

    lore_tagger = t3.evaluate(lore_tagged_sents)

    print("Compare DefaultTagger of lore and news:", lore_tagger,
          news_test_sents)

    print("Part b")
    lore_size = 199  # 200th sentence
    lore_train_sents = lore_tagged_sents[:lore_size]
    lore_test_sents = lore_tagged_sents[lore_size:]

    unigram_tagger = nltk.UnigramTagger(lore_tagged_sents)
    unigram_val = unigram_tagger.evaluate(lore_tagged_sents)

    bigram_tagger = nltk.BigramTagger(lore_train_sents)
    bigram_val = bigram_tagger.evaluate(lore_test_sents)

    trigram_tagger = nltk.TrigramTagger(lore_train_sents)
    trigram_val = trigram_tagger.evaluate(lore_test_sents)
    print(t3.tag(brown.sents(categories='lore')[199]))
    # print(brown.sents(categories='lore')[199])
    print("Unigram", unigram_val, 'vs.Bigram', bigram_val, 'vs.Trigram',
          trigram_val)
Example #4
def ex2():

    tagged_brown = brown.tagged_sents(categories='news')

    results_brown = splitting(tagged_brown)
    train_brown1 = results_brown[0]
    train_brown2 = results_brown[1]
    test_brown1 = results_brown[2]
    test_brown2 = results_brown[3]

    tagged_chat = nps_chat.tagged_posts()
    results_chat = splitting(tagged_chat)
    train_chat1 = results_chat[0]
    train_chat2 = results_chat[1]
    test_chat1 = results_chat[2]
    test_chat2 = results_chat[3]

    default_tagger = nltk.DefaultTagger('NN')
    default_tagger.tag(test_brown1)
    default_tagger.tag(test_brown2)
    default_tagger.tag(test_chat1)
    default_tagger.tag(test_chat2)

    print('Test for brown corpus 1 : {}'.format(
        default_tagger.evaluate(test_brown1)))
    print('Test for brown corpus 2 : {}'.format(
        default_tagger.evaluate(test_brown2)))
    print('Test for chat corpus 1 : {}'.format(
        default_tagger.evaluate(test_chat1)))
    print('Test for chat corpus 2 : {}'.format(
        default_tagger.evaluate(test_chat2)))

    t1 = nltk.UnigramTagger(train_brown1, backoff=default_tagger)
    print(t1.evaluate(test_brown1))
    t2 = nltk.BigramTagger(train_brown1, backoff=t1)
    print(t2.evaluate(test_brown1))
    t3 = nltk.TrigramTagger(train_brown1, backoff=t2)
    print('Accuracy test brown 1: ', t3.evaluate(test_brown1))

    t1 = nltk.UnigramTagger(train_brown2, backoff=default_tagger)
    print(t1.evaluate(test_brown2))
    t2 = nltk.BigramTagger(train_brown2, backoff=t1)
    print(t2.evaluate(test_brown2))
    t3 = nltk.TrigramTagger(train_brown2, backoff=t2)
    print('Accuracy test brown 2: ', t3.evaluate(test_brown2))

    t1 = nltk.UnigramTagger(train_chat1, backoff=default_tagger)
    print(t1.evaluate(test_chat1))
    t2 = nltk.BigramTagger(train_chat1, backoff=t1)
    print(t2.evaluate(test_chat1))
    t3 = nltk.TrigramTagger(train_chat1, backoff=t2)
    print('Accuracy test chat 1: ', t3.evaluate(test_chat1))

    t1 = nltk.UnigramTagger(train_chat2, backoff=default_tagger)
    print(t1.evaluate(test_chat2))
    t2 = nltk.BigramTagger(train_chat2, backoff=t1)
    print(t2.evaluate(test_chat2))
    t3 = nltk.TrigramTagger(train_chat2, backoff=t2)
    print('Accuracy test chat 2: ', t3.evaluate(test_chat2))
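ex2 depends on a splitting helper that is not part of the snippet. A sketch of a plausible implementation, returning two train/test splits (the 90/10 and 50/50 ratios are assumptions):

def splitting(tagged_sents):
    # Hypothetical helper: two different train/test splits of the same data,
    # e.g. 90/10 and 50/50 (the exact ratios are an assumption).
    size1 = int(len(tagged_sents) * 0.9)
    size2 = int(len(tagged_sents) * 0.5)
    train1, test1 = tagged_sents[:size1], tagged_sents[size1:]
    train2, test2 = tagged_sents[:size2], tagged_sents[size2:]
    return [train1, train2, test1, test2]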
Example #5
def bitagger_train(train_sents, backoff=False):
    if backoff == True:
        t0 = nltk.DefaultTagger('NN')
        t1 = nltk.UnigramTagger(train_sents, backoff=t0)
        t2 = nltk.BigramTagger(train_sents, backoff=t1)
    else:
        t2 = nltk.BigramTagger(train_sents)
    return t2
def exercise2():
    news_tagged_sents = brown.tagged_sents(categories='news')
    #brown_sents = brown.sents(categories='news')
    size = int(len(news_tagged_sents) * 0.9)
    train_sents = news_tagged_sents[:size]
    test_sents = news_tagged_sents[size:]
    t0 = nltk.DefaultTagger('NN')
    t1 = nltk.UnigramTagger(train_sents, backoff=t0)
    t2 = nltk.BigramTagger(train_sents, backoff=t1)
    t3 = nltk.TrigramTagger(train_sents, backoff=t2)
    print t3.evaluate(test_sents)

    print("Part a")
    lore_tagged_sents = brown.tagged_sents(categories='lore')
    lore_size = int(len(lore_tagged_sents) * 0.9)
    lore_train_sents = lore_tagged_sents[:lore_size]
    lore_test_sents = lore_tagged_sents[lore_size:]
    t0 = nltk.DefaultTagger('NN')
    t1 = nltk.UnigramTagger(lore_train_sents, backoff=t0)
    t2 = nltk.BigramTagger(lore_train_sents, backoff=t1)
    t3 = nltk.TrigramTagger(lore_train_sents, backoff=t2)
    print "Compare DefaultTagger of lore and news:", t0.evaluate(
        lore_test_sents), t0.evaluate(test_sents)
    print "UnigramTagger val of lore", t1.evaluate(lore_test_sents)
    print "Compare the UnigramTagger from lore and news: ", t1.evaluate(
        lore_test_sents), t1.evaluate(test_sents)
    print "BigramTagger val of lore", t2.evaluate(lore_test_sents)
    print "Compare the BigramgramTagger from lore and news: ", t2.evaluate(
        lore_test_sents), t2.evaluate(test_sents)
    print "TrigramTagger val of lore", t3.evaluate(lore_test_sents)
    print "Compare the TrigramTagger from lore and news: ", t3.evaluate(
        lore_test_sents), t3.evaluate(test_sents)

    print("Part b")
    lore_size = 199  # 200th sentence
    lore_train_sents = lore_tagged_sents[:lore_size]
    lore_test_sents = lore_tagged_sents[lore_size:]

    unigram_tagger = nltk.UnigramTagger(lore_tagged_sents)
    unigram_val = unigram_tagger.evaluate(lore_tagged_sents)

    bigram_tagger = nltk.BigramTagger(lore_train_sents)
    bigram_val = bigram_tagger.evaluate(lore_test_sents)

    trigram_tagger = nltk.TrigramTagger(lore_train_sents)
    trigram_val = trigram_tagger.evaluate(lore_test_sents)

    print(brown.sents(categories='lore')[199])
    print("Unigram", unigram_val, 'vs.Bigram', bigram_val, 'vs.Trigram',
          trigram_val)
Example #7
def tagTexto(ws):
    size = int(len(ws) * 0.9)
    train_sents = ws[:size]
    test_sents = ws[size:]

    t0 = nltk.DefaultTagger('NN')
    t1 = nltk.UnigramTagger(train_sents, backoff=t0)
    t2 = nltk.BigramTagger(train_sents, backoff=t1)

    print('BigramTagger with backoff', t2.evaluate(test_sents))

    t3 = nltk.BigramTagger(train_sents)
    print('BigramTagger without backoff', t3.evaluate(test_sents))

    return t2
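tagTexto expects a list of already-tagged sentences (ws). A hedged usage sketch, assuming the Spanish cess_esp treebank as input:

import nltk
from nltk.corpus import cess_esp

# Train and evaluate on the CESS-ESP tagged sentences, then tag a new token list.
tagger = tagTexto(cess_esp.tagged_sents())
print(tagger.tag(["El", "gato", "duerme"]))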
Example #8
def main():
    """ Main function. """
    # Regular expression used as a backoff tagger
    regex = nltk.RegexpTagger(
        [
            (r'^-?[0-9]+(.[0-9]+)?$', 'CD'),
            (
                r'('+'|'.join(stopwords.words('spanish')) + ')$', 'STOP'
            ),
            (
                r'(?:www\.|(?!www))[^\s\.]+\.[^\s]{2,}|www\.[^\s]+\.[^\s]{2,}',
                'URL'
            ),
            (r'[0-9]+/[0-9]+/[0-9]+', 'DATE'),
            (r'([^A-Za-z0-9])+', 'PUNCT'),
            (r'\xbf', 'Faa'),
            (r'\xa1', 'Fat'),
            (r'.*', 'N_N')  # weird tokens (default)
        ]
    )

    # Create training set from the Conll2002 Spanish corpus
    train_set = []
    for text in nltk.corpus.conll2002.tagged_sents('esp.train'):
        train_set.append([(word.lower(), tag) for word, tag in text])

    logging.info('Training Unigram Tagger...')
    unigram_tagger = nltk.UnigramTagger(train_set, backoff=regex)
    logging.info('Training Bigram Tagger...')
    tagger_da = nltk.BigramTagger(train_set, backoff=unigram_tagger)

    logging.info('Pickling Part of Speech Tagger...')
    pickle.dump(tagger_da, open("tmp/pos_tagger.p", "wb"))
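Not shown above: reading the pickled tagger back. A minimal sketch (the path mirrors the one used by the dump call; the sample tokens are an assumption):

import pickle

with open("tmp/pos_tagger.p", "rb") as f:
    tagger_da = pickle.load(f)

# The training data was lower-cased, so lower-case the input tokens as well.
tokens = ["el", "gobierno", "visita", "www.example.com", "12/05/2002"]
print(tagger_da.tag(tokens))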
Example #9
def train_and_save(filename, train_set, num):
    outfile = open(filename, 'wb')
    t = None
    if num == 1:  #train a backoff
        t1 = nltk.UnigramTagger(train_set)
        t2 = nltk.BigramTagger(train_set, backoff=t1)
        model = {'everything': 'NN', 'max': 'NN'}
        t = nltk.UnigramTagger(model=model, backoff=t2)
    elif num == 2:
        t = nltk.BigramTagger(train_set)
    elif num == 3:
        t = nltk.TrigramTagger(train_set)
    else:
        return
    dump(t, outfile, -1)
    outfile.close()
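dump here is presumably pickle.dump (the protocol argument -1 matches its signature). A counterpart loader would look roughly like this:

from pickle import load

def load_tagger(filename):
    # Counterpart to train_and_save(): unpickle a previously saved tagger.
    with open(filename, 'rb') as infile:
        return load(infile)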
Example #10
def dump(config):
    """Loads word embeddngs an calculates neighbors.
    Args:
        config: an instance of TaggerConfiguration
    """

    tagger_dir = config.tagger_dir
    tagger_name = os.path.join(tagger_dir, "tagger.pkl")
    os.makedirs(tagger_dir, exist_ok=True)
    if not os.path.isfile(tagger_name):
        brown_tagged_sents = brown.tagged_sents(tagset='universal')
        size = int(len(brown_tagged_sents) * 0.9)
        train_sents = brown_tagged_sents[:size]
        test_sents = brown_tagged_sents[size:]
        t0 = nltk.DefaultTagger('X')
        t1 = nltk.UnigramTagger(train_sents, backoff=t0)
        t2 = nltk.BigramTagger(train_sents, backoff=t1)
        t3 = nltk.TrigramTagger(train_sents, backoff=t2)
        scores = [[t1.evaluate(test_sents), t1], [t2.evaluate(test_sents), t2],
                  [t3.evaluate(test_sents), t3]]
        best_score, best_tagger = max(scores, key=lambda x: x[0])
        print("Finished building POS tagger {0:.2f}%".format(best_score * 100))
        with open(tagger_name, 'wb') as f:
            pkl.dump(best_tagger, f)
    with open(tagger_name, 'rb') as f:
        return pkl.load(f)
    print("Finished saving %s and %s." % (ids_name, distances_name))
Example #11
    def __init__(self, train_sents):
        """
        The constructor takes a training data set and trains the classifier
        """
        train_data = [[(t, c) for w, t, c in nltk.chunk.tree2conlltags(sent)]
                      for sent in train_sents]
        self.tagger = nltk.BigramTagger(train_data)
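This constructor is the training half of the bigram-chunker recipe from the NLTK book: the BigramTagger learns to map POS-tag sequences to IOB chunk tags. A parse method along the following lines usually accompanies it (a sketch, not part of the original snippet; it assumes nltk is imported at module level):

    def parse(self, sentence):
        # sentence is a list of (word, pos) pairs; predict an IOB chunk tag for each POS tag.
        pos_tags = [pos for (word, pos) in sentence]
        tagged_pos_tags = self.tagger.tag(pos_tags)
        chunktags = [chunktag for (pos, chunktag) in tagged_pos_tags]
        conlltags = [(word, pos, chunktag)
                     for ((word, pos), chunktag) in zip(sentence, chunktags)]
        return nltk.chunk.conlltags2tree(conlltags)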
Example #12
def nltk_tagger(brown_words, brown_tags, brown_dev_words):
    # Hint: use the following line to format data to what NLTK expects for training
    training = [ zip(brown_words[i],brown_tags[i]) for i in xrange(len(brown_words)) ]
    #training = []
    #for i in xrange(len(brown_words)):
    #    temp_training = []
    #    for j in xrange(len(brown_words[i])):
    #        temp_training.append(tuple((unicode(brown_words[i][j]), unicode(brown_tags[i][j]))))
    #    training.append(temp_training)
    #for train in training:
    #    print type(train), type(train[0])
    default_tagger = nltk.DefaultTagger("NOUN")
    bigram_tagger = nltk.BigramTagger(training, backoff=default_tagger)
    trigram_tagger = nltk.TrigramTagger(training, backoff=bigram_tagger)

    # IMPLEMENT THE REST OF THE FUNCTION HERE
    tagged = []
    for words in brown_dev_words:        
        temp_tagged = []
        tagged_sentence = trigram_tagger.tag(words)
        #print tagged_sentence
        for word in tagged_sentence:
            temp_tagged.append(str(word[0]+'/'+str(word[1])))
        temp_sentence = " ".join(temp_tagged)
        temp_sentence = temp_sentence + "\r\n"
        tagged.append(temp_sentence)
    #for tag in tagged:
    #    print tag
    return tagged
def create_tagger():
    """Train a tagger from the Brown Corpus. This should not be called very
    often; only in the event that the tagger pickle wasn't found."""
    print("Building tagger...")
    train_sents = brown.tagged_sents()

    # These regexes were lifted from the NLTK book tagger chapter.
    t0 = nltk.RegexpTagger(
        [(r'^-?[0-9]+(.[0-9]+)?$', 'CD'), # cardinal numbers
         (r'(The|the|A|a|An|an)$', 'AT'), # articles
         (r'.*able$', 'JJ'),              # adjectives
         (r'.*ness$', 'NN'),              # nouns formed from adjectives
         (r'.*ly$', 'RB'),                # adverbs
         (r'.*s$', 'NNS'),                # plural nouns
         (r'.*ing$', 'VBG'),              # gerunds
         (r'.*ed$', 'VBD'),               # past tense verbs
         (r'.*', 'NN')                    # nouns (default)
        ])
    print("got t0")

    t1 = nltk.UnigramTagger(train_sents, backoff=t0)
    print("got t1")

    t2 = nltk.BigramTagger(train_sents, backoff=t1)
    print("got t2")

    t3 = nltk.TrigramTagger(train_sents, backoff=t2)
    print("Built tagger!")
    return t3
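The docstring mentions a tagger pickle. A hedged sketch of the caching step that would typically wrap create_tagger (the file name is an assumption):

import os
import pickle

TAGGER_PICKLE = "tagger.pickle"  # hypothetical cache location

if os.path.exists(TAGGER_PICKLE):
    with open(TAGGER_PICKLE, "rb") as f:
        tagger = pickle.load(f)
else:
    tagger = create_tagger()
    with open(TAGGER_PICKLE, "wb") as f:
        pickle.dump(tagger, f)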
Example #14
    def posTagging(self, s):
        """
        POS-tag a tokenized segment.
        input: ['i','love','you']
        output: [('i', 'PRON'), ('love', 'VERB'), ('you', 'PRON')]
        """
        brown_tagged_sents = brown.tagged_sents(tagset='universal',
                                                categories='news')

        default_tagger = nltk.DefaultTagger('NN')

        month = [
            u'january', u'february', u'march', u'april', u'may', u'june',
            u'july', u'august', u'september', u'october', u'november',
            u'december'
        ]

        np_words = [w.lower() for w in names.words()] + month
        np_tags = dict((word, 'NP') for word in np_words)
        np_tagger = nltk.UnigramTagger(model=np_tags, backoff=default_tagger)

        brown_unigram_tagger = nltk.UnigramTagger(brown_tagged_sents,
                                                  backoff=np_tagger)
        brown_bigram_tagger = nltk.BigramTagger(brown_tagged_sents,
                                                backoff=brown_unigram_tagger)
        brown_trigram_tagger = nltk.TrigramTagger(brown_tagged_sents,
                                                  backoff=brown_bigram_tagger)

        patterns = [(r'\bi\b', 'PRON')]
        regexp_tagger = nltk.RegexpTagger(patterns,
                                          backoff=brown_trigram_tagger)

        result = regexp_tagger.tag(s)
        return self.encodeutf8(result)
Example #15
def combined_tagging():
  train_sents, test_sents = split_dataset()
  t0 = nltk.DefaultTagger('NN')
  t1 = nltk.UnigramTagger(train_sents, backoff=t0)
  t2 = nltk.BigramTagger(train_sents, backoff=t1)
  t3 = nltk.TrigramTagger(train_sents, backoff=t2)
  print(t3.evaluate(test_sents))
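combined_tagging calls a split_dataset helper that is not shown. A minimal sketch of what it presumably does (the Brown news corpus and the 90/10 split are assumptions based on the other examples on this page):

from nltk.corpus import brown

def split_dataset():
  # Hypothetical helper: 90% of the Brown news sentences for training, 10% for testing.
  tagged_sents = brown.tagged_sents(categories='news')
  size = int(len(tagged_sents) * 0.9)
  return tagged_sents[:size], tagged_sents[size:]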
def nltk_tagger(brown_words, brown_tags, brown_dev_words):
    # Hint: use the following line to format data to what NLTK expects for training
    training = [ zip(brown_words[i],brown_tags[i]) for i in xrange(len(brown_words)) ]

    # IMPLEMENT THE REST OF THE FUNCTION HERE
    tagged = []

    #John's edit starts here

    unigram_tagger = nltk.DefaultTagger("NOUN")
    bigram_tagger = nltk.BigramTagger(training, backoff=unigram_tagger)
    trigram_tagger = nltk.TrigramTagger(training, backoff=bigram_tagger)

    # for sentence in brown_dev_words:
    #     tri_tags = trigram_tagger.tag(sentence)

    tri_tags = trigram_tagger.tag_sents(brown_dev_words)

    for sentence in tri_tags:
        final_sentence_combos = []
        for phrase in sentence:
            final_sentence_combos.append(phrase[0] + '/' + phrase[1])

        tagged.append(' '.join(final_sentence_combos) + '\n')

    #return provided by professor
    return tagged
Example #17
def question5():

    # Provide the output of your tagger from the previous question on the 200th sentence of the lore category
    # of the Brown Corpus (note that brown.sents(categories='lore')[199] produces the 200th sentence).

    # Would you tag this sentence in the same manner? Why?

    # Tagged sents for news category in Brown Corpus
    brown_news = brown.tagged_sents(categories='news')

    # Tagged sents for lore category in Brown Corpus
    brown_lore = brown.sents(categories='lore')

    # Brown news as train set
    train_sents = brown_news

    # Brown lore as test set for 200th sentence
    test_sents = brown_lore[199]

    t0 = nltk.DefaultTagger('NN')
    t1 = nltk.UnigramTagger(train_sents, backoff=t0)
    t2 = nltk.BigramTagger(train_sents, backoff=t1)
    t3 = nltk.TrigramTagger(train_sents, backoff=t2)

    # Report the brown_lore 200th sentence
    print("brown_lore: ", t3.tag(test_sents))
Example #18
    def __init__(self, pathToPickle=None):
        """
        ########
        BigramPOStagger:
        ########
        """
        if pathToPickle == None:
            #brown = nltk.corpus.brown.tagged_sents()
            #nounByDefault_tagger = nltk.DefaultTagger('NN')
            #unigram_tagger = nltk.UnigramTagger(brown,backoff=nounByDefault_tagger)
            #self.bigram_tagger = nltk.BigramTagger(brown,backoff=unigram_tagger)
            """
            NPS CHAT tagged words
            """
            chat_words = [nltk.corpus.nps_chat.tagged_words()]
            nounByDefault_tagger = nltk.DefaultTagger('NN')
            unigram_tagger = nltk.UnigramTagger(chat_words, backoff=nounByDefault_tagger)
            self.bigram_tagger = nltk.BigramTagger(chat_words, backoff=unigram_tagger)

            pickle.dump(self.bigram_tagger, open(pathToPickle, "wb"))

        else:
            self.bigram_tagger = pickle.load(open(pathToPickle))
Example #19
def main():
    training_data = make_corpus(
        load_text('text1.txt'),
        load_text('text2.txt'),
    )
    test_data = load_text('text3.txt')

    #print words

    #print nltk.FreqDist(tags)
    #print nltk.FreqDist(tags).max()

    default_tagger = nltk.DefaultTagger('noun')

    #baseline_tagger = nltk.UnigramTagger(model=automatic_tags, backoff=default_tagger)

    unigram_tagger = nltk.UnigramTagger(training_data, backoff=default_tagger)
    bigram_tagger = nltk.BigramTagger(training_data, backoff=unigram_tagger)
    #print unigram_tagger._context_to_tag

    hmm = HiddenMarkovModelTrainer().train(training_data)

    def run_tagger(t):
        test = t.tag(test_data.words)
        print test
        print t.evaluate(test_data.tagged_sents)
        #print nltk.ConfusionMatrix(test_data.tagged_words, test)

    run_tagger(hmm)
Example #20
def nltk_tagger(brown_words, brown_tags, brown_dev_words):
    tagged = []

    # Thank you for this
    training = [
        zip(brown_words[i], brown_tags[i]) for i in xrange(len(brown_words))
    ]

    # train the nltk taggers
    default_tagger = nltk.DefaultTagger('NOUN')
    bigram_tagger = nltk.BigramTagger(training, backoff=default_tagger)
    trigram_tagger = nltk.TrigramTagger(training, backoff=bigram_tagger)

    for token_list in brown_dev_words:
        # first, tag the tokens
        tagged_tuples = trigram_tagger.tag(token_list)

        # now, format the tagged tuples into strings
        sent_output = []
        for tag_tuple in tagged_tuples:
            sent_output.append(tag_tuple[0] + "/" + tag_tuple[1])

        tagged.append(sent_output)

    return tagged
Example #21
    def __preparar_tagger(self):

        nome_arquivo_tagger = './cache/postagger.pickle'

        if os.path.exists(nome_arquivo_tagger):
            logging.debug("Carregando o Pos-Tagger já treinado de " +
                          nome_arquivo_tagger)
            with open(nome_arquivo_tagger, 'rb') as arquivo:
                self.tagger = pickle.load(arquivo)

        else:
            logging.debug("Treinando o Pos-Tagger.")
            #tsents = floresta.tagged_sents()
            tsents = mac_morpho.tagged_sents()
            tsents = [[(w.lower(), self.__simplify_tag(t)) for (w, t) in sent]
                      for sent in tsents if sent]
            tagger0 = nltk.DefaultTagger('n')
            tagger1 = nltk.UnigramTagger(tsents, backoff=tagger0)
            tagger2 = nltk.BigramTagger(tsents, backoff=tagger1)
            #tagger3 = nltk.PerceptronTagger(tsents)
            self.tagger = tagger2

            logging.debug("Gravando o Pos-Tagger treinado em " +
                          nome_arquivo_tagger)
            with open(nome_arquivo_tagger, 'wb') as arquivo:
                pickle.dump(self.tagger, arquivo)
Example #22
def ngramTagger(train_sents, n=2, defaultTag='NN'):
    t0 = nltk.DefaultTagger(defaultTag)
    if (n <= 0):
        return t0
    elif (n == 1):
        t1 = nltk.UnigramTagger(train_sents, backoff=t0)
        return t1
    elif (n == 2):
        t1 = nltk.UnigramTagger(train_sents, backoff=t0)
        t2 = nltk.BigramTagger(train_sents, backoff=t1)
        return t2
    else:
        t1 = nltk.UnigramTagger(train_sents, backoff=t0)
        t2 = nltk.BigramTagger(train_sents, backoff=t1)
        t3 = nltk.TrigramTagger(train_sents, backoff=t2)
        return t3
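A hedged usage sketch for ngramTagger, assuming the 90/10 Brown news split used elsewhere on this page:

import nltk
from nltk.corpus import brown

tagged_sents = brown.tagged_sents(categories='news')
size = int(len(tagged_sents) * 0.9)
train_sents, test_sents = tagged_sents[:size], tagged_sents[size:]

# n=2 returns a BigramTagger backed off to a UnigramTagger and the 'NN' default.
bigram_with_backoff = ngramTagger(train_sents, n=2)
print(bigram_with_backoff.evaluate(test_sents))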
Example #23
    def tagger(self, train_set, level):
        """
        Returns a tagger based on the level, with '0' corresponding to
        default tagger, '1' corresponding to a unigram tagger, '2'
        corresponding to a bigram tagger and '3' corresponding to a
        trigram tagger, with each of the previous levels as backoffs

        Arguments:
        ---------
            train_set (list):
                First 90% of the tagged sentences used for training
            level (int):
                Type of tagger to be returned - '0' corresponds to
        default tagger, '1' corresponds to a unigram tagger, '2'
        corresponds to a bigram tagger and '3' corresponds to a
        trigram tagger, with each of the previous levels as backoffs

        Returns:
        --------
            By default, t2 (nltk.BigramTagger)
                Uses `nltk.UnigramTagger` and 'NN' as backoff-taggers
        """
        t = []
        while len(t) <= level:
            t.append(nltk.DefaultTagger(self.default_tagger))
            t.append(nltk.UnigramTagger(train_set, backoff=t[0]))
            t.append(nltk.BigramTagger(train_set, backoff=t[1]))
            t.append(nltk.TrigramTagger(train_set, backoff=t[2]))
        return t[level]
def nltk_tagger(brown_words, brown_tags, brown_dev_words):
    training = []
    for brown_sentence, tag_sentence in zip(brown_words, brown_tags):
        words = brown_sentence.split(' ')
        tags = tag_sentence.split(' ')
        sentence_tags = []
        for word, tag in zip(words, tags):
            sentence_tags.append((word, tag))

        sentence_tags.pop(0)
        sentence_tags.pop(0)
        sentence_tags.pop()
        sentence_tags.pop()

        training.append(sentence_tags)

    t0 = nltk.DefaultTagger('NN')
    t1 = nltk.UnigramTagger(training, backoff=t0)
    t2 = nltk.BigramTagger(training, backoff=t1)
    t3 = nltk.TrigramTagger(training, backoff=t2)

    # IMPLEMENT THE REST OF THE FUNCTION HERE
    tagged = []
    for sentence in brown_dev_words:
        tgd_stc = t3.tag(sentence)
        pairs = []
        for tup in tgd_stc:
            word, tg = tup
            joint = word + '/' + tg
            pairs.append(joint)

        joint = ' '.join(pairs)
        tagged.append(joint + '\n')
    return tagged
Example #25
    def _train_tagger(self):
        training_sents = treebank.tagged_sents()
        patterns = [  # for regexp tagger
            (r'^[\.|\?|!]$', '.'), (r'^,$', ','), (r'^\'$', '\'\''),
            (r'^\"$', '\"'), (r'^\($', '('),
            (r'^\)$', ')'), (r'^[=|/]$', 'SYM'), (r'.*ing$', 'VBG'),
            (r'.*ed$', 'VBD'), (r'.*es$', 'VBZ'), (r'.*ould$', 'MD'),
            (r'.*\'s$', 'POS'), (r'.*s$', 'NNS'),
            (r'(The|the|A|a|An|an)$', 'AT'), (r'.*able$', 'JJ'),
            (r'.*ly$', 'RB'), (r'.*s$', 'NNS'), (r'^[0-9][0-9]*$', 'CD'),
            (r'^[0-9]([0-9]*[-|.|,|/][0-9]*)*$', 'CD'),
            (r'^([0-9]*\.[0-9]*)*$', 'CD'), (r'^[^a-zA-Z]*$', ':'),
            (r'[A-Z].*', 'NNP'), (r'.*', 'NN')
        ]

        default_tagger = nltk.DefaultTagger('NN')
        regexp_tagger = nltk.RegexpTagger(patterns, backoff=default_tagger)
        unigram_tagger = nltk.UnigramTagger(training_sents,
                                            backoff=regexp_tagger)
        bigram_tagger = nltk.BigramTagger(training_sents,
                                          backoff=unigram_tagger)
        trigram_tagger = nltk.TrigramTagger(training_sents,
                                            backoff=bigram_tagger)

        self.final_tagger = trigram_tagger
Example #26
    def __init__(self):
        """Initialization method of :class:`TopicExtractor` class.
        """

        # This is our fast Part of Speech tagger
        #############################################################################
        brown_train = brown.tagged_sents(categories='news')
        regexp_tagger = nltk.RegexpTagger(
            [(r'^-?[0-9]+(.[0-9]+)?$', 'CD'),
                (r'(-|:|;)$', ':'),
                (r'\'*$', 'MD'),
                (r'(The|the|A|a|An|an)$', 'AT'),
                (r'.*able$', 'JJ'),
                (r'^[A-Z].*$', 'NNP'),
                (r'.*ness$', 'NN'),
                (r'.*ly$', 'RB'),
                (r'.*s$', 'NNS'),
                (r'.*ing$', 'VBG'),
                (r'.*ed$', 'VBD'),
                (r'.*', 'NN')
            ])
        unigram_tagger = nltk.UnigramTagger(brown_train, backoff=regexp_tagger)
        self.bigram_tagger = nltk.BigramTagger(brown_train, backoff=unigram_tagger)
        #############################################################################

        # This is our semi-CFG; Extend it according to your own needs
        #############################################################################
        self.cfg = {}
        self.cfg["NNP+NNP"] = "NNP"
        self.cfg["NN+NN"] = "NNI"
        self.cfg["NNI+NN"] = "NNI"
        self.cfg["JJ+JJ"] = "JJ"
        self.cfg["JJ+NN"] = "NNI"
def exercise3():
    # Compare the given TrigramTagger from the previous question with a TrigramTagger where no backoff is provided.
    # Train this tagger on all of the sentences from the Brown corpus with the category news.
    # Then evaluate your tagger using "evaluate" function on  all of the sentences from the Brown corpus with the category lore.
    # Report the numbers. Which tagger performs better? Why?
    news_tagged_sents = brown.tagged_sents(categories='news')
    train_sents = news_tagged_sents  # train on all of the news sentences
    t0 = nltk.DefaultTagger('NN')
    t1 = nltk.UnigramTagger(train_sents, backoff=t0)
    t2 = nltk.BigramTagger(train_sents, backoff=t1)
    t3_backoff = nltk.TrigramTagger(train_sents, backoff=t2)  # tagger from the previous question
    t3 = nltk.TrigramTagger(train_sents)  # no backoff

    # category lore is the evaluation set
    lore_tagged_sents = brown.tagged_sents(categories='lore')
    lore_trigram_val = t3.evaluate(lore_tagged_sents)
    print("trigram without backoff", lore_trigram_val)
    print("Trigram with backoff ", t3_backoff.evaluate(lore_tagged_sents))
    print("Brown corpus category lore value", lore_trigram_val)

    print "Category news tagger peforms better because it evaluates tags of the same category,"
    print "thus yielding more accurate results. It performs better if evaluate tags in the same category"
Example #28
    def __init__(self):
        #nltk.download()
        self.type = "text"

        # Code taken from 'Natural Language Processing with Python' by Steven Bird. Pg. 203
        # Categorise training & test data
        print "Generating training & test data..."
        self.brown_tagged_sents = brown.tagged_sents(categories='news')

        # Use 90% to construct a model & 10% to test the model
        size = int(len(self.brown_tagged_sents) * 0.9)
        self.train_sents = self.brown_tagged_sents[:size]
        self.test_sents = self.brown_tagged_sents[size:]

        # Setup multiple backup taggers
        print "Creating taggers..."
        self.default_tagger = nltk.DefaultTagger('NN')
        self.uni_tagger = nltk.UnigramTagger(self.train_sents,
                                             backoff=self.default_tagger)
        self.bi_tagger = nltk.BigramTagger(self.train_sents,
                                           backoff=self.uni_tagger)
        self.tri_tagger = nltk.TrigramTagger(self.train_sents,
                                             backoff=self.bi_tagger)

        super(TextSystem, self).__init__(type)
Example #29
def nltk_tagger(brown_words, brown_tags, brown_dev_words):
    # Hint: use the following line to format data to what NLTK expects for training
    #training = [ zip(brown_words[i],brown_tags[i]) for i in xrange(len(brown_words)) ]
    training = []
    for i in xrange(len(brown_words)):
        words = [unicode(x, 'utf-8') for x in brown_words[i]]
        tags = [unicode(x, 'utf-8') for x in brown_tags[i]]
        training.append(zip(words, tags))

    print(training[0])
    #input("continue...")

    # IMPLEMENT THE REST OF THE FUNCTION HERE
    print("\nIn NLTK tagger code")
    default_tagger = nltk.DefaultTagger('NOUN')
    bigram_tagger = nltk.BigramTagger(training, backoff=default_tagger)
    trigram_tagger = nltk.TrigramTagger(training, backoff=bigram_tagger)

    tagged = []
    for sentence in brown_dev_words:
        tagged_tuples = trigram_tagger.tag(sentence)
        tagged_sentence = ""
        for word_tag in tagged_tuples:
            tagged_sentence += word_tag[0] + "/" + word_tag[1] + " "
        tagged_sentence += "\n"
        tagged.append(tagged_sentence)
        Pd.printdot(1000)  #monitor progress

    return tagged
Example #30
    def word_tagger(self):
        default_tagger = nltk.DefaultTagger('NN')
        unigram_tagger = nltk.UnigramTagger(self.training_sents,
                                            backoff=default_tagger)
        bigram_tagger = nltk.BigramTagger(self.training_sents,
                                          backoff=unigram_tagger)
        self.text = bigram_tagger.tag(self.text)