Example No. 1
import os
import pickle
import time

from nltk.corpus.reader import TaggedCorpusReader
from nltk.tag import BigramTagger, TrigramTagger, UnigramTagger, tnt


def make_pos_model(model_type):
    now = time.time()

    reader = TaggedCorpusReader('.', 'greek_training_set.pos')
    train_sents = reader.tagged_sents()
    if model_type == 'unigram':
        tagger = UnigramTagger(train_sents)
        file = 'unigram.pickle'
    elif model_type == 'bigram':
        tagger = BigramTagger(train_sents)
        file = 'bigram.pickle'
    elif model_type == 'trigram':
        tagger = TrigramTagger(train_sents)
        file = 'trigram.pickle'
    elif model_type == 'backoff':
        tagger1 = UnigramTagger(train_sents)
        tagger2 = BigramTagger(train_sents, backoff=tagger1)
        tagger = TrigramTagger(train_sents, backoff=tagger2)
        file = '123grambackoff.pickle'
    elif model_type == 'tnt':
        tagger = tnt.TnT()
        tagger.train(train_sents)
        file = 'tnt.pickle'
    else:
        print('Invalid model_type.')
        return

    _dir = os.path.expanduser('~/greek_models_cltk/taggers/pos')
    os.makedirs(_dir, exist_ok=True)
    path = os.path.join(_dir, file)
    with open(path, 'wb') as f:
        pickle.dump(tagger, f)

    print('Completed training {0} model in {1} seconds to {2}.'.format(
        model_type,
        time.time() - now, path))
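
A minimal usage sketch for the function above (the path and tokens are illustrative; it assumes the model was already trained and pickled):

import os
import pickle

# Load the pickled backoff model back and tag a tokenized sentence.
path = os.path.expanduser('~/greek_models_cltk/taggers/pos/123grambackoff.pickle')
with open(path, 'rb') as f:
    tagger = pickle.load(f)
print(tagger.tag(['λόγος', 'ἐστί']))  # illustrative Greek tokens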
Example No. 2
    def train(self):

        self.re_tagger = nltk.RegexpTagger(self.patterns)
        self.bi_tagger = BigramTagger(brown.tagged_sents(),
                                      backoff=self.re_tagger)
        self.tri_tagger = TrigramTagger(brown.tagged_sents(),
                                        backoff=self.bi_tagger)
Example No. 3
    def train(self, sentence_list):
        """Trains the tagger from the tagged sentences provided
        """
        noun_fallback = DefaultTagger('NN')
        affix_fallback = AffixTagger(sentence_list, backoff=noun_fallback)
        unigram_fallback = UnigramTagger(sentence_list, backoff=affix_fallback)
        bigram_fallback = BigramTagger(sentence_list, backoff=unigram_fallback)
        trigram_fallback = TrigramTagger(sentence_list,
                                         backoff=bigram_fallback)
        templates = [
            brill.SymmetricProximateTokensTemplate(brill.ProximateTagsRule,
                                                   (1, 1)),
            brill.SymmetricProximateTokensTemplate(brill.ProximateTagsRule,
                                                   (2, 2)),
            brill.SymmetricProximateTokensTemplate(brill.ProximateTagsRule,
                                                   (1, 2)),
            brill.SymmetricProximateTokensTemplate(brill.ProximateTagsRule,
                                                   (1, 3)),
            brill.SymmetricProximateTokensTemplate(brill.ProximateWordsRule,
                                                   (1, 1)),
            brill.SymmetricProximateTokensTemplate(brill.ProximateWordsRule,
                                                   (2, 2)),
            brill.SymmetricProximateTokensTemplate(brill.ProximateWordsRule,
                                                   (1, 2)),
            brill.SymmetricProximateTokensTemplate(brill.ProximateWordsRule,
                                                   (1, 3)),
            brill.ProximateTokensTemplate(brill.ProximateTagsRule, (-1, -1),
                                          (1, 1)),
            brill.ProximateTokensTemplate(brill.ProximateWordsRule, (-1, -1),
                                          (1, 1))
        ]

        trainer = brill.FastBrillTaggerTrainer(trigram_fallback, templates)
        self.tagger = trainer.train(sentence_list, max_rules=100, min_score=3)
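
The brill.SymmetricProximateTokensTemplate / ProximateTagsRule API used above is from NLTK 2.x and no longer exists in NLTK 3. A rough NLTK 3 equivalent of the same idea (the backoff chain plus Brill rule learning, using the bundled brill24 template set, as Examples No. 10 and No. 28 below do) might look like this sketch:

from nltk.tag import (AffixTagger, BigramTagger, DefaultTagger,
                      TrigramTagger, UnigramTagger)
from nltk.tag import brill, brill_trainer

def train_brill_nltk3(sentence_list):
    # Same fallback chain as above: NN default -> affix -> uni -> bi -> tri.
    backoff = DefaultTagger('NN')
    for cls in (AffixTagger, UnigramTagger, BigramTagger, TrigramTagger):
        backoff = cls(sentence_list, backoff=backoff)
    brill.Template._cleartemplates()  # template ids are registered globally
    trainer = brill_trainer.BrillTaggerTrainer(backoff, brill.brill24())
    return trainer.train(sentence_list, max_rules=100, min_score=3)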
Example No. 4
    def get_pos_tagger(self):
        from nltk.corpus import brown

        regexp_tagger = RegexpTagger(
            [
                (r"^-?[0-9]+(\.[0-9]+)?$", "CD"),  # cardinal numbers
                (r"(The|the|A|a|An|an)$", "AT"),  # articles
                (r".*able$", "JJ"),  # adjectives
                (r".*ness$", "NN"),  # nouns formed from adjectives
                (r".*ly$", "RB"),  # adverbs
                (r".*s$", "NNS"),  # plural nouns
                (r".*ing$", "VBG"),  # gerunds
                (r".*ed$", "VBD"),  # past tense verbs
                (r".*", "NN"),  # nouns (default)
            ]
        )
        brown_train = brown.tagged_sents(categories="news")
        unigram_tagger = UnigramTagger(brown_train, backoff=regexp_tagger)
        bigram_tagger = BigramTagger(brown_train, backoff=unigram_tagger)
        trigram_tagger = TrigramTagger(brown_train, backoff=bigram_tagger)

        # Override particular words
        main_tagger = RegexpTagger(
            [(r"(A|a|An|an)$", "ex_quant"), (r"(Every|every|All|all)$", "univ_quant")],
            backoff=trigram_tagger,
        )

        return main_tagger
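
A quick check of the returned tagger (hypothetical usage; `obj` stands in for an instance of the enclosing class):

tagger = obj.get_pos_tagger()  # obj is a hypothetical instance
print(tagger.tag('Every dog barked loudly'.split()))
# 'Every' -> 'univ_quant' via the overriding RegexpTagger; the other tokens
# fall back to the trigram -> bigram -> unigram -> regexp chain trained on
# Brown news.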
Example No. 6
def train_tagger(tagger_name):
    train_sents = treebank.tagged_sents()[:5000]
    if tagger_name in ('TnT', 'tagger'):
        trained_tagger = tnt.TnT()
        trained_tagger.train(train_sents)
    else:
        tagger1 = DefaultTagger('NN')
        tagger2 = TrigramTagger(train_sents, backoff=tagger1)
        tagger3 = BigramTagger(train_sents, backoff=tagger2)
        trained_tagger = UnigramTagger(train_sents, backoff=tagger3)
    return trained_tagger
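
Note the unusual backoff order in the non-TnT branch: the UnigramTagger is consulted first and falls back to the bigram and then trigram models, the reverse of the usual trigram-first chain. A hedged usage sketch:

from nltk.corpus import treebank
from nltk.tag import DefaultTagger, UnigramTagger, BigramTagger, TrigramTagger
from nltk.tag import tnt

tagger = train_tagger('backoff')
print(tagger.tag(['Pierre', 'Vinken', 'will', 'join', 'the', 'board']))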
Example No. 7
def train_tagger(language, model_type, feature, train_sents):
    if model_type == 'unigram':
        tagger = UnigramTagger(train_sents)
    elif model_type == 'bigram':
        tagger = BigramTagger(train_sents)
    elif model_type == 'trigram':
        tagger = TrigramTagger(train_sents)
    elif model_type == 'backoff':
        tagger1 = UnigramTagger(train_sents)
        tagger2 = BigramTagger(train_sents, backoff=tagger1)
        tagger = TrigramTagger(train_sents, backoff=tagger2)
    elif model_type == 'crf':
        tagger = CRFTagger()
        tagger.train(train_sents,
                     'taggers/{0}/{1}/crf.pickle'.format(language, feature))
    elif model_type == 'perceptron':
        tagger = PerceptronTagger(load=False)
        tagger.train(train_sents)
    else:
        raise ValueError('Unknown model_type: {0}'.format(model_type))

    return tagger
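
Unlike the n-gram branches, the 'crf' branch also writes its model to disk as a side effect of training (NLTK's CRFTagger requires the python-crfsuite package). An illustrative call, with placeholder language/feature names:

from nltk.corpus import treebank

train_sents = treebank.tagged_sents()[:3000]
tagger = train_tagger('english', 'backoff', 'pos', train_sents)  # names are illustrative
print(tagger.tag(['Pierre', 'Vinken', 'joined', 'the', 'board']))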
Example No. 8
def ngram_tagger(tagged_sents):
    # A character class avoids the multi-line triple-quoted pattern the
    # original used, which embedded literal whitespace and an empty `||`
    # alternative in the regex.
    patterns = [(r'[bcdfghjklmnpqrstvwxz]e[bcdfghjklmnpqrstvwxz]', 'MORA'),
                (r'.*(a|e|i|o|u|ä|î|ô|ü)(a|e|i|o|u|ä|î|ô|ü)', 'DOPPEL'),
                (r'.*', 'MORA_HAUPT')]  # default
    regex_tagger = nltk.RegexpTagger(patterns)

    tagger1 = UnigramTagger(tagged_sents, backoff=regex_tagger)
    # cutoff = 3, if necessary
    tagger2 = BigramTagger(tagged_sents, backoff=tagger1)
    tagger3 = TrigramTagger(tagged_sents, backoff=tagger2)

    return tagger3
Example No. 9
    def test_ngram_taggers(self):
        unitagger = UnigramTagger(self.corpus, backoff=self.default_tagger)
        bitagger = BigramTagger(self.corpus, backoff=unitagger)
        tritagger = TrigramTagger(self.corpus, backoff=bitagger)
        ntagger = NgramTagger(4, self.corpus, backoff=tritagger)

        encoded = self.encoder.encode(ntagger)
        decoded = self.decoder.decode(encoded)

        self.assertEqual(repr(ntagger), repr(decoded))
        self.assertEqual(repr(tritagger), repr(decoded.backoff))
        self.assertEqual(repr(bitagger), repr(decoded.backoff.backoff))
        self.assertEqual(repr(unitagger), repr(decoded.backoff.backoff.backoff))
        self.assertEqual(repr(self.default_tagger), 
                         repr(decoded.backoff.backoff.backoff.backoff))
Example No. 10
def train_brill_tagger(tagged_sents):

    # The brill tagger module in NLTK.
    Template._cleartemplates()
    templates = brill24()  # or fntbl37
    # default_tagger = nltk.DefaultTagger('MORA_HAUPT')
    patterns = [(r'[bcdfghjklmnpqrstvwxz]e[bcdfghjklmnpqrstvwxz]', 'MORA'),
                (r'.*(a|e|i|o|u|ä|î|ô|ü)(a|e|i|o|u|ä|î|ô|ü)', 'DOPPEL'),
                (r'.*', 'MORA_HAUPT')]  # default
    regex_tagger = nltk.RegexpTagger(patterns)
    tagger1 = UnigramTagger(tagged_sents, backoff=regex_tagger)
    # cutoff = 3, if necessary
    tagger2 = BigramTagger(tagged_sents, backoff=tagger1)
    tagger3 = TrigramTagger(tagged_sents, backoff=tagger2)
    tagger4 = brill_trainer.BrillTaggerTrainer(tagger3, templates, trace=3)
    tagger5 = tagger4.train(tagged_sents, max_rules=200)

    return tagger5
Example No. 11
def train_tagger():
    '''
    An example of training a POS tagger using a probability-based
    trigram model.

    A POS tagger identifies the word class of each word.
    E.g.: Isso é um teste = Isso-PROSUB é-V um-ART teste-N
    (Pronoun Verb Article Noun)
    '''

    # Load a Portuguese dataset of manually tagged sentences,
    # keeping only the first part of each composite tag
    data = [
        [(w, re.split('[|-]', tag)[0]) for w, tag in sent]
        for sent in mac_morpho.tagged_sents()]

    # Default word class. N means noun
    tagger0 = DefaultTagger('N')
    print('training unigram')
    tagger1 = UnigramTagger(data, backoff=tagger0)
    print('training bigram')
    tagger2 = BigramTagger(data, backoff=tagger1)
    print('training trigram')
    return TrigramTagger(data, backoff=tagger2)
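
A hedged usage sketch (the sentence comes from the docstring above; training over all of mac_morpho can take a few minutes):

import re
from nltk.corpus import mac_morpho
from nltk.tag import DefaultTagger, UnigramTagger, BigramTagger, TrigramTagger

tagger = train_tagger()
print(tagger.tag('Isso é um teste'.split()))
# expected shape: [('Isso', 'PROSUB'), ('é', 'V'), ('um', 'ART'), ('teste', 'N')]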
Example No. 12
# unigrams
from nltk.tag import UnigramTagger
unigram_tagger = UnigramTagger(train_sents)
tagger = UnigramTagger(train_sents, cutoff=3)
print(tagger.evaluate(test_sents))

# bigrams
from nltk.tag import BigramTagger
bigram_tagger = BigramTagger(train_sents)
tagger = BigramTagger(train_sents, cutoff=3)
print(tagger.evaluate(test_sents))

# trigrams
from nltk.tag import TrigramTagger
trigram_tagger = TrigramTagger(train_sents)
tagger = TrigramTagger(train_sents, cutoff=3)
print(tagger.evaluate(test_sents))

# backoff
bigram_tagger = BigramTagger(train_sents, backoff=unigram_tagger)


def backoff_tagger(train_sents, tagger_classes, backoff=None):
    for cls in tagger_classes:
        backoff = cls(train_sents, backoff=backoff)
    return backoff


tagger = backoff_tagger(train_sents,
                        [UnigramTagger, BigramTagger, TrigramTagger],
                        backoff=DefaultTagger('NN'))
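
Note the helper wraps left to right, so the last class in the list (TrigramTagger here) ends up outermost and is tried first. Evaluating the chain, assuming the same train/test split as above:

print(tagger.evaluate(test_sents))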
Example No. 13

from nltk.corpus import brown
from nltk.tag import UnigramTagger
from nltk.tag import BigramTagger
from nltk.tag import TrigramTagger

# Pull tagged sentences from the Brown corpus
sentences = brown.tagged_sents(categories='news')
# Use 4,000 sentences for training and the remaining 623 for testing
train = sentences[:4000]
test = sentences[4000:]
# Build the backoff tagger chain
unigram = UnigramTagger(train)
bigram = BigramTagger(train, backoff=unigram)
trigram = TrigramTagger(train, backoff=bigram)
# Check accuracy
trigram.evaluate(test)

# TF-IDF
import numpy as np
from sklearn.feature_extraction.text import TfidfVectorizer
# Create the text data
text_data = np.array(
    ['I love Brazil. Brazil!', 'Sweden is best', 'Germany beats both'])
# Create the TF-IDF feature matrix
tfidf = TfidfVectorizer()
feature_matrix = tfidf.fit_transform(text_data)
# Inspect the TF-IDF feature matrix
feature_matrix
feature_matrix.toarray()
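
To see which column belongs to which term, the fitted vectorizer's vocabulary can be inspected (a quick sketch; vocabulary_ maps terms to column indices):

print(tfidf.vocabulary_)     # e.g. {'love': ..., 'brazil': ..., ...}
print(feature_matrix.shape)  # (3 documents, one column per vocabulary term)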
Example No. 14
#-----------------------------------------------------

print('Bigram tagger accuracy:')

from nltk.tag import BigramTagger

bigramTagger = BigramTagger(training)

print(bigramTagger.evaluate(testing))

#-----------------------------------------------------
print('Trigram tagger accuracy:')

from nltk.tag import TrigramTagger
trigramTagger = TrigramTagger(training)

print(trigramTagger.evaluate(testing))
#-----------------------------------------------------

#Brill Tagger
from nltk.tag import brill, brill_trainer
# make sure you've got some train_sents!
#brill_tagger = train_brill_tagger(unigramTagger, training)

print('Brill tagger accuracy:')
#print(brill_tagger.evaluate(testing))

#------------------------------------------------------

# Backoff tagger
Example No. 15
>>> print(default_tagger.evaluate(brown_tagged_sents))

# N-gram taggers

>>> from nltk.tag import UnigramTagger
>>> from nltk.tag import DefaultTagger
>>> from nltk.tag import BigramTagger
>>> from nltk.tag import TrigramTagger
# we are dividing the data into a test and a train set to evaluate our taggers.
>>> train_data = brown_tagged_sents[:int(len(brown_tagged_sents) * 0.9)]
>>> test_data = brown_tagged_sents[int(len(brown_tagged_sents) * 0.9):]
>>> unigram_tagger = UnigramTagger(train_data, backoff=default_tagger)
>>> print(unigram_tagger.evaluate(test_data))
>>> bigram_tagger = BigramTagger(train_data, backoff=unigram_tagger)
>>> print(bigram_tagger.evaluate(test_data))
>>> trigram_tagger = TrigramTagger(train_data, backoff=bigram_tagger)
>>> print(trigram_tagger.evaluate(test_data))

# Regex tagger 

>>> from nltk.tag.sequential import RegexpTagger
>>> regexp_tagger = RegexpTagger(
         [(r'^-?[0-9]+(\.[0-9]+)?$', 'CD'),   # cardinal numbers
          (r'(The|the|A|a|An|an)$', 'AT'),    # articles
          (r'.*able$', 'JJ'),                 # adjectives
          (r'.*ness$', 'NN'),                 # nouns formed from adjectives
          (r'.*ly$', 'RB'),                   # adverbs
          (r'.*s$', 'NNS'),                   # plural nouns
          (r'.*ing$', 'VBG'),                 # gerunds
          (r'.*ed$', 'VBD'),                  # past tense verbs
          (r'.*', 'NN')])                     # nouns (default)
Example No. 16
for page in list(root):
    pairs = []
    text = page.find('text').text
    language = page.find('language').text
    pos = page.find('pos_tags').text
    splitText = text.split(" ")[1:-1]
    posText = pos.split(" ")[1:-1]
    for i in range(len(splitText)):
        pairs.append((splitText[i], posText[i]))
    data.append(pairs)
    count = count + 1
shuffle(data)

# Divide data into train and test sets (90/10 split)
ninetyPercent = count * 0.9
training_set = data[0:int(ninetyPercent)]
test_set = data[int(ninetyPercent):]

# Train
train_data = training_set
tag1 = DefaultTagger('NN')
tag2 = UnigramTagger(train_data, backoff=tag1)
tag3 = BigramTagger(train_data, backoff=tag2)
tag4 = TrigramTagger(train_data, backoff=tag3)

# Accuracy
# print(tag4.tag('open a start up'.split()))
# print(tag4.tag('OUT nahi KARDO ISSE BAHUT HOGAYA aaj Salman'.split()))
gold_sentences = test_set
print(tag4.evaluate(gold_sentences))
Example No. 17
from nltk.tag import DefaultTagger
#tagger = DefaultTagger('NN')

#print(tagger.evaluate(test_set))

from nltk.tag import UnigramTagger
unigramTagger = UnigramTagger(train_set)
#print(unigramTagger.evaluate(test_set))
#print(unigramTagger.evaluate(train_set))
#unigramTagger2 = UnigramTagger(train_set, cutoff=3)
#print(unigramTagger2.evaluate(test_set))
#print(unigramTagger2.evaluate(train_set))

from nltk.tag import BigramTagger, TrigramTagger
bigramTagger = BigramTagger(train_set, cutoff=2)
trigramTagger = TrigramTagger(train_set, cutoff=3)
#print(bigramTagger.evaluate(test_set))
#print(trigramTagger.evaluate(test_set))

#import brill_tagger_wrapper
#from brill_tagger_wrapper import train_brill_tagger
#brillTagger = train_brill_tagger(unigramTagger, train_set)
#print(brillTagger.evaluate(test_set))


def backoff_tagger(train_sents, tagger_classes, backoff=None):
    for cls in tagger_classes:
        backoff = cls(train_sents, backoff=backoff)
    return backoff
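
Putting the helper to work with the taggers above (a sketch assuming the train_set/test_set and nltk.tag imports from earlier in this example):

tagger = backoff_tagger(train_set,
                        [UnigramTagger, BigramTagger, TrigramTagger],
                        backoff=DefaultTagger('NN'))
print(tagger.evaluate(test_set))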

Example No. 18
def cltk_pos_cv(full_training_set, local_dir_rel):
    print("full_training_set", full_training_set)

    unigram_accuracies = []
    bigram_accuracies = []
    trigram_accuracies = []
    backoff_accuracies = []
    tnt_accuracies = []

    with open(full_training_set) as f:
        training_set_string = f.read()

    pos_set = training_set_string.split('\n\n')  # mk into a list

    sentence_count = len(pos_set)  # 3473
    tenth = math.ceil(sentence_count / 10)

    random.seed(0)
    random.shuffle(pos_set)

    def chunks(l, n):
        """Yield successive n-sized chunks from l.
        http://stackoverflow.com/a/312464
        """
        for i in range(0, len(l), n):
            yield l[i:i+n]

    ten_parts = list(chunks(pos_set, tenth))  # a list of 10 lists with ~347 sentences each

    for counter, part in enumerate(ten_parts):
        # map test list to part of given loop
        test_set = ten_parts[counter]  # or: test_set = part
        
        # filter out this loop's test index
        training_set_lists = [x for x in ten_parts if x is not ten_parts[counter]]
        
        # next concatenate the list together into 1 file ( http://stackoverflow.com/a/952952 )
        training_set = [item for sublist in training_set_lists for item in sublist]
            
        # save shuffled tests to file (as NLTK trainers expect)
        #local_dir_rel = '~/cltk_data/user_data'
        local_dir = os.path.expanduser(local_dir_rel)
        if not os.path.isdir(local_dir):
            os.makedirs(local_dir)

        test_path = os.path.join(local_dir, 'test.pos')
        with open(test_path, 'w') as f:
            f.write('\n\n'.join(test_set))

        train_path = os.path.join(local_dir, 'train.pos')
        with open(train_path, 'w') as f:
            f.write('\n\n'.join(training_set))

        # read POS corpora
        print("local_dir", local_dir)
        train_reader = TaggedCorpusReader(local_dir, 'train.pos')
        train_sents = train_reader.tagged_sents()

        test_reader = TaggedCorpusReader(local_dir, 'test.pos')
        test_sents = test_reader.tagged_sents()
        
        print('Loop #' + str(counter))
        # make unigram tagger
        unigram_tagger = UnigramTagger(train_sents)
        # evaluate unigram tagger
        unigram_accuracy = unigram_tagger.evaluate(test_sents)
        unigram_accuracies.append(unigram_accuracy)
        print('Unigram:', unigram_accuracy)
        
        # make bigram tagger
        bigram_tagger = BigramTagger(train_sents)
        # evaluate bigram tagger
        bigram_accuracy = bigram_tagger.evaluate(test_sents)
        bigram_accuracies.append(bigram_accuracy)
        print('Bigram:', bigram_accuracy)
        
        # make trigram tagger
        trigram_tagger = TrigramTagger(train_sents)
        # evaluate trigram tagger
        trigram_accuracy = trigram_tagger.evaluate(test_sents)
        trigram_accuracies.append(trigram_accuracy)
        print('Trigram:', trigram_accuracy)
        
        # make 1, 2, 3-gram backoff tagger
        tagger1 = UnigramTagger(train_sents)
        tagger2 = BigramTagger(train_sents, backoff=tagger1)
        tagger3 = TrigramTagger(train_sents, backoff=tagger2)
        # evaluate backoff tagger
        backoff_accuracy = tagger3.evaluate(test_sents)
        backoff_accuracies.append(backoff_accuracy)
        print('1, 2, 3-gram backoff:', backoff_accuracy)
        
        # make tnt tagger
        tnt_tagger = tnt.TnT()
        tnt_tagger.train(train_sents)
        # evaluate tnt tagger
        tnt_accuracy = tnt_tagger.evaluate(test_sents)
        tnt_accuracies.append(tnt_accuracy)
        print('TnT:', tnt_accuracy)

    final_accuracies_list = []
    mean_accuracy_unigram = mean(unigram_accuracies)
    standard_deviation_unigram = stdev(unigram_accuracies)
    uni = {'unigram': {'mean': mean_accuracy_unigram, 'sd': standard_deviation_unigram}}
    final_accuracies_list.append(uni)

    mean_accuracy_bigram = mean(bigram_accuracies)
    standard_deviation_bigram = stdev(bigram_accuracies)
    bi = {'bigram': {'mean': mean_accuracy_bigram, 'sd': standard_deviation_bigram}}
    final_accuracies_list.append(bi)

    mean_accuracy_trigram = mean(trigram_accuracies)
    standard_deviation_trigram = stdev(trigram_accuracies)
    tri = {'trigram': {'mean': mean_accuracy_trigram, 'sd': standard_deviation_trigram}}
    final_accuracies_list.append(tri)

    mean_accuracy_backoff = mean(backoff_accuracies)
    standard_deviation_backoff = stdev(backoff_accuracies)
    back = {'1, 2, 3-gram backoff': {'mean': mean_accuracy_backoff, 'sd': standard_deviation_backoff}}
    final_accuracies_list.append(back)

    mean_accuracy_tnt = mean(tnt_accuracies)
    standard_deviation_tnt = stdev(tnt_accuracies)
    tnt_score = {'tnt': {'mean': mean_accuracy_tnt, 'sd': standard_deviation_tnt}}
    final_accuracies_list.append(tnt_score)

    final_dict = {}
    for x in final_accuracies_list:
        final_dict.update(x)
    
    return final_dict
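
An illustrative invocation (paths are hypothetical; the function expects a plain-text .pos training file and a writable scratch directory):

results = cltk_pos_cv('greek_training_set.pos', '~/cltk_data/user_data')
for model, scores in results.items():
    print(model, scores['mean'], scores['sd'])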
Example No. 19
    def baseline_tagger(self):

        from nltk.corpus import brown
        from nltk.tag import TrigramTagger

        print("Number of words in Brown corpus: 1333212")
        print("Number of unique tags in Brown corpus: 474")

        f = open("input.txt", "r").read()

        file_info = stat("input.txt")

        print("Size of test file: ", file_info.st_size)

        # split into sentences, then tokenize each sentence into words
        # (sent_tokenize and word_tokenize from nltk.tokenize); the original
        # tokenized the whole file flat and then iterated over single tokens
        sents_tokens = [word_tokenize(s) for s in sent_tokenize(f)]
        print("Number of tokens to be tagged: ",
              len([j for i in sents_tokens for j in i]))

        t0 = time()
        tagger = TrigramTagger(brown.tagged_sents()[:55000])
        t1 = time()
        nltk_train_time = t1 - t0
        print("Time taken by NLTK for training: ", nltk_train_time)

        nltk_tags = []
        t0 = time()
        for sent in sents_tokens:
            nltk_tags.append(tagger.tag(sent))
        t1 = time()
        nltk_tag_time = t1 - t0
        print("Time taken by NLTK to tag text: ", nltk_tag_time)

        t0 = time()
        self.tokenize()
        self.init_tags()
        self.init_words_tags()
        self.init_dict()
        self.calc_Q()
        self.calc_R()
        t1 = time()
        pos_train_time = t1 - t0

        print("Time taken by pos_tagger to train: ", pos_train_time)

        pos_tagger_tags = []
        t0 = time()
        for sent in sents_tokens:
            pos_tagger_tags.append(self.viterbi(sent))
        t1 = time()
        pos_tag_time = t1 - t0
        print("Time taken by pos_tagger to tag: ", pos_tag_time)

        if nltk_train_time < pos_train_time:
            print("Training time of NLTK is less than pos_tagger by: ",
                  abs(nltk_train_time - pos_train_time))
        else:
            print("Training time of pos_tagger is less than NLTK by: ",
                  abs(nltk_train_time - pos_train_time))

        if nltk_tag_time < pos_tag_time:
            print("Tagging time of NLTK is less than pos_tagger by: ",
                  abs(nltk_tag_time - pos_tag_time))
        else:
            print("Tagging time of pos_tagger is less than NLTK by: ",
                  abs(nltk_tag_time - pos_tag_time))

        nltk_tag_count = defaultdict(int)
        for i in nltk_tags:
            for j in i:
                nltk_tag_count[j[1]] += 1

        pos_tag_count = defaultdict(int)
        for i in pos_tagger_tags:
            for j in i:
                pos_tag_count[j[1]] += 1

        print("POS tags generated by NLTK: ")
        for i in nltk_tag_count.items():
            print(i)

        print("POS tags generated by pos_tagger: ")
        for i in pos_tag_count.items():
            print(i)

        print("Number of unique tags generated by NLTK: ",
              len([i for i in nltk_tag_count.keys()]))

        print("Number of unique tags generated by pos_tagger: ",
              len([i for i in pos_tag_count.keys()]))

        print("NLTK failed to tag", nltk_tag_count[None], "tokens")

        print("pos_tagger failed to tag", pos_tag_count[''], "tokens")

        if nltk_tag_count[None] > pos_tag_count['']:
            print("pos_tagger tagged",
                  abs(nltk_tag_count[None] - pos_tag_count['']),
                  "more tokens than NLTK")
        else:
            print("NLTK tagged", abs(nltk_tag_count[None] - pos_tag_count['']),
                  "more tokens than pos_tagger")

        tagged_sents = open("input_tagged.txt", "r").read().splitlines()
        tags = []
        for sent in tagged_sents:
            words = sent.split()
            for word in words:
                m = re.search('(.*)_(.*)', word)
                tags.append(m.group(2))

        n_tags = [j[1] for i in nltk_tags for j in i]
        nltk_count = 0
        for x, y in zip(n_tags, tags):
            if x == y:
                nltk_count += 1

        len_tokens = len([j for i in sents_tokens for j in i])

        print("NLTK accurately tagged", nltk_count, "tokens")
        print("NLTK accuracy score: ", float(nltk_count) / float(len_tokens))

        p_tags = [j[1] for i in pos_tagger_tags for j in i]
        pos_count = 0
        for x, y in zip(p_tags, tags):
            if x == y:
                pos_count += 1

        print("pos_tagger accurately tagged", pos_count, "tokens")
        print("pos_tagger accuracy score: ",
              float(pos_count) / float(len_tokens))

        if nltk_count > pos_count:
            print("NLTK accurately tagged", abs(nltk_count - pos_count),
                  "more tokens than pos_tagger")
        else:
            print("pos_tagger accurately tagged", abs(nltk_count - pos_count),
                  "more tokens than NLTK")
Example No. 20
    wordTaggedSentencesTrain, entitiesTrain = Util.tokenize(tagsTrain)
    wordTaggedSentencesTest, entitiesTest = Util.tokenize(tagsTest)

    posTaggedSentencesTrain = Util.posTag(wordTaggedSentencesTrain)
    posTaggedSentencesTest = Util.posTag(wordTaggedSentencesTest)

    completeTaggedSentencesTrain = Util.addEntitiyTaggs(
        posTaggedSentencesTrain, entitiesTrain)
    completeTaggedSentencesTest = Util.addEntitiyTaggs(posTaggedSentencesTest,
                                                       entitiesTest)

    #Gram Taggers
    unigramTagger = UnigramTagger(train=completeTaggedSentencesTrain)
    bigramTagger = BigramTagger(train=completeTaggedSentencesTrain)
    trigramTagger = TrigramTagger(train=completeTaggedSentencesTrain)

    #Unigram
    nerChunkerUnigram = ClassifierChunker(completeTaggedSentencesTrain,
                                          unigramTagger)
    evalUnigram = nerChunkerUnigram.evaluate2(completeTaggedSentencesTest)
    print(evalUnigram)

    #Bigram
    nerChunkerBigram = ClassifierChunker(completeTaggedSentencesTrain,
                                         bigramTagger)
    evalBigram = nerChunkerBigram.evaluate2(completeTaggedSentencesTest)
    print(evalBigram)

    #Trigram
    nerChunkerTrigram = ClassifierChunker(completeTaggedSentencesTrain,
                                          trigramTagger)
    evalTrigram = nerChunkerTrigram.evaluate2(completeTaggedSentencesTest)
    print(evalTrigram)
Example No. 21
def tagger_default(corpus):
    default_tagger = DefaultTagger('NOUN')
    tagger1 = UnigramTagger(corpus, backoff=default_tagger)
    tagger2 = BigramTagger(corpus, backoff=tagger1)
    tagger3 = TrigramTagger(corpus, backoff=tagger2)
    return tagger3
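
For example (a sketch; the 'NOUN' default suggests the universal tagset, and the slice just keeps training fast):

from nltk.corpus import brown
from nltk.tag import DefaultTagger, UnigramTagger, BigramTagger, TrigramTagger

tagger = tagger_default(brown.tagged_sents(tagset='universal')[:3000])
print(tagger.tag(['The', 'cat', 'sat']))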
Example No. 22
import nltk
from nltk.tag import BigramTagger, TrigramTagger
from nltk.corpus import treebank
training = treebank.tagged_sents()[:2000]
testing = treebank.tagged_sents()[2000:]  # keep the test slice disjoint from training
bigramtag = BigramTagger(training)
print(bigramtag.evaluate(testing))
trigramtag = TrigramTagger(training)
print(trigramtag.evaluate(testing))
Example No. 23
        (r'.*', 'NN')                     # nouns (default) ... 
]
rt = RegexpTagger(patterns)

print(rt.evaluate(test_data))
print(rt.tag(tokens))


## N gram taggers
from nltk.tag import UnigramTagger
from nltk.tag import BigramTagger
from nltk.tag import TrigramTagger

ut = UnigramTagger(train_data)
bt = BigramTagger(train_data)
tt = TrigramTagger(train_data)

print(ut.evaluate(test_data))
print(ut.tag(tokens))

print(bt.evaluate(test_data))
print(bt.tag(tokens))

print(tt.evaluate(test_data))
print(tt.tag(tokens))

def combined_tagger(train_data, taggers, backoff=None):
    for tagger in taggers:
        backoff = tagger(train_data, backoff=backoff)
    return backoff
Example No. 24
    def train(self, sentence_list):
        noun_fallback = DefaultTagger('NN')
        affix_fallback = AffixTagger(sentence_list, backoff=noun_fallback)
        unigram_fallback = UnigramTagger(sentence_list, backoff=affix_fallback)
        bigram_fallback = BigramTagger(sentence_list, backoff=unigram_fallback)
        self.tagger = TrigramTagger(sentence_list, backoff=bigram_fallback)
Example No. 25
from nltk.tag import TrigramTagger
# we are dividing the data into a test and train to evaluate our taggers.
train_data = brown_tagged_sents[:int(len(brown_tagged_sents) * 0.9)]
test_data = brown_tagged_sents[int(len(brown_tagged_sents) * 0.9):]

# Unigram picks the most probable tag for each word
# https://www.nltk.org/api/nltk.tag.html?highlight=postagger#nltk.tag.sequential.UnigramTagger
unigram_tagger = UnigramTagger(train_data, backoff=default_tagger)
print("Unigram Tagger: {}".format(unigram_tagger.evaluate(test_data)))
# Bigram tags based on the current word and the previous one
# https://www.nltk.org/api/nltk.tag.html?highlight=postagger#nltk.tag.sequential.BigramTagger
bigram_tagger = BigramTagger(train_data, backoff=unigram_tagger)
print("Bigram Tagger: {}".format(bigram_tagger.evaluate(test_data)))
# Trigram looks at the current word and the two preceding it
# https://www.nltk.org/api/nltk.tag.html?highlight=postagger#nltk.tag.sequential.TrigramTagger
trigram_tagger = TrigramTagger(train_data, backoff=bigram_tagger)
print("Trigram Tagger: {}".format(trigram_tagger.evaluate(test_data)))

''' What we did here was build three N-gram taggers from a training set
    drawn from the already-tagged Brown corpus.

    We also chained them together, so that whenever one tagger does not
    know what to do it falls back to its N-1 tagger, all the way down to
    the default of tagging everything as NN.


    #######################
    ###  Regexp Tagger  ###
    #######################

    Another option for building our own tagger is to fall back on our
    beloved regular expressions with a RegexpTagger
Example No. 26
# Saving pickle and testing it.
with open('pickles/pos-taggers/unigram_backoff_tagger.pickle', 'wb') as file:
    pickle.dump(ugb_tagger, file)

with open('pickles/pos-taggers/unigram_backoff_tagger.pickle', 'rb') as file:
    pk_tagger = pickle.load(file)

accuracy = pk_tagger.evaluate(test_sents)
print(f"Accuracy of pickled backoff: {accuracy}\n")

# Testing bigram and trigram taggers
bg_tagger = BigramTagger(train_sents)
accuracy = bg_tagger.evaluate(test_sents)
print(f"Accuracy of bigram: {accuracy}\n")

tg_tagger = TrigramTagger(train_sents)
accuracy = tg_tagger.evaluate(test_sents)
print(f"Accuracy of trigram: {accuracy}\n")


def make_backoffs(training, tagger_classes, backoff=None):
    """
        Train and chain a sequence of backoff taggers.
    """
    # Make a tagger using the previous one as a backoff
    for cls in tagger_classes:
        backoff = cls(training, backoff=backoff)
    return backoff


# Testing the function with all 4 taggers
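The snippet is cut off here; a minimal sketch of what that test might look like, assuming the same train_sents/test_sents and nltk.tag imports as above:

from nltk.tag import DefaultTagger, UnigramTagger, BigramTagger, TrigramTagger

all4_tagger = make_backoffs(train_sents,
                            [UnigramTagger, BigramTagger, TrigramTagger],
                            backoff=DefaultTagger('NN'))
accuracy = all4_tagger.evaluate(test_sents)
print(f"Accuracy of the 4-tagger backoff chain: {accuracy}\n")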
Example No. 27
from nltk.corpus import wordnet as wn
from nltk.corpus import treebank
from nltk.tag import DefaultTagger, UnigramTagger, BigramTagger, TrigramTagger
from os.path import isfile, join
from os import listdir
from pprint import pprint
import gensim.downloader as api
import re
import nltk
import os

TEST_PATH = '../test/untagged'
COMMON_WORDS_PATH = '../resources/1-1000.txt'

TRAINING_SENTS = treebank.tagged_sents()
UNIGRAM = UnigramTagger(TRAINING_SENTS, backoff=DefaultTagger('NN'))
BIGRAM = BigramTagger(TRAINING_SENTS, backoff=UNIGRAM)
TRIGRAM = TrigramTagger(TRAINING_SENTS, backoff=BIGRAM)

STOPWORDS = set(nltk.corpus.stopwords.words('english'))
WORD_VECTORS = api.load("glove-wiki-gigaword-100")
TEST_FILES = [f for f in listdir(TEST_PATH) if isfile(join(TEST_PATH, f))]

# Manual list of words to be considered "irrelevant"
IRRELEVANT_WORDS = ["talk", "seminar", "lecture"]

# manually created ontology tree, which is later extended
TREE = {"science": {}, "maths": {}, "engineering": {}, "medicine": {}}

# code to convert POS tags into the right form for lemmatization
# https://stackoverflow.com/questions/25534214/nltk-wordnet-lemmatizer-shouldnt-it-lemmatize-all-inflections-of-a-word
POS_TO_WORDNET = {
    'JJ': wn.ADJ,
Example No. 28
        brill.Template(brill.Word([1, 2, 3])),
        brill.Template(brill.Word([-1]), brill.Word([1])),
    ]
    
    trainer = brill_trainer.BrillTaggerTrainer(initial_tagger, templates, deterministic=True)
    return trainer.train(train_sents, **kwargs)

defaultTagger = DefaultTagger('NN')
initialTagger = backoff_tagger(brown_train_sents, [UnigramTagger, BigramTagger, TrigramTagger], backoff=defaultTagger)
brillTagger = train_brill_tagger(initialTagger, brown_train_sents)

tnt_tagger = tnt.TnT(N=100)
tnt_tagger.train(brown_train_sents)

bigramTagger = BigramTagger(brown_train_sents)
trigramTagger = TrigramTagger(brown_train_sents)

print("------------Recommended Tagger------------")
print(nltk.pos_tag(sent))

print("------------Default Tagger------------")
print(defaultTagger.tag(sent))

print("------------Unigram Tagger Overrode------------")
unigramTagger = UnigramTagger(model={'Pierre': 'NN'})
print(unigramTagger.tag(sent))

print("------------Unigram Tagger Trained------------")
unigramTagger = UnigramTagger(brown_train_sents)
print(unigramTagger.tag(sent))
Example No. 29
# =============================================================================
# 1. run tagger with different corpus size (50% and 100%)
# =============================================================================
# backoff tagger
tag1_eval = dict()
# train with backoff and Brill
tic()
tag1_tagger = DefaultTagger('NO')
tag1_tagger = AffixTagger(train_sents, affix_length=-1, backoff=tag1_tagger)
tag1_tagger = AffixTagger(train_sents, affix_length=-2, backoff=tag1_tagger)
tag1_tagger = AffixTagger(train_sents, affix_length=-3, backoff=tag1_tagger)
tag1_tagger = AffixTagger(train_sents, affix_length=-4, backoff=tag1_tagger)
tag1_tagger = AffixTagger(train_sents, affix_length=-5, backoff=tag1_tagger)
tag1_tagger = UnigramTagger(train_sents, cutoff=3, backoff=tag1_tagger)
tag1_tagger = BigramTagger(train_sents, backoff=tag1_tagger)
tag1_tagger = TrigramTagger(train_sents, backoff=tag1_tagger)
tag1b_tagger = train_brill_tagger(tag1_tagger,
                                  train_sents,
                                  True,
                                  max_rules=100)
tag1_eval['train_time'] = toc()
# test
tic()
tag1_eval['test_accuracy'] = tag1b_tagger.evaluate(val_sents)
tag1_eval['test_time'] = toc()
# display results
display_training_metrics(tag1_eval)
"""
# =============================================================================
# finalise a classification-based tagger
# =============================================================================
Example No. 30
import pickle
import string
from os import listdir
from nltk.corpus import brown
from nltk.corpus import treebank
from nltk.tag import DefaultTagger
from nltk.tag import UnigramTagger, BigramTagger, TrigramTagger
from os.path import isfile, join, dirname


def backoff_tagger(training, tagger_classes, backoff=None):
    for cls in tagger_classes:
        backoff = cls(training, backoff=backoff)
    return backoff


#train tagger once for entire batch of files to speed things up a little
training = treebank.tagged_sents()
#unigramTagger = UnigramTagger(training, DefaultTagger('NN'))
#bigramtagger = BigramTagger(training)
#trigramTagger = TrigramTagger(training)
#backbitagger = BigramTagger(training, backoff=unigramTagger)
#tagger = TrigramTagger(training, backoff=backbitagger)

tagger = TrigramTagger(training)

#tagger = backoff_tagger(training, [UnigramTagger, BigramTagger, TrigramTagger], backoff=DefaultTagger('NN'))

with open("postagger.pickle", "wb") as saver:
    pickle.dump(tagger, saver)
print("done")