示例#1
0
def getTrainedTagger():
    """Train and return a backoff chain of POS taggers on the Brown corpus.

    Chain, most to least specific: bigram -> unigram -> affix -> regexp
    heuristics -> default 'NN'.  All words are lower-cased before training.
    """
    train = brown.tagged_sents(simplify_tags=True)
    # Lower-case every word so the trained taggers are case-insensitive.
    newTrain = [[(word.lower(), tag) for word, tag in sen] for sen in train]
    nn_tagger = nltk.DefaultTagger('NN')
    regexp_tagger = nltk.RegexpTagger(
        [
            # Fixed: the decimal point must be escaped; bare '.' matched
            # any character (e.g. '1x5' was tagged CD).
            (r'^-?[0-9]+(\.[0-9]+)?$', 'CD'),  # cardinal numbers
            (r'(The|the|A|a|An|an)$', 'AT'),  # articles
            (r'.*able$', 'JJ'),  # adjectives
            (r'.*ness$', 'NN'),  # nouns formed from adjectives
            (r'.*ly$', 'RB'),  # adverbs
            (r'.*s$', 'NNS'),  # plural nouns
            (r'.*ing$', 'VBG'),  # gerunds
            (r'.*ed$', 'VBD'),  # past tense verbs
            (r'.*', 'NN')  # nouns (default)
        ],
        backoff=nn_tagger)
    at2 = nltk.AffixTagger(newTrain, backoff=regexp_tagger)
    ut3 = nltk.UnigramTagger(newTrain, backoff=at2)
    ct2 = nltk.NgramTagger(2, newTrain, backoff=ut3)
    return ct2
示例#2
0
    def __init__(self):
        """Build the fast POS tagger (bigram -> unigram -> regexp backoff)
        and the semi-CFG used to merge adjacent tags."""
        brown_train = brown.tagged_sents(categories=['news'])
        regexp_tagger = nltk.RegexpTagger([
            # Fixed: escaped the decimal point (bare '.' matched any char).
            (r'^-?[0-9]+(\.[0-9]+)?$', 'CD'),
            (r'(-|:|;)$', ':'),
            # Fixed: r'\'*$' (zero or more quotes) matched *every* token,
            # making all later patterns unreachable; require at least one.
            (r'\'+$', 'MD'),
            (r'(The|the|A|a|An|an)$', 'AT'),
            (r'.*able$', 'JJ'),
            (r'^[A-Z].*$', 'NNP'),
            (r'.*ness$', 'NN'),
            (r'.*ly$', 'RB'), (r'.*s$', 'NNS'),
            (r'.*ing$', 'VBG'),
            (r'.*ed$', 'VBD'), (r'.*', 'NN')])
        self.unigram_tagger = nltk.UnigramTagger(brown_train,
                                                 backoff=regexp_tagger)
        self.bigram_tagger = nltk.BigramTagger(brown_train,
                                               backoff=self.unigram_tagger)

        # This is our semi-CFG; extend it according to your own needs.
        self.cfg = {
            "NNP+NNP": "NNP",
            "CD+CD": "CD",
            "NN+NN": "NNI",
            "NNI+NN": "NNI",
            "JJ+JJ": "JJ",
            "JJ+NN": "NNI",
            "VBN+NNS": "NNP",
        }

        # NOTE(review): removed a loop that did STOP_WORDS[i] = word — a
        # no-op (presumably meant word.lower(); confirm before restoring).
示例#3
0
def test_tag():
    """train test"""
    tagger = nltk.UnigramTagger(brown_tagged_sents)

    tagged = tagger.tag(brown_sents[2007])
    # Report the tagger's accuracy alongside the sample tagging.
    print(tagger.evaluate(brown_tagged_sents), tagged)
示例#4
0
文件: reader.py 项目: alex-ten/MSC
def _build_big_vocab(filename, tagset='universal'):
    """Build a vocabulary dict mapping word -> (id, frequency, POS tag).

    IDs are assigned by descending frequency (alphabetical tie-break).
    POS tags come from a unigram tagger trained on the Treebank sample;
    words unknown to the tagger get the tag 'UNK'.
    """
    # WORD IDS AND FREQUENCIES
    # ========================
    # Long list of word sequences separated by <eos>
    data = _read_words(filename)
    # Tallies of unique words, e.g. {'<unk>': 4794, 'the': 4529, '<eos>': 3761}
    counter_words = collections.Counter(data)
    # Most frequent first; x[0] is a backup criterion when counts are equal.
    count_pairs = sorted(counter_words.items(), key=lambda x: (-x[1], x[0]))
    words, _ = list(zip(*count_pairs))
    # Assign a unique integer ID to each word.
    word_to_id = dict(zip(words, range(len(words))))

    # POS TAGS
    # ========================
    ptb_sents = nltk.corpus.treebank.tagged_sents()
    uni_tag = nltk.UnigramTagger(ptb_sents)
    # NOTE(review): a Counter of tags over the *whole* corpus was built here
    # but never used — dropped it along with the expensive full-corpus
    # uni_tag.tag(data) call that fed it.

    word_to_id_freq_pos = {}
    for word, word_id in word_to_id.items():
        pos = uni_tag.tag([word])[0][1]
        if pos is None:
            pos = 'UNK'
        word_to_id_freq_pos[word] = (word_id, counter_words[word], pos)

    return word_to_id_freq_pos
 def __init__(self):
     """Train a bigram -> unigram -> default ('N') tagger on mac_morpho,
     lower-casing every word and skipping the first 100 sentences."""
     sents = [[(w.lower(), t) for (w, t) in sent]
              for sent in mac_morpho.tagged_sents() if sent]
     base = nltk.DefaultTagger('N')
     unigram = nltk.UnigramTagger(sents[100:], backoff=base)
     self.tagger = nltk.BigramTagger(sents[100:], backoff=unigram)
示例#6
0
def tag_it(train, test, regex_pattern, print_errors=False):
    """
    Use tagger hierarchy approach shown in the lecture
    I actually tried some variations and different orders, e.g. regex at the beginning.
    But the below order gave me the best results
    :param train:
    :param test:
    :param regex_pattern:
    :param print_errors:
    :return:
    """
    # Build the chain bottom-up: each tagger backs off to the previous one.
    chain = nltk.DefaultTagger('NOUN')
    chain = nltk.tag.RegexpTagger(regex_pattern, backoff=chain)
    chain = nltk.UnigramTagger(train, backoff=chain)
    chain = nltk.BigramTagger(train, backoff=chain)
    trigram_tagger = nltk.TrigramTagger(train, backoff=chain)

    print(trigram_tagger.evaluate(test))

    # print wrongly classified values
    if print_errors:
        posts = nps_chat.posts()
        predicted = trigram_tagger.tag_sents(posts[((len(posts) * 9) // 10):])
        cfd = nltk.ConditionalFreqDist(
            (word, tag)
            for i, sent in enumerate(test)
            for j, (word, tag) in enumerate(sent)
            if tag != predicted[i][j][1])

        for word, dist in cfd.items():
            for tag, count in dist.items():
                print(word, tag, count)
示例#7
0
def ngramTagger(train_sents, n=2, defaultTag='NN'):
    """Build an n-gram backoff tagger chain trained on train_sents.

    n <= 0 returns just the default tagger; n == 1 adds a unigram tagger,
    n == 2 a bigram tagger, and n >= 3 caps out at a trigram tagger.
    Collapses the original copy-pasted if/elif branches into one loop.
    """
    tagger = nltk.DefaultTagger(defaultTag)
    levels = (nltk.UnigramTagger, nltk.BigramTagger, nltk.TrigramTagger)
    # Stack increasingly specific taggers, each backing off to the previous.
    for cls in levels[:min(max(n, 0), 3)]:
        tagger = cls(train_sents, backoff=tagger, verbose=True)
    return tagger
示例#8
0
    def test_POS_tag_tokenize_words_simple_test(self):
        """Tag a simple tokenized phrase and check the expected tag sequence."""
        training_sents = brown.tagged_sents()

        patterns = [ # for regexp tagger; order matters — first match wins
            (r'.*ing$', 'VBG'),
            (r'.*ed$', 'VBD'),
            (r'.*es$', 'VBZ'),
            (r'.*ould$', 'MD'),
            (r'.*\'s$', 'POS'),
            (r'.*s$', 'NNS'),
            (r'(The|the|A|a|An|an)$', 'AT'),
            (r'.*able$', 'JJ'),
            (r'.*ly$', 'RB'),
            # Removed a duplicate (r'.*s$', 'NNS') entry: it appeared after
            # the identical pattern above and was therefore unreachable.
            (r'.*', 'NN')]

        # Trigram -> bigram -> unigram -> regexp -> default 'NN' chain.
        default_tagger = nltk.DefaultTagger('NN')
        regexp_tagger = nltk.RegexpTagger(patterns, backoff=default_tagger)
        unigram_tagger = nltk.UnigramTagger(training_sents, backoff=regexp_tagger)
        bigram_tagger = nltk.BigramTagger(training_sents, backoff=unigram_tagger)
        trigram_tagger = nltk.TrigramTagger(training_sents, backoff=bigram_tagger)

        final_tagger = trigram_tagger

        self.assertEqual(
            [[('who', 'WPS'),
            ('are', 'BER'),
            ('your', 'PP$'),
            ('friend', 'NN'),
            ("'s", 'POS'),
            ('here', 'RB'),
            ('?', '.')]],
            POS_tag_tokenized_phrases(
                [ ['who', 'are', 'your', 'friend', "'s", 'here', '?'] ],
                final_tagger))
def create_tagger():
    """Train a tagger from the Brown Corpus. This should not be called very
    often; only in the event that the tagger pickle wasn't found."""
    # print statements converted to function calls (Python 3 compatible).
    print("Building tagger...")
    train_sents = brown.tagged_sents()

    # These regexes were lifted from the NLTK book tagger chapter.
    t0 = nltk.RegexpTagger(
        [(r'^-?[0-9]+(\.[0-9]+)?$', 'CD'), # cardinal numbers (escaped '.')
         (r'(The|the|A|a|An|an)$', 'AT'), # articles
         (r'.*able$', 'JJ'),              # adjectives
         (r'.*ness$', 'NN'),              # nouns formed from adjectives
         (r'.*ly$', 'RB'),                # adverbs
         (r'.*s$', 'NNS'),                # plural nouns
         (r'.*ing$', 'VBG'),              # gerunds
         (r'.*ed$', 'VBD'),               # past tense verbs
         (r'.*', 'NN')                    # nouns (default)
        ])
    print("got t0")

    t1 = nltk.UnigramTagger(train_sents, backoff=t0)
    print("got t1")

    t2 = nltk.BigramTagger(train_sents, backoff=t1)
    print("got t2")

    t3 = nltk.TrigramTagger(train_sents, backoff=t2)
    print("Built tagger!")
    return t3
示例#10
0
def get_trained_unigram_tagger():
    """Train a unigram tagger on the station-extraction training set.

    Each training token is a (pos, classification) pair read from the
    JSON training file under ROOT_DIR.
    """
    path = (ROOT_DIR +
            '/src/assets/training_Sets/stationsExtractionTrainingSet.json')
    # 'with' ensures the file handle is closed (the original leaked it).
    with open(path) as f:
        train_data_input = json.load(f)
    train_data = [[(element["pos"], element["classification"])
                   for element in sentence] for sentence in train_data_input]
    return nltk.UnigramTagger(train_data)
示例#11
0
def nltk_tagger(brown_words, brown_tags, brown_dev_words):
    """Tag the development sentences with a trigram backoff chain trained
    on the given space-separated word/tag sentence strings; each output
    sentence is a 'word/TAG ...' string terminated by a newline."""
    training = []
    for word_line, tag_line in zip(brown_words, brown_tags):
        pairs = [(word, tag)
                 for word, tag in zip(word_line.split(' '), tag_line.split(' '))]

        # Strip the two leading and two trailing (boundary) tokens.
        del pairs[0]
        del pairs[0]
        del pairs[-1]
        del pairs[-1]

        training.append(pairs)

    t0 = nltk.DefaultTagger('NN')
    t1 = nltk.UnigramTagger(training, backoff=t0)
    t2 = nltk.BigramTagger(training, backoff=t1)
    t3 = nltk.TrigramTagger(training, backoff=t2)

    # Tag each dev sentence and join word/tag pairs with '/'.
    tagged = []
    for sentence in brown_dev_words:
        joined = ' '.join(word + '/' + tag
                          for word, tag in t3.tag(sentence))
        tagged.append(joined + '\n')
    return tagged
def brill_tagger(tagged_sentences):
    """Train and return a Brill tagger seeded by a unigram tagger that
    backs off to regexp heuristics."""
    wordings = [
        (r'^-?[0-9]+(.[0-9]+)?$', 'CD'),
        (r'(The|the|A|a|An|an)$', 'AT'),
        (r'.*able$', 'JJ'),
        (r'.*ness$', 'NN'),
        (r'.*ly$', 'NN'),
        (r'.*ing$', 'VBG'),
        (r'.*ed$', 'VBD'),
        (r'.*ould$', 'MD'),
        (r'.*ment$', 'NN'),
        (r'.*ful$', 'JJ'),
        (r'.*ious$', 'JJ'),
        (r'.*ble$', 'JJ'),
        (r'.*ic$', 'JJ'),
        (r'.*ive$', 'JJ'),
        (r'.*est$', 'JJ'),
        # Removed a second (r'.*ould$', 'MD') entry — identical to the one
        # above, so it could never match.
    ]

    # the part of code is taken as a reference from http://stackoverflow.com/questions/14802442/how-to-use-a-regex-backoff-tagger-in-python-nltk-to-override-nns
    # here we are using the unigram and regex taggers as backoffs for brill tagger
    regex_tagger = nltk.tag.RegexpTagger(wordings)
    unigram_tagger = nltk.UnigramTagger(tagged_sentences, backoff=regex_tagger)
    model = nltk.tag.brill.brill24()
    brill_trainer = nltk.tag.brill_trainer.BrillTaggerTrainer(
        unigram_tagger, model)
    brill_tagger = brill_trainer.train(tagged_sentences)
    return brill_tagger
示例#13
0
 def word_tagger(self):
     """POS-tag self.text in place with a bigram -> unigram -> 'NN' chain
     trained on self.training_sents."""
     backoff = nltk.DefaultTagger('NN')
     backoff = nltk.UnigramTagger(self.training_sents, backoff=backoff)
     tagger = nltk.BigramTagger(self.training_sents, backoff=backoff)
     self.text = tagger.tag(self.text)
示例#14
0
def dump(config):
    """Build (or load from disk) a pickled POS tagger and return it.

    Trains unigram/bigram/trigram backoff taggers on the Brown corpus
    (universal tagset), keeps whichever scores best on a held-out 10%
    split, caches it at <tagger_dir>/tagger.pkl, and returns the
    unpickled tagger.  (Docstring corrected: the original described a
    word-embedding routine.)

    Args:
        config: an instance of TaggerConfiguration
    """
    tagger_dir = config.tagger_dir
    tagger_name = os.path.join(tagger_dir, "tagger.pkl")
    os.makedirs(tagger_dir, exist_ok=True)
    if not os.path.isfile(tagger_name):
        brown_tagged_sents = brown.tagged_sents(tagset='universal')
        size = int(len(brown_tagged_sents) * 0.9)
        train_sents = brown_tagged_sents[:size]
        test_sents = brown_tagged_sents[size:]
        t0 = nltk.DefaultTagger('X')
        t1 = nltk.UnigramTagger(train_sents, backoff=t0)
        t2 = nltk.BigramTagger(train_sents, backoff=t1)
        t3 = nltk.TrigramTagger(train_sents, backoff=t2)
        scores = [[t1.evaluate(test_sents), t1], [t2.evaluate(test_sents), t2],
                  [t3.evaluate(test_sents), t3]]
        best_score, best_tagger = max(scores, key=lambda x: x[0])
        print("Finished building POS tagger {0:.2f}%".format(best_score * 100))
        with open(tagger_name, 'wb') as f:
            pkl.dump(best_tagger, f)
    with open(tagger_name, 'rb') as f:
        return pkl.load(f)
    # Removed an unreachable print after the return; it also referenced
    # undefined names (ids_name, distances_name).
示例#15
0
 def __init__(self, train_sents):
     """Train a trigram -> bigram -> unigram chunk tagger on the
     (tag, chunk) pairs extracted from the training sentences."""
     data = [[(tag, chunk) for _, tag, chunk in sent] for sent in train_sents]
     #print(data)
     chain = nltk.UnigramTagger(data)
     chain = nltk.tag.BigramTagger(data, backoff=chain)
     self.tagger = nltk.tag.TrigramTagger(data, backoff=chain)
     # NOTE: accuracy is measured on the training data itself (optimistic).
     print(self.tagger.evaluate(data))
示例#16
0
文件: hw4.py 项目: Rose-Lin/325
def tagging_system(text, name):
    """Tag `text` with a unigram tagger, falling back to an HMM tagger for
    unknown words; optionally prints an accuracy analysis for the two
    personal test files."""
    bts = brown.tagged_sents(categories="news", tagset="universal")
    # train hmmtagr on all bts and use hmmtagger to evaluate the result of unigramTagger
    hmmTagr = hmm.HiddenMarkovModelTagger.train(bts)
    uTagr = nltk.UnigramTagger(bts)
    tsent = nltk.word_tokenize(text)
    tagged_sent = [uTagr.tag(tsent)]
    hmm_tagged_sent = hmmTagr.tag(tsent)
    # Fill every None tag with the HMM tagger's choice
    # (fixed: use 'is None' rather than '== None').
    for i in range(len(tagged_sent[0])):
        if tagged_sent[0][i][1] is None:
            tagged_sent[0][i] = hmm_tagged_sent[i]
    # print out the accuracy and the tagged text
    if name == 'my_test.txt' or name == 'my_test1.txt':
        print(
            "-------Below is the accuracy analysis of my tagging system on text : {} ------"
            .format(name))
        print("the accuracy of {} is :{}".format(
            name, hmmTagr.evaluate(tagged_sent)))
        print(
            "some mistaken tags and what it should be based on golden standard: "
        )
        for i in range(len(tagged_sent[0])):
            if not tagged_sent[0][i][1] == hmm_tagged_sent[i][1]:
                print("{} should be {}".format(tagged_sent[0][i],
                                               hmm_tagged_sent[i]))
    print("------Below is the outcome of my tagging system on text: {}------".
          format(name))
    print(tagged_sent[0])
示例#17
0
    def __init__(self):
        """Initialization method of :class:`TopicExtractor` class.
        """

        # This is our fast Part of Speech tagger: bigram -> unigram ->
        # regexp heuristics.
        #############################################################################
        brown_train = brown.tagged_sents(categories='news')
        regexp_tagger = nltk.RegexpTagger([
            # Fixed: escaped the decimal point (bare '.' matched any char).
            (r'^-?[0-9]+(\.[0-9]+)?$', 'CD'),
            (r'(-|:|;)$', ':'),
            # Fixed: r'\'*$' (zero or more quotes) matched *every* token,
            # making all later patterns unreachable; require at least one.
            (r'\'+$', 'MD'),
            (r'(The|the|A|a|An|an)$', 'AT'),
            (r'.*able$', 'JJ'),
            (r'^[A-Z].*$', 'NNP'),
            (r'.*ness$', 'NN'),
            (r'.*ly$', 'RB'), (r'.*s$', 'NNS'),
            (r'.*ing$', 'VBG'),
            (r'.*ed$', 'VBD'), (r'.*', 'NN')])
        unigram_tagger = nltk.UnigramTagger(brown_train, backoff=regexp_tagger)
        self.bigram_tagger = nltk.BigramTagger(brown_train,
                                               backoff=unigram_tagger)
        #############################################################################

        # This is our semi-CFG; Extend it according to your own needs
        #############################################################################
        self.cfg = {}
        self.cfg["NNP+NNP"] = "NNP"
        self.cfg["NN+NN"] = "NNI"
        self.cfg["NNI+NN"] = "NNI"
        self.cfg["JJ+JJ"] = "JJ"
        self.cfg["JJ+NN"] = "NNI"
示例#18
0
def nltk_tagger(brown):
    """Tag each sentence in `brown` with a trigram -> bigram -> 'NOUN'
    chain trained on the NLTK Brown corpus (universal tagset).

    Returns a list of sentences, each a list of 'word/TAG' strings with
    the boundary tokens trimmed via the [2:-1] slice.
    """
    tagged = []

    training = nltkbrown.tagged_sents(tagset = 'universal')

    # Removed three taggers that were trained without backoff and then
    # immediately shadowed or never used — dead (and expensive) code.
    default_tagger = nltk.DefaultTagger('NOUN')
    bigram_tagger = nltk.BigramTagger(training, backoff=default_tagger)
    trigram_tagger = nltk.TrigramTagger(training, backoff=bigram_tagger)

    # tag sentences
    tagged_sentence = []
    for sentence in brown:
        tags = trigram_tagger.tag(sentence)
        tagged_sentence.append(tags)

    for sentence in tagged_sentence:
        # NOTE(review): asymmetric slice [2:-1] preserved as-is — it drops
        # two leading but only one trailing token; confirm intent.
        sentence = sentence[2:-1]
        temp = []
        for tup in sentence:
            wordtag = tup[0] + '/' + tup[1]
            temp.append(wordtag)
        tagged.append(temp)

    return tagged
示例#19
0
def _build_tagger():
    """Lazily initialize the module-level 4-gram POS tagger, loading it
    from disk when a cached copy exists and training + saving otherwise."""
    global tagger

    # Already built during this process — nothing to do.
    # (Fixed: identity comparison 'is not None' instead of '!= None'.)
    if tagger is not None:
        return

    path = Path(tagger_path)  # renamed: 'file' shadowed the builtin
    if path.is_file():
        tagger = object_io.read_object(tagger_path)
    else:
        print('{} - Building train data...'.format(datetime.now()))

        dataset = nltk.corpus.floresta.tagged_sents() + \
                  nltk.corpus.mac_morpho.tagged_sents()
        traindata = [[(w, _simplify_tag(t)) for (w, t) in sent]
                     for sent in dataset]

        print('{} - Training POS tagging model...'.format(datetime.now()))

        # 4-gram -> trigram -> bigram -> unigram -> default 'NOUN' chain.
        tagger = nltk.NgramTagger(
            4,
            traindata,
            backoff=nltk.TrigramTagger(
                traindata,
                backoff=nltk.BigramTagger(
                    traindata,
                    backoff=nltk.UnigramTagger(
                        traindata, backoff=nltk.DefaultTagger('NOUN')))))

        print('{} - Saving tagger object...'.format(datetime.now()))

        object_io.save_object(tagger, tagger_path)
示例#20
0
def main():
    """Evaluate several tagger backoff configurations on Brown 'news'."""
    tagged = brown.tagged_sents(categories='news')
    plain = brown.sents(categories='news')
    cut = int(len(tagged) * 0.9)
    train_sents = tagged[:cut]
    test_sents = tagged[cut:]
    unseen_sents = plain[cut + 117]  # a single held-out sentence

    # unigram only
    evaluate_tagger(nltk.UnigramTagger(train_sents, verbose=True),
                    test_sents, unseen_sents)

    # previous-tag only
    evaluate_tagger(PreviousTagTagger(train_sents, verbose=True),
                    test_sents, unseen_sents)

    # shared default tagger at the bottom of every chain below
    default = nltk.DefaultTagger('NN')

    # bigram -> unigram -> default
    uni = nltk.UnigramTagger(train_sents, backoff=default)
    bi = nltk.BigramTagger(train_sents, backoff=uni)
    evaluate_tagger(bi, test_sents, unseen_sents)

    # trigram -> bigram -> unigram -> default
    uni = nltk.UnigramTagger(train_sents, backoff=default)
    bi = nltk.BigramTagger(train_sents, backoff=uni)
    tri = nltk.TrigramTagger(train_sents, backoff=bi)
    evaluate_tagger(tri, test_sents, unseen_sents)

    # bigram -> previous -> default
    prev = PreviousTagTagger(train_sents, backoff=default)
    bi = nltk.BigramTagger(train_sents, backoff=prev)
    evaluate_tagger(bi, test_sents, unseen_sents)

    # bigram -> unigram -> previous -> default
    prev = PreviousTagTagger(train_sents, backoff=default)
    uni = nltk.UnigramTagger(train_sents, backoff=prev)
    bi = nltk.BigramTagger(train_sents, backoff=uni)
    evaluate_tagger(bi, test_sents, unseen_sents)

    # trigram -> bigram -> unigram -> previous -> default
    prev = PreviousTagTagger(train_sents, backoff=default)
    uni = nltk.UnigramTagger(train_sents, backoff=prev)
    bi = nltk.BigramTagger(train_sents, backoff=uni)
    tri = nltk.TrigramTagger(train_sents, backoff=bi)
    evaluate_tagger(tri, test_sents, unseen_sents)
示例#21
0
def train_and_test_tagger():
    """Train a unigram tagger on 90% of Brown 'news' and print its
    accuracy on the remaining 10%."""
    from nltk.corpus import brown
    brown_tagged_sents = brown.tagged_sents(categories="news")
    size = int(len(brown_tagged_sents) * 0.9)
    train_sents = brown_tagged_sents[:size]
    test_sents = brown_tagged_sents[size:]
    unigram_tagger = nltk.UnigramTagger(train_sents)
    # print statement -> function call so the snippet runs under Python 3.
    print(unigram_tagger.evaluate(test_sents))
示例#22
0
def bitagger_train(train_sents, backoff=False):
    """Train a bigram tagger; when `backoff` is truthy, back it off to a
    unigram tagger and a default 'NN' tagger."""
    # Idiomatic truth test instead of '== True'.
    if backoff:
        t0 = nltk.DefaultTagger('NN')
        t1 = nltk.UnigramTagger(train_sents, backoff=t0)
        t2 = nltk.BigramTagger(train_sents, backoff=t1)
    else:
        t2 = nltk.BigramTagger(train_sents)
    return t2
示例#23
0
def get_pos_tagger(training, tagger='Perceptron'):
    """Train and return a POS tagger of the requested kind.

    `tagger` selects 'Perceptron', 'Unigram' or 'Bigram'.  Words are
    lower-cased and tags simplified before training.
    """
    training = [[(w.lower(), simplify_tag(t)) for (w, t) in sent]
                for sent in training if sent]

    if tagger == 'Perceptron':
        tagger = nltk.tag.PerceptronTagger(load=False)
        tagger.train(training)
    else:
        tagger0 = nltk.DefaultTagger('n')

        if tagger == 'Unigram':
            # Bug fix: the trained tagger was previously assigned only to
            # tagger1, so the function returned the *string* 'Unigram'.
            tagger = nltk.UnigramTagger(training, backoff=tagger0)
        elif tagger == 'Bigram':
            tagger1 = nltk.UnigramTagger(training, backoff=tagger0)
            tagger = nltk.BigramTagger(training, backoff=tagger1)

    return tagger
def ngramTagger(train_sents, n=0, defaultTag='NN'):
    """Build an n-gram backoff tagger chain trained on train_sents.

    n <= 0 returns just the default tagger; n == 1 adds a unigram tagger,
    n == 2 a bigram tagger, and n >= 3 caps out at a trigram tagger.
    Collapses the original copy-pasted if/elif branches into one loop.
    """
    tagger = nltk.DefaultTagger(defaultTag)
    levels = (nltk.UnigramTagger, nltk.BigramTagger, nltk.TrigramTagger)
    # Stack increasingly specific taggers, each backing off to the previous.
    for cls in levels[:min(max(n, 0), 3)]:
        tagger = cls(train_sents, backoff=tagger)
    return tagger
def train_tagger(train_sents):
    """Train and return a tagger using train_sents.

    The default tag is the single most frequent tag in the training data.
    """
    all_tags = (t for sent in train_sents for (w, t) in sent)
    base = nltk.DefaultTagger(nltk.FreqDist(all_tags).max())
    unigram = nltk.UnigramTagger(train_sents, backoff=base)
    return nltk.BigramTagger(train_sents, backoff=unigram)
 def __init__(self, train_sents, testdata):
     """Train a trigram -> bigram -> unigram chunk tagger on (tag, chunk)
     pairs and print its accuracy on the held-out test data."""
     train_data = [[(tag, chunk) for _, tag, chunk in sent]
                   for sent in train_sents]
     test_data = [[(tag, chunk) for _, tag, chunk in sent]
                  for sent in testdata]
     chain = nltk.UnigramTagger(
         train_data)  #nltk.NaiveBayesClassifier.train(train_data)
     chain = nltk.tag.BigramTagger(train_data, backoff=chain)
     self.tagger = nltk.tag.TrigramTagger(train_data, backoff=chain)
     print(self.tagger.evaluate(test_data))
示例#27
0
def trainTagger():
    """Build a lookup (unigram) tagger from the 15,000 most frequent words
    of Brown 'news', each mapped to its most likely tag."""
    freq = nltk.FreqDist(brown.words(categories='news'))
    cond = nltk.ConditionalFreqDist(brown.tagged_words(categories='news'))
    likely_tags = {word: cond[word].max()
                   for word, _ in freq.most_common(15000)}
    return nltk.UnigramTagger(model=likely_tags)
示例#28
0
def performance(cfd, wordlist):
    """Accuracy over Brown 'news' of a lookup tagger built from `wordlist`
    (most likely tag per word, backing off to 'NN')."""
    model = {word: cfd[word].max() for word in wordlist}
    baseline_tagger = nltk.UnigramTagger(
        model=model, backoff=nltk.DefaultTagger('NN'))
    return baseline_tagger.evaluate(brown.tagged_sents(categories='news'))
示例#29
0
 def create_trainer(self):
     """Train a trigram -> bigram -> unigram -> 'NN' tagger chain on
     self.train_sentences and pickle it to 't.pkl'."""
     t0 = nltk.DefaultTagger('NN')
     t1 = nltk.UnigramTagger(self.train_sentences, backoff=t0)  #
     t2 = nltk.BigramTagger(self.train_sentences, backoff=t1)
     t3 = nltk.TrigramTagger(self.train_sentences, backoff=t2)
     # 'with' guarantees the file is closed even if dump() raises.
     with open('t.pkl', 'wb') as output:
         dump(t3, output, -1)
示例#30
0
def tokenize():
    """Train and return a unigram tagger (backoff 'n') on the floresta
    corpus, lower-cased with simplified tags, skipping the first 100
    sentences."""
    tsents = floresta.tagged_sents()
    tsents = [[(w.lower(), simplify_tag(t)) for (w, t) in sent]
              for sent in tsents if sent]
    train = tsents[100:]
    # Removed the unused local 'test = tsents[:100]'.
    tagger0 = nltk.DefaultTagger('n')
    tagger1 = nltk.UnigramTagger(train, backoff=tagger0)
    return tagger1