Example #1
 def trainUniTnT(self):
     """train unigram and tnt seperatly without DefaultTagger"""
     self.split_into_folds()
     for k in range(1, (self.folds + 1)):
         train_sents = sum(self.foldlist[: (self.folds - 1)], [])
         tnt_tagger = tnt.TnT(N=100)
         tnt_tagger.train(train_sents)
         print(str(k) + " fold: tnt trained")
         unigram = UnigramTagger(train_sents)
         print(str(k) + " fold: unigram trained")
         to_tag = [untag(i) for i in self.foldlist[self.folds - 1]]
         self.tnt_tagged += tnt_tagger.tag_sents(to_tag)
         self.uni_tagged += unigram.tag_sents(to_tag)
         self.org_tagged += self.foldlist[self.folds - 1]
         self.foldlist = [self.foldlist[self.folds - 1]] + self.foldlist[: (self.folds - 1)]
     self.tnt = tnt_tagger
     self.unigram = unigram
     self.tnt_avg_acc = accuracy(sum(self.org_tagged, []), sum(self.tnt_tagged, []))
     self.uni_avg_acc = accuracy(sum(self.org_tagged, []), sum(self.uni_tagged, []))
     print("Accuracy of concatenated tnt-tagged sentences: ", self.tnt_avg_acc)
     print("Accuracy of concatenated unigram-tagged sentences: ", self.uni_avg_acc)
     (self.tnt_tagprecision, self.tnt_tagrecall) = self.tagprecision_recall(
         tnt_tagger, self.tnt_tagged, self.org_tagged
     )
     (self.unigram_tagprecision, self.unigram_tagrecall) = self.tagprecision_recall(
         unigram, self.uni_tagged, self.org_tagged
     )
     # reset these values so that trainRegexp starts from the initial values
     self.org_tagged = []
     self.foldlist = []
     for i in range(1, self.folds + 1):
         self.foldlist.append(self.create_fold(i))
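The only subtle step in Example #1 is the fold rotation on the loop's last line; a minimal standalone sketch of that rotation, with plain integer lists standing in for the tagged-sentence folds:

folds = [[1], [2], [3], [4]]                # stand-ins for self.foldlist
k = len(folds)
for _ in range(k):
    train = sum(folds[:k - 1], [])          # concatenate all but the last fold
    held_out = folds[k - 1]                 # the last fold is the test fold
    print(train, held_out)
    folds = [folds[k - 1]] + folds[:k - 1]  # rotate so each fold is held out exactly once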
Example #2
def pos_tag(pos_type, tokenized_sent):
	if pos_type == 'unigram':
		brown_train = pickle.load(open('res/brown_train.pkl', 'rb'))
		unigram_tagger = UnigramTagger(brown_train)
		return unigram_tagger.tag(tokenized_sent)
	elif pos_type == 'max_pos':
		return nltk.pos_tag(tokenized_sent)		
Example #3
 def tag_unigrams_by_topic(self, dict_of_sentences_by_topic):
     tagged_unigrams_by_topic = {}
     train_sents = mac_morpho.tagged_sents()[:5000]
     tagger = UnigramTagger(train_sents)
     for k, v in dict_of_sentences_by_topic.items():
         tagged_unigrams_by_topic[k] = tagger.tag_sents(dict_of_sentences_by_topic[k])  # tag_sents replaces the removed batch_tag API
     return tagged_unigrams_by_topic
Example #4
	def tag_words(self, words, sents):
		train_sents = treebank.tagged_sents()
		tagger = UnigramTagger(train_sents)
		test_sents = tagger.tag(sents[0])
		# test_sents = treebank.tagged_sents()[3000:]
		# print treebank.tagged_sents()[1:]
		# print "accuracy: " + str(self._tagger.evaluate(test_sents))
		# print self._tagger.tag(words)
		# print test_sents
		print(tagger.evaluate(test_sents))
Example #5
def baseline(tagged_sentences):
    from nltk.tag import UnigramTagger
    from nltk.tag import DefaultTagger
    from collections import Counter

    # lowercase everything
    # remove all instances of non-universal tags for proper comparison with
    # the other methods
    new_tagged_sentences = []
    for sent in tagged_sentences:
        sent = [(x[0].lower(), x[1]) for x in sent]
        sent = [x for x in sent if x[1] in _UNI]
        new_tagged_sentences.append(sent)
    tagged_sentences = new_tagged_sentences

    # size of corpus
    corpus_size = sum([len(sent) for sent in tagged_sentences])
    print('Corpus size: {} docs'.format(len(tagged_sentences)))
    print('Corpus size: {} tokens'.format(corpus_size))
    
    # train/test split
    test_pct = 0.3
    test_len = int(len(tagged_sentences) * test_pct)
    test_idx = len(tagged_sentences) - test_len
    train_set = tagged_sentences[:test_idx]
    test_set = tagged_sentences[test_idx:]
    print('Train set: {} docs'.format(len(train_set)))
    print('Test set: {} docs'.format(len(test_set)))

    # calculate test set size in tokens
    test_size = sum([len(sent) for sent in test_set])
    print('Test set: {} tokens'.format(test_size))

    # calculate the most common tag in the train set
    # this should be 'NOUN'
    tag_dist = []
    for sent in train_set:
        tag_dist += [x[1] for x in sent]
    counts = Counter()
    counts.update(tag_dist)
    most_common = counts.most_common(1)[0][0]
    print('Most common tag: {}'.format(most_common))

    # Create model
    backoff = DefaultTagger(most_common)
    tagger = UnigramTagger(train=train_set, backoff=backoff, cutoff=5)

    # Evaluate
    acc = tagger.evaluate(test_set)
    print('Baseline: {}'.format(acc))
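A hypothetical invocation of the function above; `_UNI` is assumed to be a module-level set of universal POS tags whose real definition is not shown in this snippet, so the value below is only a plausible stand-in:

from nltk.corpus import brown

_UNI = {'NOUN', 'VERB', 'ADJ', 'ADV', 'PRON', 'DET', 'ADP',
        'NUM', 'CONJ', 'PRT', '.', 'X'}  # plausible stand-in for the real _UNI
baseline(brown.tagged_sents(tagset='universal'))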
Example #6
    def get_pos_tagger(self):
        from nltk.corpus import brown

        regexp_tagger = RegexpTagger(
            [
                (r'^-?[0-9]+(.[0-9]+)?$', 'CD'),  # cardinal numbers
                (r'(The|the|A|a|An|an)$', 'AT'),  # articles
                (r'.*able$', 'JJ'),  # adjectives
                (r'.*ness$', 'NN'),  # nouns formed from adjectives
                (r'.*ly$', 'RB'),  # adverbs
                (r'.*s$', 'NNS'),  # plural nouns
                (r'.*ing$', 'VBG'),  # gerunds
                (r'.*ed$', 'VBD'),  # past tense verbs
                (r'.*', 'NN'),  # nouns (default)
            ]
        )
        brown_train = brown.tagged_sents(categories='news')
        unigram_tagger = UnigramTagger(brown_train, backoff=regexp_tagger)
        bigram_tagger = BigramTagger(brown_train, backoff=unigram_tagger)
        trigram_tagger = TrigramTagger(brown_train, backoff=bigram_tagger)

        # Override particular words
        main_tagger = RegexpTagger(
            [(r'(A|a|An|an)$', 'ex_quant'), (r'(Every|every|All|all)$', 'univ_quant')],
            backoff=trigram_tagger,
        )

        return main_tagger
Example #7
    def get_pos_tagger(self):
        from nltk.corpus import brown

        regexp_tagger = RegexpTagger(
            [
                (r"^-?[0-9]+(.[0-9]+)?$", "CD"),  # cardinal numbers
                (r"(The|the|A|a|An|an)$", "AT"),  # articles
                (r".*able$", "JJ"),  # adjectives
                (r".*ness$", "NN"),  # nouns formed from adjectives
                (r".*ly$", "RB"),  # adverbs
                (r".*s$", "NNS"),  # plural nouns
                (r".*ing$", "VBG"),  # gerunds
                (r".*ed$", "VBD"),  # past tense verbs
                (r".*", "NN"),  # nouns (default)
            ]
        )
        brown_train = brown.tagged_sents(categories="news")
        unigram_tagger = UnigramTagger(brown_train, backoff=regexp_tagger)
        bigram_tagger = BigramTagger(brown_train, backoff=unigram_tagger)
        trigram_tagger = TrigramTagger(brown_train, backoff=bigram_tagger)

        # Override particular words
        main_tagger = RegexpTagger(
            [(r"(A|a|An|an)$", "ex_quant"), (r"(Every|every|All|all)$", "univ_quant")],
            backoff=trigram_tagger,
        )

        return main_tagger
Example #8
def getUnigramTaggerAccuracy(trainingSet, testingSet):
    # trains and returns the accuracy of the UnigramTagger

    # get untagged sentences and gold POS tags
    testingUntaggedSentences = [[taggedWord[0] for taggedWord in sentence] for sentence in testingSet]
    testingGoldPOSTags = [[taggedWord[1] for taggedWord in sentence] for sentence in testingSet]

    # train tagger
    unigramTagger = UnigramTagger(trainingSet)

    # test tagger and get predicted POS tags
    unigramTaggedSentences = unigramTagger.tag_sents(testingUntaggedSentences)
    unigramTaggedSentencesPOSTags = [[taggedWord[1] for taggedWord in sentence] for sentence in unigramTaggedSentences]

    # calculate and return accuracy
    return calculateAccuracy(testingGoldPOSTags, unigramTaggedSentencesPOSTags)
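`calculateAccuracy` is defined elsewhere in the source project; a plausible stand-in consistent with how it is called here (token-level accuracy over parallel lists of tag sequences):

def calculateAccuracy(goldTagLists, predictedTagLists):
    # hypothetical helper: flatten both tag-list collections and compare position by position
    gold = [tag for sentence in goldTagLists for tag in sentence]
    predicted = [tag for sentence in predictedTagLists for tag in sentence]
    return sum(g == p for g, p in zip(gold, predicted)) / len(gold)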
Example #9
def tag_penn(words):
    """
    Tags a list of words with a unigram tagger trained on the Penn Treebank corpus.

    Parameters
    ----------
    words: A list of strings.

    Returns
    -------
    A list of tuples of (str, str)
    """

    pt_tagger = UnigramTagger(treebank.tagged_sents())
    tags = pt_tagger.tag(words)

    return tags
Example #10
def tag_penn(words):
    """
    Tags a list of words with a unigram tagger trained on the Penn Treebank corpus.

    Parameters
    ----------
    words: A list of strings.

    Returns
    -------
    A list of tuples of (str, str)
    """

    pt_tagger = UnigramTagger(treebank.tagged_sents())
    tags = pt_tagger.tag(words)

    return tags
Example #11
def unigram_bigram_tagger(train_sentences):
    return BigramTagger(
        train_sentences,
        backoff=UnigramTagger(
            train_sentences,
            backoff=DefaultTagger("NN")
        )
    )
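A quick way to exercise this helper, assuming the NLTK treebank corpus is installed; words unseen by both n-gram models fall through to the 'NN' default:

from nltk.corpus import treebank
from nltk.tag import BigramTagger, UnigramTagger, DefaultTagger

tagger = unigram_bigram_tagger(treebank.tagged_sents()[:3000])
print(tagger.tag('This is a test'.split()))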
Example #12
def contextual_rules(wikicorpus_dir, context_file):
    sentences = wikicorpus(wikicorpus_dir, words=1000000)

    ANONYMOUS = "anonymous"
    for s in sentences:
        for i, (w, tag) in enumerate(s):
            if tag == "NP": # NP = proper noun in Parole tagset.
                s[i] = (ANONYMOUS, "NP")

    ctx = fntbl37()

    tagger = UnigramTagger(sentences)
    tagger = BrillTaggerTrainer(tagger, ctx, trace=0)
    tagger = tagger.train(sentences, max_rules=100)

    # print(tagger.evaluate(wikicorpus(10000, start=1)))

    with open(context_file, "w") as f:
        for rule in tagger.rules():
            f.write("%s\n" % rule)
Example #13
 def test_pos_template(self):
     train_sents = treebank.tagged_sents()[:1000]
     tagger = UnigramTagger(train_sents)
     trainer = brill_trainer.BrillTaggerTrainer(
         tagger, [brill.Template(brill.Pos([-1]))])
     brill_tagger = trainer.train(train_sents)
     # Example from https://github.com/nltk/nltk/issues/769
     result = brill_tagger.tag('This is a foo bar sentence'.split())
     expected = [('This', 'DT'), ('is', 'VBZ'), ('a', 'DT'), ('foo', None),
                 ('bar', 'NN'), ('sentence', None)]
     self.assertEqual(result, expected)
Example #14
def train_tagger(tagger_name):
    train_sents = treebank.tagged_sents()[:5000]
    if tagger_name == "TnT" or tagger_name == 'tagger':
        trained_tagger = tnt.TnT()
        trained_tagger.train(train_sents)
    else:
        tagger1 = DefaultTagger('NN')
        tagger2 = TrigramTagger(train_sents, backoff=tagger1)
        tagger3 = BigramTagger(train_sents, backoff=tagger2)
        trained_tagger = UnigramTagger(train_sents, backoff=tagger3)
    return trained_tagger
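Example usage, assuming the imports the surrounding module already has (treebank, tnt, and the n-gram taggers); any name other than "TnT"/"tagger" selects the backoff chain:

tnt_tagger = train_tagger('TnT')
chain_tagger = train_tagger('backoff')   # any other string falls into the else branch
print(chain_tagger.tag('This is a test'.split()))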
Example #15
def getUnigramTaggerAccuracy(trainingSet, testingSet):
    # trains and returns the accuracy of the UnigramTagger

    # get untagged sentences and gold POS tags
    testingUntaggedSentences = [[taggedWord[0] for taggedWord in sentence]
                                for sentence in testingSet]
    testingGoldPOSTags = [[taggedWord[1] for taggedWord in sentence]
                          for sentence in testingSet]

    # train tagger
    unigramTagger = UnigramTagger(trainingSet)

    # test tagger and get predicted POS tags
    unigramTaggedSentences = unigramTagger.tag_sents(testingUntaggedSentences)
    unigramTaggedSentencesPOSTags = [[
        taggedWord[1] for taggedWord in sentence
    ] for sentence in unigramTaggedSentences]

    # calculate and return accuracy
    return calculateAccuracy(testingGoldPOSTags, unigramTaggedSentencesPOSTags)
Example #16
def make_pos_model(model_type):
    now = time.time()

    reader = TaggedCorpusReader('.', 'greek_training_set.pos')
    train_sents = reader.tagged_sents()
    if model_type == 'unigram':
        tagger = UnigramTagger(train_sents)
        file = 'unigram.pickle'
    elif model_type == 'bigram':
        tagger = BigramTagger(train_sents)
        file = 'bigram.pickle'
    elif model_type == 'trigram':
        tagger = TrigramTagger(train_sents)
        file = 'trigram.pickle'
    elif model_type == 'backoff':
        tagger1 = UnigramTagger(train_sents)
        tagger2 = BigramTagger(train_sents, backoff=tagger1)
        tagger = TrigramTagger(train_sents, backoff=tagger2)
        file = '123grambackoff.pickle'
    elif model_type == 'tnt':
        tagger = tnt.TnT()
        tagger.train(train_sents)
        file = 'tnt.pickle'
    else:
        raise ValueError('Invalid model_type.')  # fail fast; otherwise tagger/file would be undefined below

    _dir = os.path.expanduser('~/greek_models_cltk/taggers/pos')
    path = os.path.join(_dir, file)
    with open(path, 'wb') as f:
        pickle.dump(tagger, f)

    print('Completed training {0} model in {1} seconds to {2}.'.format(model_type, time.time() - now, path))
Example #17
 def transform(self, reviews, y=None):
     number_of_adjectives = []
     training_corpus = alp.tagged_sents()
     unitagger = UnigramTagger(training_corpus)
     pos_tag = unitagger.tag
     for review in reviews:
         tokens = re.findall(r"[\w']+|[.,!?;]", review)
         adj = 0
         for token in pos_tag(tokens):
             if token[1] == 'adj':
                 adj += 1
         number_of_adjectives.append([adj])
     return number_of_adjectives
Example #18
def ngram_tagger(tagged_sents):
    # single-line pattern: the triple-quoted original leaked a newline and
    # indentation into the regex and contained an empty alternative (n||p)
    patterns = [(r'(b|c|d|f|g|h|j|k|l|m|n|p|q|r|s|t|v|w|x|z)e'
                 r'(b|c|d|f|g|h|j|k|l|m|n|p|q|r|s|t|v|w|x|z)', 'MORA'),
                (r'.*(a|e|i|o|u|ä|î|ô|ü)(a|e|i|o|u|ä|î|ô|ü)', 'DOPPEL'),
                (r'.*', 'MORA_HAUPT')]  # default
    regex_tagger = nltk.RegexpTagger(patterns)

    tagger1 = UnigramTagger(tagged_sents, backoff=regex_tagger)
    # cutoff = 3, if necessary
    tagger2 = BigramTagger(tagged_sents, backoff=tagger1)
    tagger3 = TrigramTagger(tagged_sents, backoff=tagger2)

    return tagger3
Example #19
def tag_linked(words, default_tag='INFO'):
    """
    Tokenizes text by using a Penn Treebank tagged sentence and word tokenizers.
    Uses DefaultTagger to assign "default_tag" to any element missed by Penn Treebank tagger.

    Parameters
    ----------
    words: A list of strings.

    Returns
    -------
    A list of tuples of (str, str)
    :param default_tag:
    """

    default_tagger = DefaultTagger(default_tag)
    pt_tagger = UnigramTagger(treebank.tagged_sents())

    pt_tagger._taggers = [pt_tagger, default_tagger]

    tags = pt_tagger.tag(words)

    return tags
Example #20
def tag_linked(words, default_tag='INFO'):
    """
    Tokenizes text by using a Penn Treebank tagged sentence and word tokenizers.
    Uses DefaultTagger to assign "default_tag" to any element missed by Penn Treebank tagger.

    Parameters
    ----------
    words: A list of strings.

    Returns
    -------
    A list of tuples of (str, str)
    :param default_tag:
    """

    default_tagger = DefaultTagger(default_tag)
    pt_tagger = UnigramTagger(treebank.tagged_sents())

    pt_tagger._taggers = [pt_tagger, default_tagger]

    tags = pt_tagger.tag(words)

    return tags
Example #21
def get_words_simple(text_string):
    """
    Gets a list of tagged words from an input string
    using whitespace-based tokenisation and a unigram PoS tagger
    """
    # get trained Unigram tagger
    print('Loading unigram tagger...')
    train_sents = treebank.tagged_sents()
    unigram_tagger = UnigramTagger(train_sents)
    # stripping punctuation
    # string.translate() takes a dictionary as input.
    # The dictionary mapping ordinal chars to None is created in place:
    text_string = text_string.translate(
        {ord(c): None
         for c in CHARS_TO_DELETE})
    words = text_string.split()  # crude tokenisation, keeps contractions
    english_stops = stopwords.words('english')
    stops_set = set(english_stops + ADDITIONAL_STOPS)
    cleaned_words = []
    for w in words:
        if w not in stops_set and w not in string.punctuation:
            cleaned_words.append(w)
    return unigram_tagger.tag(cleaned_words)
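The function relies on two module-level constants whose values are not shown in the snippet; hypothetical definitions sufficient for a smoke test:

CHARS_TO_DELETE = '.,!?;:"()'        # hypothetical punctuation set
ADDITIONAL_STOPS = ['also', 'via']   # hypothetical project-specific stopwords

print(get_words_simple('The cat sat on the mat, purring.'))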
Example #22
class FeaturesetExtractor():

    def __init__(self):
        self.neg_words = [line.rstrip('\n') for line in open(NEG_WORD)]
        self.pos_words = [line.rstrip('\n') for line in open(POS_WORD)]
        self.anger_words = [line.rstrip('\n') for line in open(ANGER_WORD)]
        self.fear_words = [line.rstrip('\n') for line in open(FEAR_WORD)]
        self.happy_words = [line.rstrip('\n') for line in open(HAPPY_WORD)]  # assumes a HAPPY_WORD lexicon; the original reopened NEG_WORD here
        self.sad_words = [line.rstrip('\n') for line in open(SAD_WORD)]
        self.tagger = UnigramTagger(brown.tagged_sents(categories='news')[:500])
        
    def get_featureset(self, data_element):
        mapFeatureset = {}
        size = len(data_element.clean_text)
        word = data_element.clean_text
        list_word = word.split(" ")
        raw = data_element.raw_text
        list_word_raw = raw.split(" ")
        
        tot_pos_words = len(set(list_word) & set(self.pos_words))
        tot_neg_words = len(set(list_word) & set(self.neg_words))
        
        list_anger = tuple(set(list_word) & set(self.anger_words))
        list_fear = tuple(set(list_word) & set(self.fear_words))
        list_happy = tuple(set(list_word) & set(self.happy_words))
        list_sad = tuple(set(list_word) & set(self.sad_words))

        exclamation_count = raw.count("!")
        question_count = raw.count("?")
        uppercase_count = sum(1 for c in raw if c.isupper())

        mapFeatureset["bias"] = 1
        mapFeatureset["word"] = tuple(list_word)
        mapFeatureset["neg_words"] = tot_neg_words
        mapFeatureset["pos_words"] = tot_pos_words
        mapFeatureset["exclamation_count"] = exclamation_count
        mapFeatureset["question_count"] = question_count
        mapFeatureset["list_happy"] = list_happy
        mapFeatureset["list_sad"] = list_sad
        mapFeatureset["list_fear"] = list_fear
        mapFeatureset["list_anger"] = list_anger
        
        pos_tag_temp = self.tagger.tag((word).split(" "))
        list_pos_tag = []
        for element in pos_tag_temp:
            list_pos_tag.append(element[1])
        mapFeatureset["pos_tag"] = tuple(list_pos_tag)
        
        return mapFeatureset   
Example #23
    def test_ngram_taggers(self):
        unitagger = UnigramTagger(self.corpus, backoff=self.default_tagger)
        bitagger = BigramTagger(self.corpus, backoff=unitagger)
        tritagger = TrigramTagger(self.corpus, backoff=bitagger)
        ntagger = NgramTagger(4, self.corpus, backoff=tritagger)

        encoded = self.encoder.encode(ntagger)
        decoded = self.decoder.decode(encoded)

        self.assertEqual(repr(ntagger), repr(decoded))
        self.assertEqual(repr(tritagger), repr(decoded.backoff))
        self.assertEqual(repr(bitagger), repr(decoded.backoff.backoff))
        self.assertEqual(repr(unitagger), repr(decoded.backoff.backoff.backoff))
        self.assertEqual(repr(self.default_tagger), 
                         repr(decoded.backoff.backoff.backoff.backoff))
Example #24
def train_evaluate_brills(train_data, test_data):
    """Training and evaluating of Brill`s tagger"""
    # Define templates for rules, provided by nltk
    brill.Template._cleartemplates()
    templates = brill.fntbl37()
    # Define initial tagger, tagging by the most common tag
    initial_tagger = UnigramTagger(train_data)
    trainer = brill_trainer.BrillTaggerTrainer(
        initial_tagger=initial_tagger,  # better unk words handling
        templates=templates,
        trace=3,
        deterministic=True)
    tagger = trainer.train(train_data,
                           max_rules=100)  # max number of rules to learn 100
    print("Accuracy:", tagger.evaluate(test_data))
    return tagger.evaluate(test_data)
Example #25
 def test_pos_template(self):
     train_sents = treebank.tagged_sents()[:1000]
     tagger = UnigramTagger(train_sents)
     trainer = brill_trainer.BrillTaggerTrainer(
         tagger, [brill.Template(brill.Pos([-1]))])
     brill_tagger = trainer.train(train_sents)
     # Example from https://github.com/nltk/nltk/issues/769
     result = brill_tagger.tag("This is a foo bar sentence".split())
     expected = [
         ("This", "DT"),
         ("is", "VBZ"),
         ("a", "DT"),
         ("foo", None),
         ("bar", "NN"),
         ("sentence", None),
     ]
     self.assertEqual(result, expected)
Example #26
 def create_a_dict_model_for_test_accuracy(self, tagged_unigrams_by_topic):
     pre_model = {
         k: map(dict, v)
         for k, v in tagged_unigrams_by_topic.items()
     }
     for k, v in pre_model.items():
         reference_model_by_topic = {}
         for i in v:
             reference_model_by_topic.update(i)
         pre_model[k] = reference_model_by_topic
     dict_model_by_topic = pre_model
     test_sents = mac_morpho.tagged_sents()[:5000]
     tagger_accuracy_by_topic = {}
     for k, v in pre_model.items():
         tagger_accuracy_by_topic[k] = UnigramTagger(
             model=pre_model[k]).evaluate(test_sents)
     return dict_model_by_topic, tagger_accuracy_by_topic
Example #27
def lexical(tokens):
    print("\n")
    print("Step 2: Lexical Analysis\n")
    print("Essentially refers to the dictionary and obtains the properties of the word")
    print("Part-Of-Speech tagging")
    print("The tagset is:\n")

    tag = DefaultTagger('NN')
    tagg = UnigramTagger(train_sent, backoff=tag)
    tagger = BigramTagger(train_sent, backoff=tagg)

    tagtokens = tagger.tag(tokens)
    for token, tag in tagtokens:
        print(token + "->" + tag)
    print("\n")
    print("The accuracy of the trained pos tagger is:")
    print(tagger.evaluate(test_sents))

    return tagtokens
Example #28
    def wordTagger(self, wordlist, number):
        train_sents = treebank.tagged_sents()[:3000]
        if number==1:
            taglist = nltk.pos_tag(wordlist)
        elif number ==2:
            tagger = DefaultTagger('NN')
            taglist = tagger.tag(wordlist)
        elif number ==3:
            tagger = UnigramTagger(train_sents)
            taglist = tagger.tag(wordlist)

        elif number ==4:
            tnt_tagger = tnt.TnT()
            tnt_tagger.train(train_sents)
            taglist = tnt_tagger.tag(wordlist)
        elif number ==5:
            tagger = ClassifierBasedPOSTagger(train=train_sents)
            taglist = tagger.tag(wordlist)
        return taglist
Example #29
def train_brill_tagger(tagged_sents):

    # The brill tagger module in NLTK.
    Template._cleartemplates()
    templates = brill24()  # or fntbl37
    # default_tagger = nltk.DefaultTagger('MORA_HAUPT')
    # single-line pattern: the triple-quoted original leaked a newline and
    # indentation into the regex and contained an empty alternative (n||p)
    patterns = [(r'(b|c|d|f|g|h|j|k|l|m|n|p|q|r|s|t|v|w|x|z)e'
                 r'(b|c|d|f|g|h|j|k|l|m|n|p|q|r|s|t|v|w|x|z)', 'MORA'),
                (r'.*(a|e|i|o|u|ä|î|ô|ü)(a|e|i|o|u|ä|î|ô|ü)', 'DOPPEL'),
                (r'.*', 'MORA_HAUPT')]  # default
    regex_tagger = nltk.RegexpTagger(patterns)
    tagger1 = UnigramTagger(tagged_sents, backoff=regex_tagger)
    # cutoff = 3, if necessary
    tagger2 = BigramTagger(tagged_sents, backoff=tagger1)
    tagger3 = TrigramTagger(tagged_sents, backoff=tagger2)
    tagger4 = brill_trainer.BrillTaggerTrainer(tagger3, templates, trace=3)
    tagger5 = tagger4.train(tagged_sents, max_rules=200)

    print()
    return tagger5
Example #30
    def __init__(self, rooms):
        self.__sentences = list(brown.tagged_sents(categories=['adventure']))

        for room in rooms:
            """
            This will only really work with UnigramTagger, since there's no context 
            """
            self.__sentences.append([(name, 'NN') for name in room.names])
            self.__sentences.append([(noun, 'NN')
                                     for noun in room.noun_to_item.keys()])
            for item in room:
                for i in range(1000):  # TODO: Weight less hackily
                    self.__sentences.append([
                        (verb, 'VB') for verb in item.verb_to_action.keys()
                    ])

        self.tokenize = word_tokenize
        self.__tagger = UnigramTagger(train=self.__sentences)
        self.tag = self.__tagger.tag

        self.tokens = []
        self.tagged = []
Example #31
class PyTenseShift(object):

    """Initialization of PyTenseShift objects.
    
    The important part when you use the PlPyTenseShift is that
    we allow you to implmenent your own Tagger to optimize your
    results in translating from present to past tense. So, you need
    to implement the taggerinterface and change the second line of
    this code
    """
    def __init__(self, corpus, isPl):
        if isPl:
            self.tagger = FirstTagger(corpus)
        else:
            dtag = DefaultTagger("NN")
            self.__utag = UnigramTagger(corpus.tagged_sents(), backoff = dtag)

    """ Tokenize the input sentence into words.
    This kind of representation is better to evaluate.
    
    """
    def _tokenize(self, tense, isPl):
        if isPl:
            return self.tagger.tag(tense)
        else:
            return self.__utag.tag(tokenize(tense))

    def getPastTense(self, tense):
        """Translates sentence given in present tense into past tense 
        
        Args:
            sentence (str): Sentence to translate
        Returns:
            str. Sentence in past tense
        """
        raise NotImplementedError("abstract method")
Example #32
def train_tagger():
    '''
    An example of training a part-of-speech tagger using a
    probability-based trigram model.

    A POS tagger identifies the word class of each word.
    E.g.: Isso é um teste = Isso-PROSUB é-V um-ART teste-N
    (preposition, verb, article, noun)
    '''

    # Load a Portuguese dataset whose sentences have been
    # manually tagged
    data = [
        [(w, re.split('[|-]', tag)[0]) for w, tag in sent]
        for sent in mac_morpho.tagged_sents()]

    # Default word class; N means noun
    tagger0 = DefaultTagger('N')
    print('training unigram')
    tagger1 = UnigramTagger(data, backoff=tagger0)
    print('training bigram')
    tagger2 = BigramTagger(data, backoff=tagger1)
    print('training trigram')
    return TrigramTagger(data, backoff=tagger2)
Example #33
def Tagger():
    #Tagger
    etiq1 = DefaultTagger('N')
    sentencas_treinadoras = mac_morpho.tagged_sents()[::]
    etiq2 = UnigramTagger(sentencas_treinadoras, backoff=etiq1)
    return etiq2
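A usage sketch, assuming the mac_morpho corpus has been downloaded; words unseen in training fall back to 'N' (noun):

tagger = Tagger()
print(tagger.tag('O rato roeu a roupa do rei'.split()))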
Example #34
def stringFromHTMLParagraph(paraWithTags):
    paraString = ''
    for taggedString in paraWithTags.strings:
        paraString += removeApostrophe(taggedString.string)
    return paraString

def titleFromArticleSoup(soup):
    titleDiv = soup.find(class_ = 'story-heading')
    if not titleDiv:
        titleDiv = soup.find(class_ = 'entry-title')
    return str(removeApostrophe(titleDiv.string))  # unicode() in the original Python 2 code

# Set up the tokenizer and the tagger
tokenizer = RegexpTokenizer(r'\w+')
tagger = UnigramTagger(treebank.tagged_sents())

# Open up a redis connection
redisInterface = RedisInterface()

# Print status
print('Reader ONLINE')

# Run the wait-execute loop
while True:

    while not redisInterface.hasPending():
        sleep(1)

    page = redisInterface.popPending()
    print('Reading ' + page + ' STARTED')
Example #35
from nltk.corpus import brown
from nltk.tag import UnigramTagger
import pickle  # cPickle in the original Python 2 code

INPUT_FILE = "/dfs/scratch0/googlengrams/2012-eng-fic/info/commonnonstop-1900-2000-8-6.pkl"

def write_word_list(filename, word_list):
    out_fp = open(filename, "w")
    out_fp.write("\n".join(word_list) + "\n")

if __name__ == '__main__':
    in_fp = open(INPUT_FILE, "rb") 
    words = pickle.load(in_fp)
    tagger = UnigramTagger(brown.tagged_sents())
    good_words = []
    for word in words:
        tag = tagger.tag([word])[0][1]
        if tag is None:
            continue
        if "NP" in tag:
            continue
        good_words.append(word)
    write_word_list("brown.txt", good_words)
Example #36
import nltk
from nltk.tag import UnigramTagger
from nltk.corpus import treebank
training= treebank.tagged_sents()[:7000]
unitagger=UnigramTagger(training)
print(treebank.sents()[0])
print(unitagger.tag(treebank.sents()[0]))
Example #37
one_hot_multi.fit_transform(tagged_tweets)
# inspect the feature names
one_hot_multi.classes_  # inspect the feature names

from nltk.corpus import brown
from nltk.tag import UnigramTagger
from nltk.tag import BigramTagger
from nltk.tag import TrigramTagger

# get tagged sentences from the Brown corpus
sentences = brown.tagged_sents(categories='news')
# use 4,000 sentences for training and the remaining 623 for testing
train = sentences[:4000]
test = sentences[4000:]
# build the backoff tagger chain
unigram = UnigramTagger(train)
bigram = BigramTagger(train, backoff=unigram)
trigram = TrigramTagger(train, backoff=bigram)
# check the accuracy
trigram.evaluate(test)

# TF-IDF
import numpy as np
from sklearn.feature_extraction.text import TfidfVectorizer
# create the texts
text_data = np.array(
    ['I love Brazil. Brazil!', 'Sweden is best', 'Germany beats both'])
# create the TF-IDF feature matrix
tfidf = TfidfVectorizer()
feature_matrix = tfidf.fit_transform(text_data)
# inspect the TF-IDF feature matrix
Example #38
 def __init__(self, corpus):
     dtag = DefaultTagger("NN")
     self.__utag = UnigramTagger(corpus.tagged_sents(), backoff = dtag)
Example #39
import nltk
from nltk.tag import UnigramTagger
from nltk.tag import DefaultTagger
from nltk.corpus import treebank
testing = treebank.tagged_sents()[2000:]
training= treebank.tagged_sents()[:7000]
tag1=DefaultTagger('NN')
tag2=UnigramTagger(training,backoff=tag1)
print(tag2.evaluate(testing))
Example #40
brown_tagged_sents = brown.tagged_sents(categories='news')
#print(brown_tagged_sents)
# [[('The', 'AT'), ('Fulton', 'NP-TL'), ('County', 'NN-TL')], ...]
default_tagger = nltk.DefaultTagger('NN')
print(default_tagger.evaluate(brown_tagged_sents))
# 0.13089484257215028

brown_tagged_sents2 = [[('The', 'AT'), ('Fulton', 'NP-TL'), ('manner', 'NN')]]
print(default_tagger.evaluate(brown_tagged_sents2))
# 0.3333333333333333

train_data = brown_tagged_sents[:int(len(brown_tagged_sents) * 0.9)]
test_data = brown_tagged_sents[int(len(brown_tagged_sents) * 0.9):]

unigram_tagger = UnigramTagger(train_data, backoff=default_tagger)
print(unigram_tagger.evaluate(test_data))
# 0.835841722316356

bigram_tagger = BigramTagger(train_data, backoff=unigram_tagger)
print(bigram_tagger.evaluate(test_data))
# 0.8454101465164956

trigram_tagger = TrigramTagger(train_data, backoff=bigram_tagger)
print(trigram_tagger.evaluate(test_data))
# 0.8427190272102063

regexp_tagger = RegexpTagger(
    [( r'^-?[0-9]+(.[0-9]+)?$', 'CD'), # cardinal numbers
    ( r'(The|the|A|a|An|an)$', 'AT'), # articles
    ( r'.*able$', 'JJ'), # adjectives
Example #41
    airportcode[row['airpt_cd']] = 'AIRPORT'
    airportcode[row['airpt_name']] = 'AIRPORT'
    airportcode[row['city_cd']] = 'CITY'

for row in db.codeshare_words.find():
    codeshareModel[row['word'].strip()] = 'CODESHARE'

TO_MODEL = {}
TO_MODEL['-'] = 'TO'
TO_MODEL['至'] = 'TO'
# ABOVE THREE CAN BE COLLAPSED INTO A GENERIC DATABASE TABLE FOR UNIGRAM TAGGERS
CURRENCY = set(currencycode.keys())
known_tourcodes = {tc: 'TC' for tc in TOURCODES}  # setup tourcodes model

train_sents = treebank.tagged_sents()[:3000]
unigramtagger = UnigramTagger(train_sents, backoff=backoff)
currencytagger = UnigramTagger(model=currencycode,
                               backoff=unigramtagger)  # tag currency
airporttagger = UnigramTagger(model=airportcode,
                              backoff=currencytagger)  # tag airports
codesharetagger = UnigramTagger(model=codeshareModel,
                                backoff=airporttagger)  # tag codeshare
carriertagger = UnigramTagger(model=CARRIER_MODEL,
                              backoff=codesharetagger)  # tag carriers

datetagger = UnigramTagger(model=monthModel,
                           backoff=carriertagger)  # tag months
rtagger = RegexpTagger(patterns, backoff=datetagger)
known_tourcodes_tagger = UnigramTagger(model=known_tourcodes, backoff=rtagger)

dashtagger = UnigramTagger(model=TO_MODEL, backoff=known_tourcodes_tagger)
Example #42
#!/usr/bin/python
# -*- coding: utf-8 -*-
from __future__ import unicode_literals, division
from nltk.tag import UnigramTagger


if __name__ == '__main__':
    model = {u'Péter': 'N', 'Enikő': 'N', 'szeret': 'V', 'Marit': 'Nacc'}
    tagger = UnigramTagger(model=model)

    print(tagger.tag(['Péter', 'Enikő', 'szeret', 'Marit']))
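Words absent from the hand-written model come back tagged None; a small follow-up sketch (same model dict) chains a DefaultTagger as backoff to fill those in:

from nltk.tag import UnigramTagger, DefaultTagger

model = {u'Péter': 'N', 'Enikő': 'N', 'szeret': 'V', 'Marit': 'Nacc'}
backed = UnigramTagger(model=model, backoff=DefaultTagger('N'))
print(backed.tag(['Péter', 'alszik']))  # 'alszik' is not in the model -> tagged 'N'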


Example #43
import nltk
from nltk.corpus import treebank
from nltk.tag import UnigramTagger
training= treebank.tagged_sents()[:7000]
unitagger=UnigramTagger(training)
testing = treebank.tagged_sents()[2000:]
print(unitagger.evaluate(testing))
Example #44
import nltk
from nltk.corpus import brown
from nltk.tag import UnigramTagger
tagger = UnigramTagger(brown.tagged_sents(categories='news')[:700])
sentence = ['John','and','Smith','went','to','NY','and','Germany']
for word, tag in tagger.tag(sentence):
    print(word,'->',tag)
Example #45
class FirstTagger(TaggerInterface):
    
    def __init__(self, corpus):
        dtag = DefaultTagger("NN")
        self.__utag = UnigramTagger(corpus.tagged_sents(), backoff = dtag)
        
    def tag(self, tense):
        """Does translation from tag generated by tagger into unified format
        
            Args:
                sentence: list of touple (word and its form) which are after verb
            Returns:
                list of touple (word and its form in unified format)
        """
        words = self.__utag.tag(tokenize(tense))
        
        for i, (word, form) in enumerate(words):
            word_info = {}
            
            if form[0] == 'V': word_info['klasa'] = 'czasownik'
            elif form[0] == 'S': word_info['klasa'] = 'rzeczownik'
            elif form[0] == 'A': word_info['klasa'] = 'przymiotnik'
            elif form[0] == 'N': word_info['klasa'] = 'liczebnik'
            elif form[0] == 'Z': word_info['klasa'] = 'zaimek'
            elif form[0] == 'D': word_info['klasa'] = 'przysłówek'
            elif form[0] == 'P': word_info['klasa'] = 'przyimek'
            elif form[0] == 'C': word_info['klasa'] = 'spójnik'
            elif form[0] == 'I': word_info['klasa'] = 'wykrzyknik'
            elif form[0] == 'T': word_info['klasa'] = 'partykuła'
            else: word_info['klasa'] = 'nieznany'
            
            if form[1] == 'S': word_info['liczba'] = 'pojedyńcza'
            elif form[1] == 'P': word_info['liczba'] = 'mnoga'
            
            if(len(form) >= 3):
                if form[2] == 'N': word_info['przypadek'] = 'mianownik'
                elif form[2] == 'G': word_info['przypadek'] = 'dopełniacz'
                elif form[2] == 'D': word_info['przypadek'] = 'celownik'
                elif form[2] == 'A': word_info['przypadek'] = 'biernik'
                elif form[2] == 'I': word_info['przypadek'] = 'narzędnik'
                elif form[2] == 'L': word_info['przypadek'] = 'miejscownik'
                elif form[2] == 'V': word_info['przypadek'] = 'wołacz'
            
            if(len(form) >= 4):
                if form[3] == 'M': word_info['rodzaj'] = 'm'
                elif form[3] == 'P': word_info['rodzaj'] = 'm'
                elif form[3] == 'A': word_info['rodzaj'] = 'm'
                elif form[3] == 'I': word_info['rodzaj'] = 'm'
                elif form[3] == 'F': word_info['rodzaj'] = 'ż'
                elif form[3] == 'N': word_info['rodzaj'] = 'n'
                elif form[3] == 'O': word_info['rodzaj'] = 'm'
                elif form[3] == 'R': word_info['rodzaj'] = 'ż'
                elif form[3] == 'T': word_info['rodzaj'] = 'ż'
            if(len(form) >= 6):
                if form[5] == '1': word_info['osoba'] = 'pierwsza'
                elif form[5] == '2': word_info['osoba'] = 'druga'
                elif form[5] == '3': word_info['osoba'] = 'trzecia'
                elif form[5] == 'I': word_info['osoba'] = 'bezokolicznik'
                elif form[5] == 'B': word_info['osoba'] = 'bezosobnik'
                elif form[5] == 'U': word_info['osoba'] = 'imiesłów'
                elif form[5] == 'W': word_info['osoba'] = 'imiesłów'
            if(len(form) >= 7):
                if form[6] == 'T': word_info['czas'] = 'teraźniejszy'
                elif form[6] == 'P': word_info['czas'] = 'przeszły'
                elif form[6] == 'F': word_info['czas'] = 'przyszły'
            if(len(form) >= 8):
                if form[7] == 'O': word_info['tryb'] = 'oznajmujący'
                elif form[7] == 'P': word_info['tryb'] = 'przypuszczający'
                elif form[7] == 'R': word_info['tryb'] = 'rozkazujący'
            if(len(form) >= 9):
                if form[8] == 'D': word_info['aspekt'] = 'dokonane'
                elif form[8] == 'N': word_info['aspekt'] = 'niedokonane'
            
            words[i] = (words[i][0], word_info)
        
        return words
Example #46
def cltk_pos_cv(full_training_set, local_dir_rel):
    print("full_training_set", full_training_set)

    unigram_accuracies = []
    bigram_accuracies = []
    trigram_accuracies = []
    backoff_accuracies = []
    tnt_accuracies = []

    with open(full_training_set) as f:
        training_set_string = f.read()

    pos_set = training_set_string.split('\n\n')  # mk into a list

    sentence_count = len(pos_set)  # 3473
    tenth = math.ceil(sentence_count / 10)

    random.seed(0)
    random.shuffle(pos_set)

    def chunks(l, n):
        """Yield successive n-sized chunks from l.
        http://stackoverflow.com/a/312464
        """
        for i in range(0, len(l), n):
            yield l[i:i+n]

    # a list of 10 lists
    ten_parts = list(chunks(pos_set, tenth))  # a list of 10 lists with ~347 sentences each

    #for counter in list(range(10)):
    for counter, part in list(enumerate(ten_parts)):
        # map test list to part of given loop
        test_set = ten_parts[counter]  # or: test_set = part
        
        # filter out this loop's test index
        training_set_lists = [x for x in ten_parts if x is not ten_parts[counter]]
        
        # next concatenate the list together into 1 file ( http://stackoverflow.com/a/952952 )
        training_set = [item for sublist in training_set_lists for item in sublist]
            
        # save shuffled tests to file (as NLTK trainers expect)
        #local_dir_rel = '~/cltk_data/user_data'
        local_dir = os.path.expanduser(local_dir_rel)
        if not os.path.isdir(local_dir):
            os.makedirs(local_dir)

        test_path = os.path.join(local_dir, 'test.pos')
        with open(test_path, 'w') as f:
            f.write('\n\n'.join(test_set))

        train_path = os.path.join(local_dir, 'train.pos')
        with open(train_path, 'w') as f:
            f.write('\n\n'.join(training_set))

        # read POS corpora
        print("local_dir", local_dir)
        train_reader = TaggedCorpusReader(local_dir, 'train.pos')
        train_sents = train_reader.tagged_sents()

        test_reader = TaggedCorpusReader(local_dir, 'test.pos')
        test_sents = test_reader.tagged_sents()
        
        print('Loop #' + str(counter))
        # make unigram tagger
        unigram_tagger = UnigramTagger(train_sents)
        # evaluate unigram tagger
        unigram_accuracy = None
        unigram_accuracy = unigram_tagger.evaluate(test_sents)
        unigram_accuracies.append(unigram_accuracy)
        print('Unigram:', unigram_accuracy)
        
        # make bigram tagger
        bigram_tagger = BigramTagger(train_sents)
        # evaluate bigram tagger
        bigram_accuracy = None
        bigram_accuracy = bigram_tagger.evaluate(test_sents)
        bigram_accuracies.append(bigram_accuracy)
        print('Bigram:', bigram_accuracy)
        
        # make trigram tagger
        trigram_tagger = TrigramTagger(train_sents)
        # evaluate trigram tagger
        trigram_accuracy = None
        trigram_accuracy = trigram_tagger.evaluate(test_sents)
        trigram_accuracies.append(trigram_accuracy)
        print('Trigram:', trigram_accuracy)
        
        # make 1, 2, 3-gram backoff tagger
        tagger1 = UnigramTagger(train_sents)
        tagger2 = BigramTagger(train_sents, backoff=tagger1)
        tagger3 = TrigramTagger(train_sents, backoff=tagger2)
        # evaluate trigram tagger
        backoff_accuracy = None
        backoff_accuracy = tagger3.evaluate(test_sents)
        backoff_accuracies.append(backoff_accuracy)
        print('1, 2, 3-gram backoff:', backoff_accuracy)
        
        # make tnt tagger
        tnt_tagger = tnt.TnT()
        tnt_tagger.train(train_sents)
        # evaluate tnt tagger
        tnt_accuracy = None
        tnt_accuracy = tnt_tagger.evaluate(test_sents)
        tnt_accuracies.append(tnt_accuracy)
        print('TnT:', tnt_accuracy)

    final_accuracies_list = []
    mean_accuracy_unigram = mean(unigram_accuracies)
    standard_deviation_unigram = stdev(unigram_accuracies)
    uni = {'unigram': {'mean': mean_accuracy_unigram, 'sd': standard_deviation_unigram}}
    final_accuracies_list.append(uni)

    mean_accuracy_bigram = mean(bigram_accuracies)
    standard_deviation_bigram = stdev(bigram_accuracies)
    bi = {'bigram': {'mean': mean_accuracy_bigram, 'sd': standard_deviation_bigram}}
    final_accuracies_list.append(bi)

    mean_accuracy_trigram = mean(trigram_accuracies)
    standard_deviation_trigram = stdev(trigram_accuracies)
    tri = {'trigram': {'mean': mean_accuracy_trigram, 'sd': standard_deviation_trigram}}
    final_accuracies_list.append(tri)

    mean_accuracy_backoff = mean(backoff_accuracies)
    standard_deviation_backoff = stdev(backoff_accuracies)
    back = {'1, 2, 3-gram backoff': {'mean': mean_accuracy_backoff, 'sd': standard_deviation_backoff}}
    final_accuracies_list.append(back)

    mean_accuracy_tnt = mean(tnt_accuracies)
    standard_deviation_tnt = stdev(tnt_accuracies)
    tnt_score = {'tnt': {'mean': mean_accuracy_tnt, 'sd': standard_deviation_tnt}}
    final_accuracies_list.append(tnt_score)

    final_dict = {}
    for x in final_accuracies_list:
        final_dict.update(x)
    
    return final_dict
Example #47
from nltk.tag import UnigramTagger, DefaultTagger
from nltk.corpus import treebank

from tag_util import train_sents, test_sents

# train
default_tagger = DefaultTagger('NN')
tagger = UnigramTagger(train_sents, backoff=default_tagger)

# test
print(tagger.evaluate(test_sents))

# save to pickle
import pickle
with open('unitagger.pkl', 'wb') as output:
    pickle.dump(tagger, output)

# load from pickle
with open('unitagger.pkl', 'rb') as data_file:
    tagger2 = pickle.load(data_file)

print(tagger2.evaluate(test_sents))

# or nltk.data.load('unitagger.pkl') to load
Example #48
######## UNIGRAM TAGGER ##########

from nltk.tag import UnigramTagger
from nltk.corpus import treebank

# We use the first 3000 sentences of the treebank corpus as the training set to
# initialize the UnigramTagger class.
# A unigram tagger can be trained by giving it a list of tagged sentences at initialization.
train_sents=treebank.tagged_sents()[:3000]
tagger=UnigramTagger(train_sents)
print(treebank.sents()[0])
print(tagger.tag(treebank.sents()[0]))

test_sents=treebank.tagged_sents()[3000:]
print(tagger.evaluate(test_sents))



tagger=UnigramTagger(model={'Pierre':'NN'})
print(tagger.tag(treebank.sents()[0]))  # tag one sentence; the original mistakenly indexed the tagged corpus
Example #49
def postag(
    templates=None,
    tagged_data=None,
    num_sents=1000,
    max_rules=300,
    min_score=3,
    min_acc=None,
    train=0.8,
    trace=3,
    randomize=False,
    ruleformat="str",
    incremental_stats=False,
    template_stats=False,
    error_output=None,
    serialize_output=None,
    learning_curve_output=None,
    learning_curve_take=300,
    baseline_backoff_tagger=None,
    separate_baseline_data=False,
    cache_baseline_tagger=None):
    """
    Brill Tagger Demonstration
    :param templates: the rule templates to be used in training
    :type templates: list of Template

    :param tagged_data: the tagged corpus to train and test on
    :type tagged_data: C{int}

    :param num_sents: how many sentences of training and testing data to use
    :type num_sents: C{int}

    :param max_rules: maximum number of rule instances to create
    :type max_rules: C{int}

    :param min_score: the minimum score for a rule in order for it to be considered
    :type min_score: C{int}

    :param min_acc: the minimum accuracy for a rule in order for it to be considered
    :type min_acc: C{float}

    :param train: the fraction of the corpus to be used for training (1=all)
    :type train: C{float}

    :param trace: the level of diagnostic tracing output to produce (0-4)
    :type trace: C{int}

    :param randomize: whether the training data should be a random subset of the corpus
    :type randomize: C{bool}

    :param ruleformat: rule output format, one of "str", "repr", "verbose"
    :type ruleformat: C{str}

    :param incremental_stats: if true, will tag incrementally and collect stats for each rule (rather slow)
    :type incremental_stats: C{bool}

    :param template_stats: if true, will print per-template statistics collected in training and (optionally) testing
    :type template_stats: C{bool}

    :param error_output: the file where errors will be saved
    :type error_output: C{string}

    :param serialize_output: the file where the learned tbl tagger will be saved
    :type serialize_output: C{string}

    :param learning_curve_output: filename of plot of learning curve(s) (train and also test, if available)
    :type learning_curve_output: C{string}

    :param learning_curve_take: how many rules plotted
    :type learning_curve_take: C{int}

    :param baseline_backoff_tagger: the backoff tagger used by the baseline tagger
    :type baseline_backoff_tagger: tagger

    :param separate_baseline_data: use a fraction of the training data exclusively for training baseline
    :type separate_baseline_data: C{bool}

    :param cache_baseline_tagger: cache baseline tagger to this file (only interesting as a temporary workaround to get
                                  deterministic output from the baseline unigram tagger between python versions)
    :type cache_baseline_tagger: C{string}


    Note on separate_baseline_data: if False, reuse training data both for baseline and rule learner. This
    is fast and fine for a demo, but is likely to generalize worse on unseen data.
    Also cannot be sensibly used for learning curves on training data (the baseline will be artificially high).
    """

    # defaults
    baseline_backoff_tagger = baseline_backoff_tagger or REGEXP_TAGGER
    if templates is None:
        from nltk.tag.brill import describe_template_sets, brill24
        # some pre-built template sets taken from typical systems or publications are
        # available. Print a list with describe_template_sets()
        # for instance:
        templates = brill24()
    (training_data, baseline_data, gold_data, testing_data) = \
       _demo_prepare_data(tagged_data, train, num_sents, randomize, separate_baseline_data)

    # creating (or reloading from cache) a baseline tagger (unigram tagger)
    # this is just a mechanism for getting deterministic output from the baseline between
    # python versions
    if cache_baseline_tagger:
        if not os.path.exists(cache_baseline_tagger):
            baseline_tagger = UnigramTagger(baseline_data, backoff=baseline_backoff_tagger)
            with open(cache_baseline_tagger, 'wb') as print_rules:
                pickle.dump(baseline_tagger, print_rules)
            print("Trained baseline tagger, pickled it to {0}".format(cache_baseline_tagger))
        with open(cache_baseline_tagger, "r") as print_rules:
            baseline_tagger= pickle.load(print_rules)
            print("Reloaded pickled tagger from {0}".format(cache_baseline_tagger))
    else:
        baseline_tagger = UnigramTagger(baseline_data, backoff=baseline_backoff_tagger)
        print("Trained baseline tagger")
    if gold_data:
        print("    Accuracy on test set: {0:0.4f}".format(baseline_tagger.evaluate(gold_data)))

    # creating a Brill tagger
    tbrill = time.time()
    trainer = BrillTaggerTrainer(baseline_tagger, templates, trace, ruleformat=ruleformat)
    print("Training tbl tagger...")
    brill_tagger = trainer.train(training_data, max_rules, min_score, min_acc)
    print("Trained tbl tagger in {0:0.2f} seconds".format(time.time() - tbrill))
    if gold_data:
        print("    Accuracy on test set: %.4f" % brill_tagger.evaluate(gold_data))

    # printing the learned rules, if learned silently
    if trace == 1:
        print("\nLearned rules: ")
        for (ruleno, rule) in enumerate(brill_tagger.rules(),1):
            print("{0:4d} {1:s}".format(ruleno, rule.format(ruleformat)))


    # printing template statistics (optionally including comparison with the training data)
    # note: if not separate_baseline_data, then baseline accuracy will be artificially high
    if  incremental_stats:
        print("Incrementally tagging the test data, collecting individual rule statistics")
        (taggedtest, teststats) = brill_tagger.batch_tag_incremental(testing_data, gold_data)
        print("    Rule statistics collected")
        if not separate_baseline_data:
            print("WARNING: train_stats asked for separate_baseline_data=True; the baseline "
                  "will be artificially high")
        trainstats = brill_tagger.train_stats()
        if template_stats:
            brill_tagger.print_template_statistics(teststats)
        if learning_curve_output:
            _demo_plot(learning_curve_output, teststats, trainstats, take=learning_curve_take)
            print("Wrote plot of learning curve to {0}".format(learning_curve_output))
    else:
        print("Tagging the test data")
        taggedtest = brill_tagger.tag_sents(testing_data)
        if template_stats:
            brill_tagger.print_template_statistics()

    # writing error analysis to file
    if error_output is not None:
        with open(error_output, 'w') as f:
            f.write('Errors for Brill Tagger %r\n\n' % serialize_output)
            f.write('\n'.join(error_list(gold_data, taggedtest)) + '\n')
        print("Wrote tagger errors including context to {0}".format(error_output))

    # serializing the tagger to a pickle file and reloading (just to see it works)
    if serialize_output is not None:
        taggedtest = brill_tagger.tag_sents(testing_data)
        with open(serialize_output, 'wb') as print_rules:
            pickle.dump(brill_tagger, print_rules)
        print("Wrote pickled tagger to {0}".format(serialize_output))
        with open(serialize_output, "r") as print_rules:
            brill_tagger_reloaded = pickle.load(print_rules)
        print("Reloaded pickled tagger from {0}".format(serialize_output))
        taggedtest_reloaded = brill_tagger_reloaded.tag_sents(testing_data)
        if taggedtest == taggedtest_reloaded:
            print("Reloaded tagger tried on test set, results identical")
        else:
            print("PROBLEM: Reloaded tagger gave different results on test set")
Example #50
def train_tagger(language, model_type, feature, train_sents):
    if model_type == 'unigram':
        tagger = UnigramTagger(train_sents)
    elif model_type == 'bigram':
        tagger = BigramTagger(train_sents)
    elif model_type == 'trigram':
        tagger = TrigramTagger(train_sents)
    elif model_type == 'backoff':
        tagger1 = UnigramTagger(train_sents)
        tagger2 = BigramTagger(train_sents, backoff=tagger1)
        tagger = TrigramTagger(train_sents, backoff=tagger2)
    elif model_type == 'crf':
        tagger = CRFTagger()
        tagger.train(train_sents,
                     'taggers/{0}/{1}/crf.pickle'.format(language, feature))
    elif model_type == 'perceptron':
        tagger = PerceptronTagger(load=False)
        tagger.train(train_sents)

    return tagger
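A sketch of calling this dispatcher, assuming the surrounding module's imports; the 'crf' branch additionally requires the python-crfsuite package, and the language/feature arguments only affect the CRF pickle path:

from nltk.corpus import treebank

train_sents = treebank.tagged_sents()[:3000]
tagger = train_tagger('english', 'backoff', 'pos', train_sents)
print(tagger.tag('This is a test'.split()))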
Example #51
def postag(
    templates=None,
    tagged_data=None,
    num_sents=1000,
    max_rules=300,
    min_score=3,
    min_acc=None,
    train=0.8,
    trace=3,
    randomize=False,
    ruleformat="str",
    incremental_stats=False,
    template_stats=False,
    error_output=None,
    serialize_output=None,
    learning_curve_output=None,
    learning_curve_take=300,
    baseline_backoff_tagger=None,
    separate_baseline_data=False,
    cache_baseline_tagger=None):
    """
    Brill Tagger Demonstration
    :param templates: the rule templates to be used in training
    :type templates: list of Template

    :param tagged_data: the tagged corpus to train and test on
    :type tagged_data: C{int}

    :param num_sents: how many sentences of training and testing data to use
    :type num_sents: C{int}

    :param max_rules: maximum number of rule instances to create
    :type max_rules: C{int}

    :param min_score: the minimum score for a rule in order for it to be considered
    :type min_score: C{int}

    :param min_acc: the minimum accuracy for a rule in order for it to be considered
    :type min_acc: C{float}

    :param train: the fraction of the corpus to be used for training (1=all)
    :type train: C{float}

    :param trace: the level of diagnostic tracing output to produce (0-4)
    :type trace: C{int}

    :param randomize: whether the training data should be a random subset of the corpus
    :type randomize: C{bool}

    :param ruleformat: rule output format, one of "str", "repr", "verbose"
    :type ruleformat: C{str}

    :param incremental_stats: if true, will tag incrementally and collect stats for each rule (rather slow)
    :type incremental_stats: C{bool}

    :param template_stats: if true, will print per-template statistics collected in training and (optionally) testing
    :type template_stats: C{bool}

    :param error_output: the file where errors will be saved
    :type error_output: C{string}

    :param serialize_output: the file where the learned tbl tagger will be saved
    :type serialize_output: C{string}

    :param learning_curve_output: filename of plot of learning curve(s) (train and also test, if available)
    :type learning_curve_output: C{string}

    :param learning_curve_take: how many rules plotted
    :type learning_curve_take: C{int}

    :param baseline_backoff_tagger: the backoff tagger used by the baseline tagger
    :type baseline_backoff_tagger: tagger

    :param separate_baseline_data: use a fraction of the training data exclusively for training baseline
    :type separate_baseline_data: C{bool}

    :param cache_baseline_tagger: cache baseline tagger to this file (only interesting as a temporary workaround to get
                                  deterministic output from the baseline unigram tagger between python versions)
    :type cache_baseline_tagger: C{string}


    Note on separate_baseline_data: if False, reuse training data both for baseline and rule learner. This
    is fast and fine for a demo, but is likely to generalize worse on unseen data.
    Also cannot be sensibly used for learning curves on training data (the baseline will be artificially high).
    """

    # defaults
    baseline_backoff_tagger = baseline_backoff_tagger or REGEXP_TAGGER
    if templates is None:
        from nltk.tag.brill import describe_template_sets, brill24
        # some pre-built template sets taken from typical systems or publications are
        # available. Print a list with describe_template_sets()
        # for instance:
        templates = brill24()
    (training_data, baseline_data, gold_data, testing_data) = \
       _demo_prepare_data(tagged_data, train, num_sents, randomize, separate_baseline_data)

    # creating (or reloading from cache) a baseline tagger (unigram tagger)
    # this is just a mechanism for getting deterministic output from the baseline between
    # python versions
    if cache_baseline_tagger:
        if not os.path.exists(cache_baseline_tagger):
            baseline_tagger = UnigramTagger(baseline_data, backoff=baseline_backoff_tagger)
            with open(cache_baseline_tagger, 'wb') as print_rules:
                pickle.dump(baseline_tagger, print_rules)
            print("Trained baseline tagger, pickled it to {0}".format(cache_baseline_tagger))
        with open(cache_baseline_tagger, "r") as print_rules:
            baseline_tagger= pickle.load(print_rules)
            print("Reloaded pickled tagger from {0}".format(cache_baseline_tagger))
    else:
        baseline_tagger = UnigramTagger(baseline_data, backoff=baseline_backoff_tagger)
        print("Trained baseline tagger")
    if gold_data:
        print("    Accuracy on test set: {0:0.4f}".format(baseline_tagger.evaluate(gold_data)))

    # creating a Brill tagger
    tbrill = time.time()
    trainer = BrillTaggerTrainer(baseline_tagger, templates, trace, ruleformat=ruleformat)
    print("Training tbl tagger...")
    brill_tagger = trainer.train(training_data, max_rules, min_score, min_acc)
    print("Trained tbl tagger in {0:0.2f} seconds".format(time.time() - tbrill))
    if gold_data:
        print("    Accuracy on test set: %.4f" % brill_tagger.evaluate(gold_data))

    # printing the learned rules, if learned silently
    if trace == 1:
        print("\nLearned rules: ")
        for (ruleno, rule) in enumerate(brill_tagger.rules(),1):
            print("{0:4d} {1:s}".format(ruleno, rule.format(ruleformat)))


    # printing template statistics (optionally including comparison with the training data)
    # note: if not separate_baseline_data, then baseline accuracy will be artificially high
    if incremental_stats:
        print("Incrementally tagging the test data, collecting individual rule statistics")
        (taggedtest, teststats) = brill_tagger.batch_tag_incremental(testing_data, gold_data)
        print("    Rule statistics collected")
        if not separate_baseline_data:
            print("WARNING: training statistics requested without separate_baseline_data=True; "
                  "the baseline will be artificially high")
        trainstats = brill_tagger.train_stats()
        if template_stats:
            brill_tagger.print_template_statistics(teststats)
        if learning_curve_output:
            _demo_plot(learning_curve_output, teststats, trainstats, take=learning_curve_take)
            print("Wrote plot of learning curve to {0}".format(learning_curve_output))
    else:
        print("Tagging the test data")
        taggedtest = brill_tagger.batch_tag(testing_data)
        if template_stats:
            brill_tagger.print_template_statistics()

    # writing error analysis to file
    if error_output is not None:
        with open(error_output, 'w') as f:
            f.write('Errors for Brill Tagger %r\n\n' % serialize_output)
            for e in error_list(gold_data, taggedtest):
                f.write(e + '\n')
        print("Wrote tagger errors including context to {0}".format(error_output))

    # serializing the tagger to a pickle file and reloading (just to see it works)
    if serialize_output is not None:
        taggedtest = brill_tagger.batch_tag(testing_data)
        with open(serialize_output, 'wb') as print_rules:
            pickle.dump(brill_tagger, print_rules)
        print("Wrote pickled tagger to {0}".format(serialize_output))
        with open(serialize_output, "rb") as print_rules:
            brill_tagger_reloaded = pickle.load(print_rules)
        print("Reloaded pickled tagger from {0}".format(serialize_output))
        taggedtest_reloaded = brill_tagger_reloaded.batch_tag(testing_data)
        if taggedtest == taggedtest_reloaded:
            print("Reloaded tagger tried on test set, results identical")
        else:
            print("PROBLEM: Reloaded tagger gave different results on test set")
Пример #52
0
import re
import nltk
import pickle

# Importing lemmatizer
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer
lemmatizer = WordNetLemmatizer()

# Importing tagger
from nltk.tag import UnigramTagger
from nltk.corpus import treebank
train_sents = treebank.tagged_sents()
tagger = UnigramTagger(train_sents)

# Importing replacers
from replacers import RegexReplacer
from replacers import AntonymReplacer
replacer = RegexReplacer()
from nltk.tokenize import RegexpTokenizer
tokenizer = RegexpTokenizer(r"[\w']+")
from random import shuffle


# Importing chunkers
import chunkers
from nltk.corpus import treebank_chunk
chunker = chunkers.TagChunker(treebank_chunk.chunked_sents())


max_key = 100
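
# Pipeline sketch (an assumption about how these pieces combine; RegexReplacer
# is assumed to expose replace() and the custom TagChunker the standard
# ChunkParserI parse() method):
#     sentence = "I can't believe it isn't working"
#     expanded = replacer.replace(sentence)   # expand contractions
#     tokens = tokenizer.tokenize(expanded)
#     tagged = tagger.tag(tokens)
#     print(chunker.parse(tagged))            # chunk tree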
Пример #53
0
patterns = [
    # (earlier patterns elided in the original example)
    (r'.*s$', 'NNS'),                  # plural nouns
    (r'^-?[0-9]+(\.[0-9]+)?$', 'CD'),  # cardinal numbers
    (r'.*', 'NN')                      # nouns (default)
]
rt = RegexpTagger(patterns)

print(rt.evaluate(test_data))
print(rt.tag(tokens))


## N gram taggers
from nltk.tag import UnigramTagger
from nltk.tag import BigramTagger
from nltk.tag import TrigramTagger

ut = UnigramTagger(train_data)
bt = BigramTagger(train_data)
tt = TrigramTagger(train_data)

print(ut.evaluate(test_data))
print(ut.tag(tokens))

print(bt.evaluate(test_data))
print(bt.tag(tokens))

print(tt.evaluate(test_data))
print(tt.tag(tokens))

def combined_tagger(train_data, taggers, backoff=None):
    # chain the tagger classes so each one backs off to the previous
    for tagger in taggers:
        backoff = tagger(train_data, backoff=backoff)
    return backoff
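
# Usage sketch (assumption: train_data/test_data are the splits used above;
# the classes are applied in order, so the trigram tagger ends up on top):
ct = combined_tagger(train_data,
                     taggers=[UnigramTagger, BigramTagger, TrigramTagger],
                     backoff=rt)
print(ct.evaluate(test_data))
print(ct.tag(tokens))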
Пример #54
0
# nouns only
nouns_bow = cleaned_bow
for word in cleaned_bow.columns:
    # pos_tag expects a list of tokens; list(word) would split the word into characters
    pos = pos_tag([word])[0][1]
    if pos != 'NN':
        nouns_bow = nouns_bow.drop(word, axis=1)

topic_words(NMF_vars(10, nouns_bow)[0], nouns_bow)

topic_words(
    NMF_vars(5, nouns_bow.drop('girl', axis=1))[0],
    nouns_bow.drop('girl', axis=1))

# try a different tagger
nouns_bow_2 = cleaned_bow
tagger = UnigramTagger(brown.tagged_sents())
for word in cleaned_bow.columns:
    pos = tagger.tag([word])[0][1]
    if pos != 'NN':
        nouns_bow_2 = nouns_bow_2.drop(word, axis=1)
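
# Why the tagger choice matters (illustrative note, not from the original):
# nltk.pos_tag uses a sentence-context model, so isolated single words can get
# surprising tags, while the Brown-trained UnigramTagger simply returns each
# word's most frequent Brown tag -- or None for unseen words, which this
# filter then drops along with the non-nouns.
#     pos_tag(['increase'])[0][1]     # context model's guess for the lone word
#     tagger.tag(['increase'])[0][1]  # most frequent Brown tag, e.g. 'NN'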

for num in range(2, 6):
    topic_words(NMF_vars(num, nouns_bow_2)[0], nouns_bow_2)

topic_words(NMF_vars(10, nouns_bow_2)[0], nouns_bow_2)

# remove the word 'total'
topic_words(
    NMF_vars(10, nouns_bow_2.drop('total', axis=1))[0],
    nouns_bow_2.drop('total', axis=1))
Пример #55
0
 def __init__(self, corpus, isPl):
     if isPl:
         # Polish input: use the project's custom FirstTagger
         self.tagger = FirstTagger(corpus)
     else:
         # otherwise back off to a unigram tagger that defaults unknown words to 'NN'
         dtag = DefaultTagger("NN")
         self.__utag = UnigramTagger(corpus.tagged_sents(), backoff=dtag)
Пример #56
0
import nltk
import json

from nltk.corpus import brown
from nltk.tag import UnigramTagger
tagger = UnigramTagger(brown.tagged_sents(tagset='universal'))
sent = ['Mitchell', 'decried', 'the', 'high', 'rate', 'of', 'unemployment']
for word, tag in tagger.tag(sent):
	if tag == "VERB":
		print(word, '->', tag)


verbs_tagged = open("../assets/inputText/verbs_tagged_questions.txt", 'w+')
with open("../assets/inputText/all_questions.txt", 'r') as all_lines:
	for line in all_lines:
		splitLine = line.split(' ')
		for word, tag in tagger.tag(splitLine):
			if tag == "VERB":
				verbs_tagged.write(word + "\n")
				#verbs_tagged.write(word + " \"" + line[:-1] + "\"\n")
verbs_tagged.close()

Пример #57
0
import string
'''import replacer
from replacer import RegexpReplacer
from replacer import RepeatReplacer'''
import linecache
import matplotlib.pyplot as plt
# `web` is used below but was never imported; firefox.txt lives in nltk's webtext corpus
from nltk.corpus import webtext as web
'''
Train Tagger
'''
from nltk.tag import DefaultTagger
from nltk.tag import UnigramTagger
from nltk.tag import BigramTagger
from nltk.corpus import treebank
train = treebank.tagged_sents()[:10000]
t0 = DefaultTagger('NN')
t1 = UnigramTagger(train, backoff=t0)
t2 = BigramTagger(train, backoff=t1)
'''
Initialize
'''
my_corp = web.sents(fileids='firefox.txt')
sent_count = 0
ques_count = 0
All_count = 1
NN_count = 0
NNS_count = 0
NNP_count = 0
VB_count = 0
VBN_count = 0
VBG_count = 0
VBD_count = 0
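
# Sketch of the counting loop this setup suggests (an assumption -- the original
# example is truncated here; t2 is the bigram tagger with unigram/default backoff):
#     for sent in my_corp:
#         sent_count += 1
#         for word, tag in t2.tag(sent):
#             All_count += 1
#             if tag == 'NN':
#                 NN_count += 1
#             elif tag == 'NNS':
#                 NNS_count += 1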
Пример #58
0
        # ...search each of the tree's children for the target phrase and
        # concatenate the result if a match is found
        for child in myTree:
                if type(child) is Tree:
                        list_of_phrases = ExtractPhrases(child, phrase)
                        if len(list_of_phrases) > 0:
                                myPhrases.extend(list_of_phrases)
        # Return the list of found phrases
        return myPhrases

# Create the default tagger so that unknown words are treated as nouns (N)
etiqPadrao = DefaultTagger('N')
# Take the training set from mac_morpho's tagged_sents()
sentencas_treinadoras = mac_morpho.tagged_sents()[0:15000]
# Create the UnigramTagger with the default tagger as backoff and train it on the mac_morpho tagged sentences
etiq = UnigramTagger(sentencas_treinadoras, backoff=etiqPadrao)

coment = str(input("Enter the text: "))
if coment == "default":
        coment = open("default.txt", "r").read().replace("\n", " ")
# The text is converted into tokens
tokens = nltk.word_tokenize(coment.lower())
# Each token of the text is tagged
tags = etiq.tag(tokens)

# Create the regular-expression parser containing the target patterns
analiseGramatical = RegexpParser(r"""
        PADRAO7: {<N><ADJ>}
        PADRAO1: {<ADJ><N>(<PREP>?<N>)*}
        PADRAO2: {<ADV><ADV>?<ADJ>(<N>(<PREP>?<N>)*)?}
        PADRAO3: {<N>(<PREP>?<N>)*(<ADJ>)<ADV><ADV>?}