Example #1
def no_backoff_taggers(test, train, corpus='floresta'):
    default_tagger = default_tagger_corpus(corpus)

    info('training {} taggers without backoff'.format(corpus))
    info('this may take a while...\n')

    info(default_tagger)
    default_score = default_tagger.evaluate(test)
    print('accuracy score: {}\n'.format(default_score))

    # unigram tagger
    uni_tagger = UnigramTagger(train)
    # bigram tagger
    bi_tagger = BigramTagger(train)
    # trigram tagger
    tri_tagger = TrigramTagger(train)

    info(uni_tagger)
    uni_score = uni_tagger.evaluate(test)
    print('accuracy score: {}\n'.format(uni_score))

    info(bi_tagger)
    bi_score = bi_tagger.evaluate(test)
    print('accuracy score: {}\n'.format(bi_score))

    info(tri_tagger)
    tri_score = tri_tagger.evaluate(test)
    print('accuracy score: {}\n'.format(tri_score))
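A minimal driver for the function above (a hedged sketch: `default_tagger_corpus` is not shown in the original, so a plausible stand-in is included, and `info` is assumed to be a print/logging helper):

from nltk import DefaultTagger, FreqDist
from nltk.corpus import floresta

def default_tagger_corpus(corpus='floresta'):
    # Assumed behavior: back off to the corpus's single most frequent tag.
    tags = [tag for sent in floresta.tagged_sents() for (_, tag) in sent]
    return DefaultTagger(FreqDist(tags).max())

sents = list(floresta.tagged_sents())
split = int(len(sents) * 0.9)
no_backoff_taggers(test=sents[split:], train=sents[:split])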
Example #2
def get_lookup_tagger_accuracy(test_set, lookup_tagger_basis, corpus):
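    # `lookup_tagger_basis` is assumed to be tagged sentences, so each `word`
    # below is a (token, tag) pair and word[0] is the raw token.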
    words = [word for sent in lookup_tagger_basis for word in sent]
    fd = FreqDist(words)
    cfd = ConditionalFreqDist(corpus.tagged_words())
    most_freq_words = fd.most_common(200)
    likely_tags = dict(
        (word[0], cfd[word[0]].max()) for (word, _) in most_freq_words)
    baseline_tagger = UnigramTagger(model=likely_tags)
    result = baseline_tagger.evaluate(test_set)
    return result
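A hedged usage sketch for the helper above, taking both the lookup basis and the held-out test set from Brown news sentences:

from nltk.corpus import brown

sents = brown.tagged_sents(categories='news')
print(get_lookup_tagger_accuracy(
    test_set=sents[-500:], lookup_tagger_basis=sents[:-500], corpus=brown))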
Example #3
    def pos_tag(self):
        tokenize_obj = NLTKTokenize(self.options)
        res = tokenize_obj.tokenize()
        tokens = res['result']
        tags = []

        # Performs Bigram / Unigram / Regex Tagging
        if self.options.get('tagger') in ['unigram', 'bigram', 'regex']:
            trainer = self.options['train'] if self.options.get(
                'train') in TRAINERS else DEFAULT_TRAIN

            train = brown.tagged_sents(categories=trainer)

            # Create your custom regex tagging pattern here
            regex_tag = RegexpTagger([(r'^[-\:]?[0-9]+(\.[0-9]+)?$', 'CD'),
                                      (r'.*able$', 'JJ'),
                                      (r'^[A-Z].*$', 'NNP'), (r'.*ly$', 'RB'),
                                      (r'.*s$', 'NNS'), (r'.*ing$', 'VBG'),
                                      (r'.*ed$', 'VBD'), (r'.*', 'NN')])

            current = os.path.dirname(os.path.abspath(__file__))

            # Unigram tag training data load / dump pickle
            pkl_name = current + '/trained/unigram_' + trainer + '.pkl'
            if os.path.isfile(pkl_name):
                with open(pkl_name, 'rb') as pkl:
                    unigram_tag = load(pkl)
            else:
                unigram_tag = UnigramTagger(train, backoff=regex_tag)
                with open(pkl_name, 'wb') as pkl:
                    dump(unigram_tag, pkl, -1)

            # Bigram tag training data load / dump pickle
            if self.options['tagger'] == 'bigram':
                pkl_name = current + '/trained/bigram_' + trainer + '.pkl'
                if os.path.isfile(pkl_name):
                    with open(pkl_name, 'rb') as pkl:
                        bigram_tag = load(pkl)
                else:
                    bigram_tag = BigramTagger(train, backoff=unigram_tag)
                    with open(pkl_name, 'wb') as pkl:
                        dump(bigram_tag, pkl, -1)
                tags = bigram_tag.tag(tokens)  # Bigram tagging performed here
            elif self.options['tagger'] == 'unigram':
                tags = unigram_tag.tag(tokens)  # Unigram tagging performed here
            else:
                tags = regex_tag.tag(tokens)  # Regex tagging performed here

        # Performs default pos_tag
        elif self.options.get('tagger', DEFAULT_TAGGER) == 'pos':
            tags = pos_tag(tokens)

        return self._dump(tags)
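The pickle-based caching above pays the Brown-corpus training cost once per `trainer` category: later calls simply unpickle the tagger from `trained/`.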
Example #4
def backoff_taggers(test, train, save, corpus='floresta'):
    default_tagger = default_tagger_corpus(corpus)
    info('training {} taggers with backoff'.format(corpus))
    info('this may take a while...\n')

    info(default_tagger)
    default_score = default_tagger.evaluate(test)
    print('accuracy score: {}\n'.format(default_score))

    # UNIGRAM TAGGER WITH BACKOFF
    uni_tagger_backoff = UnigramTagger(train, backoff=default_tagger)

    # BIGRAM TAGGER WITH BACKOFF
    bi_tagger_backoff = BigramTagger(train, backoff=uni_tagger_backoff)

    # TRIGRAM TAGGER WITH BACKOFF
    tri_tagger_backoff = TrigramTagger(train, backoff=bi_tagger_backoff)

    info(uni_tagger_backoff)
    uni_backoff_score = uni_tagger_backoff.evaluate(test)
    print('accuracy score: {}\n'.format(uni_backoff_score))

    info(bi_tagger_backoff)
    bi_backoff_score = bi_tagger_backoff.evaluate(test)
    print('accuracy score: {}\n'.format(bi_backoff_score))

    info(tri_tagger_backoff)
    tri_backoff_score = tri_tagger_backoff.evaluate(test)
    print('accuracy score: {}\n'.format(tri_backoff_score))

    if not save:
        return

    accuracy_dict = {
        'uni': uni_backoff_score,
        'bi': bi_backoff_score,
        'tri': tri_backoff_score,
    }

    # Save whichever backoff tagger scored best
    if uni_backoff_score == max(accuracy_dict.values()):
        tagger_file = '{}_unigram_tagger_backoff.pkl'.format(corpus)
        output = open(tagger_file, 'wb')
        dump(uni_tagger_backoff, output, -1)
    elif bi_backoff_score == max(accuracy_dict.values()):
        tagger_file = '{}_bigram_tagger_backoff.pkl'.format(corpus)
        output = open(tagger_file, 'wb')
        dump(bi_tagger_backoff, output, -1)
    else:
        tagger_file = '{}_trigram_tagger_backoff.pkl'.format(corpus)
        output = open(tagger_file, 'wb')
        dump(tri_tagger_backoff, output, -1)
    output.close()
    info('saving %s...\n', tagger_file)
Example #5
def find_combined_taggers_accuracy(train_set, test_set):
    # finding most used tag
    train_words = [word for sent in train_set for word in sent]
    train_set_tags = [tag for (word, tag) in train_words]
    most_frequent_tag = FreqDist(train_set_tags).max()
    default_tagger = DefaultTagger(most_frequent_tag)

    # default tagger
    default_tagger_result = default_tagger.evaluate(test_set)
    print("Default Tagger accuracy: ", default_tagger_result)

    # regex tagger
    patterns = [
        (r'.*ing$', 'VBG'),  # gerunds
        (r'.*ed$', 'VBD'),  # simple past
        (r'.*es$', 'VBZ'),  # 3rd singular present
        (r'.*ould$', 'MD'),  # modals
        (r'.*\'s$', 'NN$'),  # possessive nouns
        (r'.*s$', 'NNS'),  # plural nouns
        (r'^-?[0-9]+(\.[0-9]+)?$', 'CD'),  # cardinal numbers
        (r'.*', 'NN')  # nouns (default)
    ]
    regex_tagger = RegexpTagger(patterns)
    regex_tagger_result = regex_tagger.evaluate(test_set)
    print("Regex Tagger Accuracy: ", regex_tagger_result)

    # unigram tagger with default tagger as backoff
    unigram_tagger = UnigramTagger(train_set, backoff=default_tagger)
    unigram_tagger_result = unigram_tagger.evaluate(test_set)
    print("Unigram Tagger accuracy (Backoff = Default Tagger): ",
          unigram_tagger_result)

    # bigram tagger with different backoffs
    bigram_tagger = BigramTagger(train_set)
    bigram_tagger_backoff_unigram = BigramTagger(train_set,
                                                 backoff=unigram_tagger)
    bigram_tagger_backoff_regex = BigramTagger(train_set, backoff=regex_tagger)

    bigram_tagger_result = bigram_tagger.evaluate(test_set)
    bigram_tagger_backoff_regex_result = bigram_tagger_backoff_regex.evaluate(
        test_set)
    bigram_tagger_backoff_unigram_result = bigram_tagger_backoff_unigram.evaluate(
        test_set)

    print("Bigram Tagger Accuracy: ", bigram_tagger_result)
    print("Bigram Tagger Accuracy (Backoff = Regex Tagger): ",
          bigram_tagger_backoff_regex_result)
    print("Bigram Tagger Accuracy (Backoff = Unigram Tagger): ",
          bigram_tagger_backoff_unigram_result)
Example #6
    def __init__(self, mode, train_sents):
        if mode == TRIGRAM:
            self.tagger = UnigramTagger(train_sents)
            self.tagger = BigramTagger(train_sents, backoff=self.tagger)
            self.tagger = TrigramTagger(train_sents, backoff=self.tagger)
        elif mode == HDM:
            self.tagger = HiddenMarkovModelTagger.train(train_sents)
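The constructor above uses the standard NLTK backoff idiom (`TRIGRAM` and `HDM` are assumed module-level constants): each n-gram tagger defers to a simpler one when it lacks evidence for a context. A self-contained sketch of the same chain on the Penn Treebank sample:

from nltk import DefaultTagger, UnigramTagger, BigramTagger, TrigramTagger
from nltk.corpus import treebank

train = treebank.tagged_sents()[:3000]
test = treebank.tagged_sents()[3000:]

tagger = DefaultTagger('NN')          # last resort: call everything a noun
tagger = UnigramTagger(train, backoff=tagger)
tagger = BigramTagger(train, backoff=tagger)
tagger = TrigramTagger(train, backoff=tagger)
print(tagger.evaluate(test))          # accuracy(...) in newer NLTK releases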
Example #7
def train_tagger(corpus_name, corpus):
	"""
	Train the taggers and save them.
	
	Args:
		corpus_name: 	name of the corpus used to create the tagger
		corpus: 		corpus for creating the tagger
	"""
	
	# List of n-gram tagger names
	complete_names = [corpus_name + '_' + x for x in N_GRAM_NAMES]
	
	# Training UnigramTagger
	tagger1 = UnigramTagger(corpus)
	utilities.save_pickle(tagger1, complete_names[0], TAGGER_EXTENSION, TAGGER_PATH)
	print "UnigramTagger trained with", corpus_name
	
	# Training BigramTagger
	tagger2 = BigramTagger(corpus)
	utilities.save_pickle(tagger2, complete_names[1], TAGGER_EXTENSION, TAGGER_PATH)
	print "BigramTagger trained with", corpus_name
	
	# Training TrigramTagger
	tagger3 = TrigramTagger(corpus)
	utilities.save_pickle(tagger3, complete_names[2], TAGGER_EXTENSION, TAGGER_PATH)
	print "TrigramTagger trained with", corpus_name
Example #8
class Tagger(object):
    def __init__(self, cess_name="cess_esp"):
        """
            Tagger object.
            Allows to specify a cess.
        """
        cess = getattr(nltk.corpus, cess_name)
        self.wnl = WordNetLemmatizer()
        self.ut = UnigramTagger(cess.tagged_sents())

    def pos_tag(self, tokens, lemmatize=False):
        def clean_tag(tag):
            def get_type(tag):
                if tag[1]:
                    return tag[1][0].upper()
                return "X"
            if lemmatize:
                return (self.wnl.lemmatize(tag[0]), get_type(tag))
            return (tag[0], get_type(tag))

        if isinstance(tokens, str):
            tokens = tokens.split()

        return [clean_tag(a) for a in self.ut.tag(tokens)]

    def get_main_words(self, tokens, lemmatize=True, type_w=False):
        def cond(t):
            if type_w:
                for type_w_ in type_w:
                    if t[1].lower().startswith(type_w_.lower()):
                        return True
                return False
            return True

        return list(filter(cond, self.pos_tag(tokens, lemmatize=lemmatize)))
Example #9
def get_pos_tagger():
    from nltk.corpus import brown
    regexp_tagger = nltk.RegexpTagger([
        (r'^-?[0-9]+(\.[0-9]+)?$', 'CD'),  # cardinal numbers
        (r'(The|the|A|a|An|an)$', 'AT'),  # articles
        (r'.*able$', 'JJ'),  # adjectives
        (r'.*ness$', 'NN'),  # nouns formed from adjectives
        (r'.*ly$', 'RB'),  # adverbs
        (r'.*s$', 'NNS'),  # plural nouns
        (r'.*ing$', 'VBG'),  # gerunds
        (r'.*ed$', 'VBD'),  # past tense verbs
        (r'.*', 'NN')  # nouns (default)
    ])
    brown_train = brown.tagged_sents()
    unigram_tagger = UnigramTagger(brown_train, backoff=regexp_tagger)
    bigram_tagger = BigramTagger(brown_train, backoff=unigram_tagger)
    trigram_tagger = TrigramTagger(brown_train, backoff=bigram_tagger)

    # Override particular words
    main_tagger = nltk.RegexpTagger(
        [(r'(A|a|An|an)$', 'ex_quant'),
         (r'(Every|every|All|all)$', 'univ_quant')],
        backoff=trigram_tagger)

    return main_tagger
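A quick check of the override layer (hypothetical input): quantifier words are caught by the top-level RegexpTagger, and everything else falls through to the trained backoff chain.

tagger = get_pos_tagger()
print(tagger.tag(['Every', 'dog', 'barks']))
# 'Every' gets 'univ_quant' from the override; the remaining tags come
# from the trigram/bigram/unigram chain.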
Example #10
 def __init__(self, cess_name="cess_esp"):
     """
         Tagger object.
         Allows to specify a cess.
     """
     cess = getattr(nltk.corpus, cess_name)
     self.wnl = WordNetLemmatizer()
     self.ut = UnigramTagger(cess.tagged_sents())
Example #11
def lookupTagger(r, c):  # r = range, c = corpus
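    # brownTW / chatTW are assumed to be tagged-word lists defined elsewhere,
    # e.g. brown.tagged_words() and nps_chat.tagged_words().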
    if (c == "brown"):
        fDist = ConditionalFreqDist(brownTW)
        freqDist = FreqDist(brown.words())
        wordsR = freqDist.most_common(r)
        likely_tags = dict((word, fDist[word].max()) for (word, _) in wordsR)
        baseline_tagger = UnigramTagger(model=likely_tags,
                                        backoff=nltk.DefaultTagger("NN"))
        return baseline_tagger
    if (c == "chat"):
        fDist = ConditionalFreqDist(chatTW)
        freqDist = FreqDist(chat.words())
        wordsR = freqDist.most_common(r)
        likely_tags = dict((word, fDist[word].max()) for (word, _) in wordsR)
        baseline_tagger = UnigramTagger(model=likely_tags,
                                        backoff=nltk.DefaultTagger("NN"))
        return baseline_tagger
Example #12
    def _model_definition(self) -> UnigramTagger:
        """Function to define and compile the model.

        Returns:
          Model object.
        """
        t0 = DefaultTagger('NOUN')
        return UnigramTagger([[(".", "PUNCT")]], backoff=t0)
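The single dummy sentence above only instantiates the tagger so the wrapper holds a non-None model; the real fit happens in the `train` method of Example #24, which rebuilds the UnigramTagger from `corpus.train.sentences`.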
Example #13
def TrainTaggers(training, testing):
    global results
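    # `default` is assumed to be a module-level DefaultTagger, and `results`
    # accumulates accuracy scores across calls.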
    Unigram = UnigramTagger(training, backoff=default)
    print('unigram trained')
    Bigram = BigramTagger(training, backoff=Unigram)
    print('bigram trained')
    Trigram = TrigramTagger(training, backoff=Bigram)
    print('trigram trained')
    results += [Trigram.evaluate(testing)]
Example #14
    def __init__(self, train_sents, to_detect_list, n_gram=1):
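        # Re-pair each (word, tag, class) triple as (tag, class): the n-gram
        # tagger then learns the third column from POS context (the usual
        # tagger-as-chunker trick).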
        train_data = [[(t, c) for w, t, c in sent] for sent in train_sents]

        self.tagger = UnigramTagger(train_data)
        if n_gram > 1:
            self.tagger = BigramTagger(train_data, backoff=self.tagger)
        if n_gram > 2:
            self.tagger = TrigramTagger(train_data, backoff=self.tagger)
        self.to_detect_list = to_detect_list
Example #15
    def train(self, model_path):
        corpus = [[(token.lower(), tag) for token, tag in sent]
                  for sent in CORPUS]

        unigram_tagger = UnigramTagger(corpus, backoff=DefaultTagger('UNK'))
        bigram_tagger = BigramTagger(corpus, backoff=unigram_tagger)

        with open(model_path, "wb") as model_file:
            pickle.dump(bigram_tagger, model_file)
Example #16
def lookup_tag(num_sampling):
    raw = 'I am applying for AIT because I can be with my parents here and I am already granted a scholarship'
    #Get the frequency distribution of the words
    fd = FreqDist(brown.words(categories='news'))
    #Get the most frequent tag of each word in the corpus
    cfd = ConditionalFreqDist(brown.tagged_words(categories='news'))
    #Get the num_sampling most common words
    most_freq_words = fd.most_common(num_sampling)
    #Create a dictionary of (word, most_likely_tag) pairs
    likely_tags = dict(
        (word, cfd[word].max()) for (word, _) in most_freq_words)
    #A lookup tagger assigns each word its most frequent tag, with no
    #context needed ("unigram" in the same sense as in the n-gram topic)
    lookup_tagger = UnigramTagger(model=likely_tags)
    tagged = lookup_tagger.tag(word_tokenize(raw))
    print(tagged)
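    # brown_tagged_sents is assumed to be defined at module level,
    # e.g. brown.tagged_sents(categories='news')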
    score = lookup_tagger.evaluate(brown_tagged_sents)
    print(score)
Example #17
    def __init__(self, train_sents):
        """Show parameters.

        train_sents: trained sentences which have already been tagged.
        using Brown, conll2000, and TreeBank corpus.
        """
        t0 = DefaultTagger('NN')
        t1 = UnigramTagger(train_sents, backoff=t0)
        t2 = BigramTagger(train_sents, backoff=t1)
        self.tagger = TrigramTagger(train_sents, backoff=t2)
Example #18
def train_tagger(corpus_name, corpus):
    """ Function to train tagger. """
    # Training UnigramTagger.
    uni_tag = UnigramTagger(corpus)
    save_tagger('{}_unigram.tagger'.format(corpus_name), uni_tag)
    # Training BigramTagger.
    bi_tag = BigramTagger(corpus, backoff=uni_tag)
    save_tagger('{}_bigram.tagger'.format(corpus_name), bi_tag)
    _msg = str("Tagger trained with {} using "
               "UnigramTagger and BigramTagger.").format(corpus_name)
    print(_msg, file=sys.stderr)
Example #19
def create_tagger(sents, patterns=PATTERNS, maxngram=4):
    '''Train a backoff-tagger chain on a corpus of sentences.'''
    
    train = sents
    def_tagger = DefaultTagger('NN')
    re_tagger = RegexpTagger(patterns, backoff=def_tagger)
    uni_tagger = UnigramTagger(train, backoff=re_tagger) 
    bi_tagger = BigramTagger(train, backoff=uni_tagger) 
    tri_tagger = TrigramTagger(train, backoff=bi_tagger) 
    ngram_tagger = NgramTagger(maxngram, train, backoff=tri_tagger)
    return ngram_tagger
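A hedged usage sketch; `PATTERNS` is assumed to be a regex/tag list like the ones in the other examples on this page:

from nltk.corpus import brown

tagger = create_tagger(brown.tagged_sents(categories='news'))
print(tagger.tag('The quick brown fox jumps'.split()))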
Example #20
def get_tagger(type="StandfordPOSTagger"):
    if type == "Custom":
        brown_tagged_sents = brown.tagged_sents(categories='news',
                                                tagset='universal')
        t0 = DefaultTagger('NOUN')
        t1 = UnigramTagger(brown_tagged_sents, backoff=t0)
        t2 = BigramTagger(brown_tagged_sents, backoff=t1)
    else:
        t2 = StanfordPOSTagger(
            'data/./models/wsj-0-18-bidirectional-distsim.tagger',
            '3rdparty_libs/stanford-postagger.jar')

    return t2
Example #21
    def __init__(self, modelpath, candidates):
        self.modelpath = modelpath
        self.bus_counter = 0
        with open(modelpath + 'all_highest_probs_' + str(candidates) + '.json',
                  'r') as f:
            self.candidates = json.load(f)
        with open(
                modelpath +
                'inject_refcoco_refrnn_compositional_3_512_1/4eval_greedy.json',
                'r'
        ) as f:  # alternative: 'restoredmodel_refs_greedy.json'
            self.refs = json.load(f)
        self.words_that_are_names = list()
        with open("./noun_list_long.txt", 'r') as f:
            for row in f.readlines():
                self.words_that_are_names.append(row.strip())
        self.unigram_tagger = UnigramTagger(brown.tagged_sents())
        self.zero_shot_refs = defaultdict()
        self.non_noun_counter = 0
        self.baseline_top_1 = defaultdict()
        self.baseline_top_5 = defaultdict()
        self.baseline_top_10 = defaultdict()
Example #22
def task3(data, corpus):
    fd = FreqDist(corpus.words())
    cfd = ConditionalFreqDist(corpus.tagged_words())
    most_freq_words = sorted(list(fd.items()),
                             key=lambda x: x[1],
                             reverse=True)[:200]
    most_freq_words = list(map(lambda x: x[0], most_freq_words))
    likely_tags = dict((word, cfd[word].max()) for word in most_freq_words)
    lookup_tagger = UnigramTagger(model=likely_tags)
    for str in ["brown50", "brown90", "nps50", "nps90"]:
        tagger = CombinedTagger(train=data["train_" + str],
                                default=lookup_tagger,
                                name=str)
        test_tagger(tagger, data)
Example #23
def generateTagger():
    default_tagger = DefaultTagger('V')
    patterns = [
        (r'.*o$', 'NMS'),  # noun masculine singular
        (r'.*os$', 'NMP'),  # noun masculine plural
        (r'.*a$', 'NFS'),  # noun feminine singular
        (r'.*as$', 'NFP')  # noun feminine plural
    ]
    regexp_tagger = RegexpTagger(patterns, backoff=default_tagger)
    #train nltk.UnigramTagger using tagged sentences from cess_esp
    cess_tagged_sents = cess_esp.tagged_sents()
    combined_tagger = UnigramTagger(cess_tagged_sents, backoff=regexp_tagger)

    return combined_tagger
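A hedged usage sketch on a Spanish sentence (cess_esp uses EAGLES-style tags):

tagger = generateTagger()
print(tagger.tag('el gato come pescado'.split()))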
Example #24
    def train(self,
              corpus: Corpus,
              evaluate: bool = True,
              config: dict = None) -> Union[None, Dict[str, Dict[str, float]]]:
        """Train method.

        Args:
          corpus: Corpus to train model.
          evaluate: Flag to return evaluation of the model.
          config: Training config dict (not used for this model).

        Returns: 
          Model evaluation metrics.
        """
        if self.model is None:
            self.model = self._model_definition()

        self.model = UnigramTagger(corpus.train.sentences,
                                   backoff=DefaultTagger('NOUN'))

        if evaluate:
            return self.evaluate(corpus)
        return None
Example #25
	def __init__(self):
		if os.path.exists('tagger_spanish.pickle'):
			with open('tagger_spanish.pickle', 'rb') as file_obj:
				self.tagger = pickle.load(file_obj)
		else:
			print('tagger_spanish.pickle not found. Training tagger... may take a few minutes...')
			from nltk import UnigramTagger, BigramTagger
			from nltk.corpus import cess_esp
			sents = cess_esp.tagged_sents()
			unigram_tagger = UnigramTagger(sents)
			# The bigram tagger falls back to the unigram tagger
			# when it cannot tag a word from bigram context.
			bigram_tagger = BigramTagger(sents, backoff=unigram_tagger)
			self.tagger = bigram_tagger
			with open('tagger_spanish.pickle', 'wb') as file_obj:
				pickle.dump(self.tagger, file_obj)  # dump the trained tagger
Example #26
    def __init__(self, train=None, default=None, name=None):
        self.name = name
        # As found on page 199 of the nltk book
        regexps = [
            (r'.*ing$', 'VBG'),  # gerunds
            (r'.*ed$', 'VBD'),  # simple past
            (r'.*es$', 'VBZ'),  # 3rd singular present
            (r'.*ould$', 'MD'),  # modals
            (r'.*\'s$', 'NN$'),  # possessive nouns
            (r'.*s$', 'NNS'),  # plural nouns
            (r'^-?[0-9]+(\.[0-9]+)?$', 'CD'),  # cardinal numbers
        ]
        self.default = default
        self.regex = RegexpTagger(regexps, backoff=self.default)
        self.unigram = UnigramTagger(train=train, backoff=self.regex)
        self.bigram = BigramTagger(train=train, backoff=self.unigram)
Example #27
def trained_tagger():
    """Train a backoff trigram tagger, pickle it, and return it."""
    # Aggregate trained sentences for N-Gram Taggers
    train_sents = nltk.corpus.brown.tagged_sents()
    train_sents += nltk.corpus.conll2000.tagged_sents()
    train_sents += nltk.corpus.treebank.tagged_sents()

    t0 = DefaultTagger('NN')
    t1 = UnigramTagger(train_sents, backoff=t0)
    t2 = BigramTagger(train_sents, backoff=t1)
    trigram_tagger = TrigramTagger(train_sents, backoff=t2)

    pickle.dump(trigram_tagger, open(r'DataBase/trained_tagger.pkl', 'wb'))

    return trigram_tagger
Example #28
def ngram_tag_with_backoff():
    fd = FreqDist(brown.words(categories='news'))
    #Get the most frequent tag of each word in the corpus
    cfd = ConditionalFreqDist(brown.tagged_words(categories='news'))
    #Take the most common words (the large cutoff effectively keeps them all)
    most_freq_words = fd.most_common(1000000)
    #Create a dictionary of (word, most_likely_tag) pairs
    likely_tags = dict(
        (word, cfd[word].max()) for (word, _) in most_freq_words)
    #Lookup tagger: each word gets its most frequent tag, no context needed
    lookup_tagger = UnigramTagger(model=likely_tags)
    #Hold out the last 10% of sentences for evaluation;
    #brown_tagged_sents is assumed defined at module level
    train_len = int(len(brown_tagged_sents) * 0.9)
    bigram_tagger = BigramTagger(brown_tagged_sents[:train_len],
                                 backoff=lookup_tagger)
    score = bigram_tagger.evaluate(brown_tagged_sents[train_len:])
    print(score)
Example #29
def train_and_save_unigram_tagger():
    train_text = brown.tagged_sents()
    regexp_tagger = RegexpTagger(
                [(r'^-?[0-9]+(\.[0-9]+)?$', 'CD'),   # cardinal numbers
                 (r'(The|the|A|a|An|an)$', 'AT'),   # articles
                 (r'.*able$', 'JJ'),                # adjectives
                 (r'.*ness$', 'NN'),                # nouns formed from adjectives
                 (r'.*ly$', 'RB'),                  # adverbs
                 (r'.*s$', 'NNS'),                  # plural nouns
                 (r'.*ing$', 'VBG'),                # gerunds
                 (r'.*ed$', 'VBD'),                 # past tense verbs
                 (r'.*', 'NN')                      # nouns (default)
            ])

    unigram_tagger = UnigramTagger(train_text, backoff=regexp_tagger)

    output = open('../taggers/unigram_tagger.pkl', 'wb')
    dump(unigram_tagger, output, -1)
    output.close()
Example #30
def trained_tagger():
    """Returns a trained trigram tagger
    existing : set to True if already trained tagger has been pickled
    """

    if os.path.exists(os.path.join(os.getcwd(),
                                   r"DataBase/trained_tagger.pkl")):
        print("Trained Tagger File already Exists..")
        return

    # Aggregate trained sentences for N-Gram Taggers
    train_sents = nltk.corpus.brown.tagged_sents()
    train_sents += nltk.corpus.conll2000.tagged_sents()
    train_sents += nltk.corpus.treebank.tagged_sents()

    t0 = DefaultTagger('NN')
    t1 = UnigramTagger(train_sents, backoff=t0)
    t2 = BigramTagger(train_sents, backoff=t1)
    trigram_tagger = TrigramTagger(train_sents, backoff=t2)

    pickle.dump(trigram_tagger, open(r'DataBase/trained_tagger.pkl', 'wb'))
Example #31
    def __init__(self, train_sents, load=False):
        if load:
            print('Loading saved tagger...', end=' ')
            self.load()
            print('done.')
        else:
            time_start = time.time()

            print('Training the tagger...')
            tag_counts = Counter(t for s in train_sents for w, t in s)
            default_tag = tag_counts.most_common(1)[0][0]  # most frequent tag

            def_tgr = DefaultTagger(default_tag)
            af_tgr = AffixTagger(train_sents, affix_length=-3, backoff=def_tgr)
            uni_tgr = UnigramTagger(train_sents, backoff=af_tgr)
            bi_tgr = BigramTagger(train_sents, backoff=uni_tgr)
            tri_tgr = TrigramTagger(train_sents, backoff=bi_tgr)
            self.tgr = tri_tgr
            print('Done.')

            time_stop = time.time()
            print('Training time: {0:.2f}s'.format(time_stop - time_start))
Example #32
def prepare_toolset():
    toolset = {}
    patterns = [(r'^[\.1-9]+$', 'NUM'), (r'^[^a-zA-Z]+$', '.'),
                (r'^[^a-zA-Z]*[a-zA-Z]+[-\'][a-zA-Z]+[^a-zA-Z]*$', 'NOUN'),
                (r'^.*[a-zA-Z]+[^-a-zA-Z]+[a-zA-Z]+.*$', '.')]
    train_set = brown.tagged_sents(
        categories='learned', tagset='universal') + brown.tagged_sents(
            categories='news', tagset='universal') + brown.tagged_sents(
                categories='reviews', tagset='universal')
    utgr = UnigramTagger(train=train_set, backoff=DefaultTagger('NOUN'))  # universal tagset, so 'NOUN' rather than 'NN'
    btgr = BigramTagger(train=train_set, backoff=utgr)
    ttgr = TrigramTagger(train=train_set, backoff=btgr)
    toolset['tgr'] = RegexpTagger(regexps=patterns, backoff=ttgr)
    toolset['sw'] = stopwords.words('english')
    toolset['lr'] = WordNetLemmatizer()
    toolset['wntg'] = {
        'NOUN': wordnet.NOUN,
        'VERB': wordnet.VERB,
        'ADJ': wordnet.ADJ,
        'ADV': wordnet.ADV,
        'X': wordnet.NOUN
    }
    print('Tools Ready')
    return toolset
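A hedged usage sketch of the returned toolset: tag with the universal tagset, then lemmatize via the tag-to-WordNet-POS map (reusing the module's wordnet import):

toolset = prepare_toolset()
for word, tag in toolset['tgr'].tag(['The', 'mice', 'were', 'running']):
    pos = toolset['wntg'].get(tag, wordnet.NOUN)
    print(word, tag, toolset['lr'].lemmatize(word.lower(), pos))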
	"""
    return [u"%s/%s" % (t, p) for t, p in sent.pos() if not t in ["-LRB-", "-RRB-"]]


if __name__ == "__main__":
    if len(sys.argv) < 3:
        print "Usage:\n\t%s <corpus>" % sys.argv[0]
        sys.exit(-1)
        # Prepare corpus
    tagged_sents = build_tagged_sents(sys.argv[1:])
    random.shuffle(tagged_sents)
    tagged_train = tagged_sents[: len(tagged_sents) / 2]
    tagged_test = tagged_sents[len(tagged_sents) / 2 :]
    # Train unigram tagger
    print "Training unigram tagger..."
    unigram_tagger = UnigramTagger(tagged_train)
    print "\taccuracy: %f" % unigram_tagger.evaluate(tagged_test)
    # Train brill tagger
    print "Training Brill tagger..."
    templates = [
        # Context tag in a 1, 2 and 3 word window
        SymmetricProximateTokensTemplate(ProximateTagsRule, (1, 1)),
        SymmetricProximateTokensTemplate(ProximateTagsRule, (2, 2)),
        SymmetricProximateTokensTemplate(ProximateTagsRule, (1, 2)),
        SymmetricProximateTokensTemplate(ProximateTagsRule, (1, 3)),
        # Context word in a 1, 2 and 3 word window
        SymmetricProximateTokensTemplate(ProximateWordsRule, (1, 1)),
        SymmetricProximateTokensTemplate(ProximateWordsRule, (2, 2)),
        SymmetricProximateTokensTemplate(ProximateWordsRule, (1, 2)),
        SymmetricProximateTokensTemplate(ProximateWordsRule, (1, 3)),
        # Closest tag
Example #34
    # import sys
    # sys.exit(1)

    all_words = corpus.brown.tagged_sents(tagset='universal')
    # random.shuffle(all_words)  # we shuffle it so we don't get a specific category as the test set!
    ds_length = len(all_words)
    train = all_words[int(0.2 * ds_length):]
    dev = all_words[:int(0.1 * ds_length)]
    test = all_words[int(0.1 * ds_length):int(0.2 * ds_length)]

    from nltk import UnigramTagger, AffixTagger

    unigram = UnigramTagger(train)
    affix_ugram_backoff = AffixTagger(train, backoff=unigram)
    affix = AffixTagger(train)
    unigram_affix_backoff = UnigramTagger(train, backoff=affix)
    # print "testing"
    # print affix_ugram_backoff.evaluate(test)
    # print unigram_affix_backoff.evaluate(test)
    # cutoffs = [x*0.1 for x in range(20)]
    # for c in cutoffs:
    # tagger = EntropyVotingTagger(taggers, c)
    # print "Accuracy of entropy voting = ", tagger.evaluate(test)


    affix_tagger = EntropyAffixTagger(train)
    unigram_tagger = EntropyUnigramTagger(train)
    taggers = [unigram_tagger, affix_tagger]
    tagger = EntropyVotingTagger(taggers, max_entropy=80)

    from nltk.tag import untag
Example #35
def performance(wordList):
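    # Assumed module-level globals: `cfd` is a ConditionalFreqDist over tagged
    # words, `taggedSents` the tagged evaluation sentences, and `wordList`
    # holds ((token, tag), count) pairs from FreqDist.most_common().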
    tagger = dict((word[0], cfd[word[0]].max()) for (word, freq) in wordList if len(cfd[word[0]]))
    if not len(tagger):
        return 0
    baselineTagger = UnigramTagger(model=tagger, backoff=DefaultTagger("NN"))
    return baselineTagger.evaluate(taggedSents)
Example #36
from nltk import UnigramTagger
from nltk.corpus import treebank

from tag_util import word_tag_model

model = word_tag_model(treebank.words(), treebank.tagged_words())
tagger = UnigramTagger(model=model)

test_sents = treebank.tagged_sents()[3000:]
print(tagger.evaluate(test_sents))
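`word_tag_model` is imported from the book's tag_util helper module; a plausible reconstruction (an assumption, not the original source) maps the most frequent words to their most likely tags:

from nltk.probability import ConditionalFreqDist, FreqDist

def word_tag_model(words, tagged_words, limit=200):
    # Hypothetical reconstruction: frequent word -> its most likely tag.
    fd = FreqDist(words)
    cfd = ConditionalFreqDist(tagged_words)
    return dict((word, cfd[word].max()) for word, _ in fd.most_common(limit))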
Example #37
	def createModel(self):
		model_name = None
		try:
			unigrams = self.buildUnigrams()

			N = len(self.corpusSents)
			toTraining = round(self.training_portion * N)

			#logging.info("Total sentences: " + str(N))

			training = self.corpusSents[:toTraining]
			test = self.corpusSents[toTraining:]

			post_patterns = []

			# (the original decoded byte-string regexes here for Python 2)
			for regex, post in self.regex_list:
				post_patterns.append((regex, post))

			for regex, post in self.config.items('postaggers.regex'):
				post_patterns.append((regex, post))

			regexpTagger = RegexpTagger(post_patterns)
			unigramTagger = UnigramTagger(unigrams + training, backoff=regexpTagger)
			bigramTagger = BigramTagger(training, backoff=unigramTagger)
			trigramTagger = TrigramTagger(training, backoff=bigramTagger)
			NTagger = NgramTagger(self.max_ngrams, training, backoff=trigramTagger)

			print("Training sentences for n-gram taggers: " + str(len(training)))
			print("Training sentences for unigram taggers: " + str(len(unigrams)))
			print("Number of ADDITIONAL DICTIONARY words for the unigram tagger: " + str(len(unigrams)))
			print("Sentences for testing: " + str(len(test)))
			print("Regular expressions for the tagger:")

			for post_regex in post_patterns:
				print(post_regex)

			if self.training_portion != 1:
				score_ut = unigramTagger.evaluate(test)
				score_bt = bigramTagger.evaluate(test) - 0.002  # small manual penalty kept from the original
				score_tt = trigramTagger.evaluate(test)
				score_nt = NTagger.evaluate(test)

				scores = [score_ut, score_bt, score_tt, score_nt]
				tagger_names = ["uTagger", "biTagger", "triTagger", "NTagger"]
				taggers = [unigramTagger, bigramTagger, trigramTagger, NTagger]

				bestTagger_index = scores.index(max(scores))
				best_msg = max(scores), tagger_names[bestTagger_index]

			fname = self.taggers_path + tagger_names[bestTagger_index]
			if os.path.isfile(fname + self.tagger_extension_file):
				fname = fname + str(len(listdir(self.taggers_path))) + self.tagger_extension_file
			else:
				fname = self.taggers_path + tagger_names[bestTagger_index] + self.tagger_extension_file

			model = taggers[bestTagger_index]

			with open(fname, 'wb') as f:
				pickle.dump(model, f)

			print("Saving the tagger: " + fname)
			#logging.info("Saving the best tagger: " + fname)

			model_name = fname

		except Exception as e:
			print("ERROR IN POS TAGGER GENERATOR:", str(e))
			pdb.set_trace()
def treeSentenceToTuples(sent):
	"""
	:param sent: a Tree representing a sentence
	:type sent: nltk.tree.Tree
	"""
	return [u"%s/%s" % (t, p) for t, p in sent.pos() if t not in ["-LRB-", "-RRB-"]]

if __name__ == "__main__":
	if len(sys.argv) < 2:
		print("Usage:\n\t%s <corpus>" % sys.argv[0])
		sys.exit(-1)
	training = []
	testing = []
	lineIdx = 0
	for fname in sys.argv[1:]:
		fin = codecs.open(fname, "r", "utf-8")
		for line in fin:
			lineIdx += 1
			t = Tree.fromstring(line)  # Tree.parse() in older NLTK releases
			if lineIdx % 2 == 0:
				training.append(t.pos())
			else:
				testing.append(t.pos())
		fin.close()
	# Train tagger
	unigram_tagger = UnigramTagger(training)
	# Evaluate
	print("Accuracy: %f" % unigram_tagger.evaluate(testing))