def train_tagger(corpus_name, corpus): """ Train the taggers and saves them Args: corpus_name: name of the corpus used to create the tagger corpus: corpus for creating the tagger """ #List of n-gram taggers names complete_names = [corpus_name + '_' + x for x in N_GRAM_NAMES] # Training UnigramTagger tagger1 = UnigramTagger(corpus) utilities.save_pickle(tagger1, complete_names[0], TAGGER_EXTENSION, TAGGER_PATH) print "UnigramTagger trained with", corpus_name # Training BigramTagger tagger2 = BigramTagger(corpus) utilities.save_pickle(tagger2, complete_names[1], TAGGER_EXTENSION, TAGGER_PATH) print "BigramTagger trained with", corpus_name # Training TrigramTagger tagger3 = TrigramTagger(corpus) utilities.save_pickle(tagger3, complete_names[2], TAGGER_EXTENSION, TAGGER_PATH) print "TrigramTagger trained with", corpus_name
def no_backoff_taggers(test, train, corpus='floresta'):
    """Train and score default, unigram, bigram and trigram taggers
    trained independently (no backoff chaining between them)."""
    default_tagger = default_tagger_corpus(corpus)
    info('training {} taggers without backoff'.format(corpus))
    info('this may take a while...\n')
    info(default_tagger)
    print('accuracy score: {}\n'.format(default_tagger.evaluate(test)))
    # All three n-gram taggers are trained up front, then scored in order.
    for tagger in (UnigramTagger(train), BigramTagger(train),
                   TrigramTagger(train)):
        info(tagger)
        print('accuracy score: {}\n'.format(tagger.evaluate(test)))
def __init__(self, mode, train_sents):
    """Build a tagger for *train_sents*.

    Args:
        mode: TRIGRAM for a unigram/bigram/trigram backoff chain,
            HDM for a hidden-Markov-model tagger.
        train_sents: tagged sentences to train on.
    """
    if mode == TRIGRAM:
        # Backoff chain: trigram -> bigram -> unigram.
        self.tagger = UnigramTagger(train_sents)
        self.tagger = BigramTagger(train_sents, backoff=self.tagger)
        self.tagger = TrigramTagger(train_sents, backoff=self.tagger)
    elif mode == HDM:
        # Bug fix: the original wrote "elif HDM:", testing the constant's
        # truthiness instead of comparing it against *mode*.
        self.tagger = HiddenMarkovModelTagger.train(train_sents)
def get_pos_tagger():
    """Build a backoff POS tagger chain trained on the Brown corpus.

    Chain (least to most specific): regexp fallback <- unigram <- bigram
    <- trigram, with a final regexp layer overriding quantifier words.

    Returns:
        An nltk tagger suitable for general English POS tagging.
    """
    from nltk.corpus import brown
    regexp_tagger = nltk.RegexpTagger([
        # Bug fix: decimal point escaped — the original r'(.[0-9]+)?'
        # matched any character before the digits (e.g. "12x34").
        (r'^-?[0-9]+(\.[0-9]+)?$', 'CD'),  # cardinal numbers
        (r'(The|the|A|a|An|an)$', 'AT'),   # articles
        (r'.*able$', 'JJ'),                # adjectives
        (r'.*ness$', 'NN'),                # nouns formed from adjectives
        (r'.*ly$', 'RB'),                  # adverbs
        (r'.*s$', 'NNS'),                  # plural nouns
        (r'.*ing$', 'VBG'),                # gerunds
        (r'.*ed$', 'VBD'),                 # past tense verbs
        (r'.*', 'NN')                      # nouns (default)
    ])
    brown_train = brown.tagged_sents()
    unigram_tagger = UnigramTagger(brown_train, backoff=regexp_tagger)
    bigram_tagger = BigramTagger(brown_train, backoff=unigram_tagger)
    trigram_tagger = TrigramTagger(brown_train, backoff=bigram_tagger)

    # Override particular quantifier words regardless of the trained chain.
    main_tagger = nltk.RegexpTagger(
        [(r'(A|a|An|an)$', 'ex_quant'),
         (r'(Every|every|All|all)$', 'univ_quant')],
        backoff=trigram_tagger)

    return main_tagger
def _model_definition(self) -> UnigramTagger:
    """Define and compile the model.

    Returns:
        A unigram tagger seeded with a single punctuation example, backed
        off to a DefaultTagger that labels everything 'NOUN'.
    """
    fallback = DefaultTagger('NOUN')
    seed_corpus = [[(".", "PUNCT")]]
    return UnigramTagger(seed_corpus, backoff=fallback)
def lookupTagger(r, c):  # r = range, c = corpus
    """Return a lookup tagger over the *r* most frequent words of corpus *c*.

    Words outside the lookup model fall back to a default 'NN' tagger.
    Returns None for an unknown corpus name (as the original did implicitly).
    """
    if c == "brown":
        cond_dist = ConditionalFreqDist(brownTW)
        word_dist = FreqDist(brown.words())
    elif c == "chat":
        cond_dist = ConditionalFreqDist(chatTW)
        word_dist = FreqDist(chat.words())
    else:
        return None
    top_words = word_dist.most_common(r)
    likely_tags = {word: cond_dist[word].max() for word, _ in top_words}
    return UnigramTagger(model=likely_tags,
                         backoff=nltk.DefaultTagger("NN"))
def TrainTaggers(training, testing):
    """Train a unigram -> bigram -> trigram backoff chain and append its
    test-set accuracy to the global *results* list."""
    global results
    unigram = UnigramTagger(training, backoff=default)
    print('unigram trained')
    bigram = BigramTagger(training, backoff=unigram)
    print('bigram trained')
    trigram = TrigramTagger(training, backoff=bigram)
    print('trigram trained')
    # Only the full trigram chain is scored.
    results += [trigram.evaluate(testing)]
def __init__(self, train_sents, to_detect_list, n_gram=1):
    """Train an n-gram tagger chain over (tag, class) pairs.

    train_sents: sentences of (word, tag, class) triples; the word is
    dropped so the tagger predicts class labels from tag sequences.
    n_gram: highest n-gram order to chain in (1, 2 or 3).
    """
    train_data = [[(tag, cls) for _word, tag, cls in sent]
                  for sent in train_sents]
    self.tagger = UnigramTagger(train_data)
    # Stack higher-order taggers on top, each backing off to the previous.
    for order, tagger_class in ((2, BigramTagger), (3, TrigramTagger)):
        if n_gram >= order:
            self.tagger = tagger_class(train_data, backoff=self.tagger)
    self.to_detect_list = to_detect_list
def train(self, model_path):
    """Train a bigram tagger on the lowercased CORPUS and pickle it.

    Chain: bigram -> unigram -> DefaultTagger('UNK').
    """
    lowercased = [[(token.lower(), tag) for token, tag in sentence]
                  for sentence in CORPUS]
    unigram = UnigramTagger(lowercased, backoff=DefaultTagger('UNK'))
    bigram = BigramTagger(lowercased, backoff=unigram)
    with open(model_path, "wb") as model_file:
        pickle.dump(bigram, model_file)
def get_lookup_tagger_accuracy(test_set, lookup_tagger_basis, corpus):
    """Score a lookup tagger built from the 200 most common basis words.

    Args:
        test_set: tagged sentences to evaluate on.
        lookup_tagger_basis: sentences supplying the word-frequency counts.
        corpus: corpus whose tagged words supply each word's most likely tag.

    Returns:
        Accuracy of the lookup tagger on *test_set*.
    """
    words = [word for sent in lookup_tagger_basis for word in sent]
    fd = FreqDist(words)
    cfd = ConditionalFreqDist(corpus.tagged_words())
    most_freq_words = fd.most_common(200)
    # Bug fix: most_common() yields (word, count) pairs; the original
    # unpacked the pair and then indexed word[0], so the model was keyed
    # on each word's FIRST CHARACTER instead of the word itself.
    likely_tags = dict(
        (word, cfd[word].max()) for (word, _) in most_freq_words)
    baseline_tagger = UnigramTagger(model=likely_tags)
    result = baseline_tagger.evaluate(test_set)
    return result
def __init__(self, train_sents):
    """Build a trigram backoff tagger.

    train_sents: tagged training sentences (e.g. Brown, CoNLL-2000,
    Treebank). Backoff chain: trigram -> bigram -> unigram -> 'NN' default.
    """
    chain = DefaultTagger('NN')
    chain = UnigramTagger(train_sents, backoff=chain)
    chain = BigramTagger(train_sents, backoff=chain)
    self.tagger = TrigramTagger(train_sents, backoff=chain)
def pos_tag(self):
    """Tokenize the configured input and POS-tag it.

    The tagger is chosen by self.options['tagger']: 'unigram', 'bigram' or
    'regex' use Brown-trained taggers (unigram/bigram models are cached as
    pickles under ./trained/); 'pos' uses nltk's default pos_tag.

    Returns:
        The result of self._dump(tags) — tags is a list of (token, tag)
        pairs, or [] when no tagger option matches.
    """
    tokenize_obj = NLTKTokenize(self.options)
    res = tokenize_obj.tokenize()
    tokens = res['result']
    tags = []
    # Performs Bigram / Unigram / Regex Tagging
    if self.options.get('tagger') in ['unigram', 'bigram', 'regex']:
        # Fall back to DEFAULT_TRAIN when the requested trainer is unknown.
        trainer = self.options['train'] if self.options.get(
            'train') in TRAINERS else DEFAULT_TRAIN
        train = brown.tagged_sents(categories=trainer)
        # Create your custom regex tagging pattern here
        # NOTE(review): the '.' in the CD pattern is unescaped, so it
        # matches any character, not just a decimal point — confirm intent.
        regex_tag = RegexpTagger([(r'^[-\:]?[0-9]+(.[0-9]+)?$', 'CD'),
                                  (r'.*able$', 'JJ'),
                                  (r'^[A-Z].*$', 'NNP'),
                                  (r'.*ly$', 'RB'),
                                  (r'.*s$', 'NNS'),
                                  (r'.*ing$', 'VBG'),
                                  (r'.*ed$', 'VBD'),
                                  (r'.*', 'NN')])
        current = os.path.dirname(os.path.abspath(__file__))
        # Unigram tag training data load / dump pickle
        pkl_name = current + '/trained/unigram_' + trainer + '.pkl'
        if os.path.isfile(pkl_name):
            # Reuse the cached unigram model instead of retraining.
            with open(pkl_name, 'rb') as pkl:
                unigram_tag = load(pkl)
        else:
            unigram_tag = UnigramTagger(train, backoff=regex_tag)
            with open(pkl_name, 'wb') as pkl:
                dump(unigram_tag, pkl, -1)
        # Bigram tag training data load / dump pickle
        if self.options['tagger'] == 'bigram':
            pkl_name = current + '/trained/bigram_' + trainer + '.pkl'
            if os.path.isfile(pkl_name):
                with open(pkl_name, 'rb') as pkl:
                    bigram_tag = load(pkl)
            else:
                # Bigram model backs off to the unigram model built above.
                bigram_tag = BigramTagger(train, backoff=unigram_tag)
                with open(pkl_name, 'wb') as pkl:
                    dump(bigram_tag, pkl, -1)
            tags = bigram_tag.tag(tokens)  # Bigram tagging performed here
        elif self.options['tagger'] == 'unigram':
            tags = unigram_tag.tag(
                tokens)  # Unigram tagging performed here
        else:
            tags = regex_tag.tag(tokens)  # Regex tagging performed here
    # Performs default pos_tag
    elif self.options.get('tagger', DEFAULT_TAGGER) == 'pos':
        tags = pos_tag(tokens)
    return self._dump(tags)
def train_tagger(corpus_name, corpus):
    """Train and persist unigram and bigram taggers for *corpus*."""
    # The bigram tagger backs off to the unigram tagger for unseen contexts.
    unigram = UnigramTagger(corpus)
    save_tagger('{}_unigram.tagger'.format(corpus_name), unigram)
    bigram = BigramTagger(corpus, backoff=unigram)
    save_tagger('{}_bigram.tagger'.format(corpus_name), bigram)
    # Progress note goes to stderr so stdout stays clean for callers.
    print("Tagger trained with {} using "
          "UnigramTagger and BigramTagger.".format(corpus_name),
          file=sys.stderr)
def create_tagger(sents, patterns=PATTERNS, maxngram=4):
    """Train a backoff tagger chain on a corpus of sentences.

    Chain (most to least specific): maxngram-gram -> trigram -> bigram ->
    unigram -> regexp *patterns* -> 'NN' default.
    """
    tagger = DefaultTagger('NN')
    tagger = RegexpTagger(patterns, backoff=tagger)
    tagger = UnigramTagger(sents, backoff=tagger)
    tagger = BigramTagger(sents, backoff=tagger)
    tagger = TrigramTagger(sents, backoff=tagger)
    return NgramTagger(maxngram, sents, backoff=tagger)
def backoff_taggers(test, train, save, corpus='floresta'):
    """Train backoff-chained taggers, report accuracy, optionally save the best.

    Args:
        test: tagged sentences for evaluation.
        train: tagged sentences for training.
        save: when truthy, pickle the highest-scoring tagger.
        corpus: corpus name used for messages and pickle file names.
    """
    default_tagger = default_tagger_corpus(corpus)
    info('training {} taggers with backoff'.format(corpus))
    info('this may take a while...\n')
    info(default_tagger)
    default_score = default_tagger.evaluate(test)
    print('accuracy score: {}\n'.format(default_score))
    # Backoff chain: trigram -> bigram -> unigram -> default.
    uni_tagger_backoff = UnigramTagger(train, backoff=default_tagger)
    bi_tagger_backoff = BigramTagger(train, backoff=uni_tagger_backoff)
    tri_tagger_backoff = TrigramTagger(train, backoff=bi_tagger_backoff)
    info(uni_tagger_backoff)
    uni_backoff_score = uni_tagger_backoff.evaluate(test)
    print('accuracy score: {}\n'.format(uni_backoff_score))
    info(bi_tagger_backoff)
    bi_backoff_score = bi_tagger_backoff.evaluate(test)
    print('accuracy score: {}\n'.format(bi_backoff_score))
    info(tri_tagger_backoff)
    tri_backoff_score = tri_tagger_backoff.evaluate(test)
    print('accuracy score: {}\n'.format(tri_backoff_score))
    if not save:
        return
    best_score = max(uni_backoff_score, bi_backoff_score, tri_backoff_score)
    if uni_backoff_score == best_score:
        tagger_file = '{}_unigram_tagger_backoff.pkl'.format(corpus)
        best_tagger = uni_tagger_backoff
    elif bi_backoff_score == best_score:
        tagger_file = '{}_bigram_tagger_backoff.pkl'.format(corpus)
        best_tagger = bi_tagger_backoff
    else:
        tagger_file = '{}_trigram_tagger_backoff.pkl'.format(corpus)
        best_tagger = tri_tagger_backoff
    # Bug fix: the trigram branch previously dumped to an 'output' handle
    # that was never opened for it (NameError when the trigram tagger won).
    # The context manager also guarantees the file is closed.
    with open(tagger_file, 'wb') as output:
        dump(best_tagger, output, -1)
    info('saving %s...\n', tagger_file)
def get_tagger(type="StandfordPOSTagger"):
    """Return a POS tagger.

    'Custom' builds a Brown-trained bigram chain (bigram -> unigram ->
    'NOUN' default); any other value returns the Stanford bidirectional
    tagger loaded from the bundled model/jar paths.
    """
    if type != "Custom":
        # Guard clause: the Stanford tagger needs no training.
        return StanfordPOSTagger(
            'data/./models/wsj-0-18-bidirectional-distsim.tagger',
            '3rdparty_libs/stanford-postagger.jar')
    training = brown.tagged_sents(categories='news', tagset='universal')
    noun_default = DefaultTagger('NOUN')
    unigram = UnigramTagger(training, backoff=noun_default)
    return BigramTagger(training, backoff=unigram)
def generateTagger():
    """Build a Spanish unigram tagger (cess_esp) with regexp and 'V' fallbacks."""
    patterns = [
        (r'.*o$', 'NMS'),   # noun masculine singular
        (r'.*os$', 'NMP'),  # noun masculine plural
        (r'.*a$', 'NFS'),   # noun feminine singular
        (r'.*as$', 'NFP'),  # noun feminine plural
    ]
    # Suffix patterns back off to a default 'V' (verb) tag.
    fallback = RegexpTagger(patterns, backoff=DefaultTagger('V'))
    # Train nltk.UnigramTagger using tagged sentences from cess_esp.
    return UnigramTagger(cess_esp.tagged_sents(), backoff=fallback)
def __init__(self): if os.path.exists('tagger_spanish.pickle'): with open('tagger_spanish.pickle', 'r') as file_obj: self.tagger = pickle.load(file_obj) else: print 'tagger_spanish.pickle not found. Training tagger... may take a few minutes...' from nltk import UnigramTagger, BigramTagger, TrigramTagger from nltk.corpus import cess_esp sents = cess_esp.tagged_sents() unigram_tagger = UnigramTagger(sents) bigram_tagger = BigramTagger(sents, backoff=unigram_tagger) # uses unigram tagger in case it can't tag a word self.tagger = unigram_tagger with open('tagger_spanish.pickle', 'w') as file_obj: pickle.dump(self.tagger, file_obj) # Dump trained tagger
def task3(data, corpus):
    """Evaluate CombinedTaggers seeded with a 200-word lookup tagger."""
    fd = FreqDist(corpus.words())
    cfd = ConditionalFreqDist(corpus.tagged_words())
    # Rank by count descending (stable sort keeps the original tie order).
    ranked = sorted(fd.items(), key=lambda item: item[1], reverse=True)
    top_words = [word for word, _count in ranked[:200]]
    likely_tags = {word: cfd[word].max() for word in top_words}
    lookup_tagger = UnigramTagger(model=likely_tags)
    for split in ["brown50", "brown90", "nps50", "nps90"]:
        tagger = CombinedTagger(train=data["train_" + split],
                                default=lookup_tagger, name=split)
        test_tagger(tagger, data)
def find_combined_taggers_accuracy(train_set, test_set):
    """Print accuracies of default, regex, unigram and bigram taggers
    (with several backoff combinations) on *test_set*."""
    # Default tagger: tag everything with the most frequent training tag.
    tags_in_train = [tag for sent in train_set for (_word, tag) in sent]
    most_frequent_tag = FreqDist(tags_in_train).max()
    default_tagger = DefaultTagger(most_frequent_tag)
    print("Default Tagger accuracy: ", default_tagger.evaluate(test_set))

    # Regex tagger driven by common English suffix patterns.
    patterns = [
        (r'.*ing$', 'VBG'),                # gerunds
        (r'.*ed$', 'VBD'),                 # simple past
        (r'.*es$', 'VBZ'),                 # 3rd singular present
        (r'.*ould$', 'MD'),                # modals
        (r'.*\'s$', 'NN$'),                # possessive nouns
        (r'.*s$', 'NNS'),                  # plural nouns
        (r'^-?[0-9]+(\.[0-9]+)?$', 'CD'),  # cardinal numbers
        (r'.*', 'NN')                      # nouns (default)
    ]
    regex_tagger = RegexpTagger(patterns)
    print("Regex Tagger Accuracy: ", regex_tagger.evaluate(test_set))

    # Unigram tagger with the default tagger as backoff.
    unigram_tagger = UnigramTagger(train_set, backoff=default_tagger)
    print("Unigram Tagger accuracy (Backoff = Default Tagger): ",
          unigram_tagger.evaluate(test_set))

    # Bigram taggers with different backoffs.
    bigram_plain = BigramTagger(train_set)
    bigram_unigram_backoff = BigramTagger(train_set, backoff=unigram_tagger)
    bigram_regex_backoff = BigramTagger(train_set, backoff=regex_tagger)
    print("Bigram Tagger Accuracy: ", bigram_plain.evaluate(test_set))
    print("Bigram Tagger Accuracy (Backoff = Regex Tagger): ",
          bigram_regex_backoff.evaluate(test_set))
    print("Bigram Tagger Accuracy (Backoff = Unigram Tagger): ",
          bigram_unigram_backoff.evaluate(test_set))
def __init__(self, train=None, default=None, name=None):
    """Build a regex -> unigram -> bigram backoff chain.

    Args:
        train: tagged training sentences for the n-gram taggers.
        default: final fallback tagger.
        name: label for this tagger instance.
    """
    self.name = name
    # As found on page 199 of the nltk book
    regexps = [
        (r'.*ing$', 'VBG'),                # gerunds
        (r'.*ed$', 'VBD'),                 # simple past
        (r'.*es$', 'VBZ'),                 # 3rd singular present
        (r'.*ould$', 'MD'),                # modals
        (r'.*\'s$', 'NN$'),                # possessive nouns
        (r'.*s$', 'NNS'),                  # plural nouns
        # Bug fix: the decimal point was an unescaped '.', which also
        # matched strings like "1a23".
        (r'^-?[0-9]+(\.[0-9]+)?$', 'CD'),  # cardinal numbers
    ]
    self.default = default
    self.regex = RegexpTagger(regexps, backoff=self.default)
    self.unigram = UnigramTagger(train=train, backoff=self.regex)
    self.bigram = BigramTagger(train=train, backoff=self.unigram)
def trained_tagger():
    """Train, pickle and return a trigram backoff tagger.

    Trained on Brown + CoNLL-2000 + Treebank; backoff chain:
    trigram -> bigram -> unigram -> 'NN' default.
    """
    # Aggregate trained sentences for N-Gram Taggers.
    train_sents = nltk.corpus.brown.tagged_sents()
    train_sents += nltk.corpus.conll2000.tagged_sents()
    train_sents += nltk.corpus.treebank.tagged_sents()
    t0 = DefaultTagger('NN')
    t1 = UnigramTagger(train_sents, backoff=t0)
    t2 = BigramTagger(train_sents, backoff=t1)
    trigram_tagger = TrigramTagger(train_sents, backoff=t2)
    # Resource fix: the original passed a bare open() to pickle.dump and
    # never closed the file handle.
    with open(r'DataBase/trained_tagger.pkl', 'wb') as pkl_file:
        pickle.dump(trigram_tagger, pkl_file)
    return trigram_tagger
def train_and_save_unigram_tagger():
    """Train a unigram tagger on Brown (regexp backoff) and pickle it."""
    train_text = brown.tagged_sents()
    regexp_tagger = RegexpTagger(
        # Bug fix: decimal point escaped — the original r'(.[0-9]+)?'
        # matched any character before the digits.
        [(r'^-?[0-9]+(\.[0-9]+)?$', 'CD'),  # cardinal numbers
         (r'(The|the|A|a|An|an)$', 'AT'),   # articles
         (r'.*able$', 'JJ'),                # adjectives
         (r'.*ness$', 'NN'),                # nouns formed from adjectives
         (r'.*ly$', 'RB'),                  # adverbs
         (r'.*s$', 'NNS'),                  # plural nouns
         (r'.*ing$', 'VBG'),                # gerunds
         (r'.*ed$', 'VBD'),                 # past tense verbs
         (r'.*', 'NN')                      # nouns (default)
         ])
    unigram_tagger = UnigramTagger(train_text, backoff=regexp_tagger)
    # Context manager guarantees the pickle file is closed even on error.
    with open('../taggers/unigram_tagger.pkl', 'wb') as output:
        dump(unigram_tagger, output, -1)
def lookup_tag(num_sampling):
    """Demo: tag a sentence with a lookup tagger built from the
    *num_sampling* most common Brown 'news' words, then score it."""
    raw = 'I am applying for AIT because I can be with my parents here and I am already granted a scholarship'
    # Word frequencies and the most frequent tag per word in the news corpus.
    fd = FreqDist(brown.words(categories='news'))
    cfd = ConditionalFreqDist(brown.tagged_words(categories='news'))
    # Map each frequent word to its single most likely tag.
    likely_tags = {word: cfd[word].max()
                   for word, _count in fd.most_common(num_sampling)}
    # A model-based UnigramTagger is a pure lookup tagger (no context used).
    lookup_tagger = UnigramTagger(model=likely_tags)
    print(lookup_tagger.tag(word_tokenize(raw)))
    print(lookup_tagger.evaluate(brown_tagged_sents))
def ngram_tag_with_backoff():
    """Demo: bigram tagger over 90% of Brown sentences with a
    lookup-tagger backoff, scored on the remaining 10%."""
    fd = FreqDist(brown.words(categories='news'))
    cfd = ConditionalFreqDist(brown.tagged_words(categories='news'))
    # Effectively all words (cap far exceeds the vocabulary size).
    likely_tags = {word: cfd[word].max()
                   for word, _count in fd.most_common(1000000)}
    lookup_tagger = UnigramTagger(model=likely_tags)
    # 90/10 train/test split.
    train_len = int(len(brown_tagged_sents) * 0.9)
    print(brown_tagged_sents[train_len:])
    bigram_tagger = BigramTagger(brown_tagged_sents[:train_len],
                                 backoff=lookup_tagger)
    print(bigram_tagger.evaluate(brown_tagged_sents[train_len:]))
def trained_tagger():
    """Return a trained trigram tagger, loading the pickled one if present.

    Trains a trigram -> bigram -> unigram -> 'NN' default backoff chain on
    Brown + CoNLL-2000 + Treebank and caches it at DataBase/trained_tagger.pkl.
    """
    tagger_path = os.path.join(os.getcwd(), r"DataBase/trained_tagger.pkl")
    if os.path.exists(tagger_path):
        print("Trained Tagger File already Exists..")
        # Bug fix: the docstring promises a tagger, but the original
        # returned None here — load and return the cached tagger instead.
        with open(tagger_path, 'rb') as pkl_file:
            return pickle.load(pkl_file)
    # Aggregate trained sentences for N-Gram Taggers.
    train_sents = nltk.corpus.brown.tagged_sents()
    train_sents += nltk.corpus.conll2000.tagged_sents()
    train_sents += nltk.corpus.treebank.tagged_sents()
    t0 = DefaultTagger('NN')
    t1 = UnigramTagger(train_sents, backoff=t0)
    t2 = BigramTagger(train_sents, backoff=t1)
    trigram_tagger = TrigramTagger(train_sents, backoff=t2)
    with open(r'DataBase/trained_tagger.pkl', 'wb') as pkl_file:
        pickle.dump(trigram_tagger, pkl_file)
    # Bug fix: the original trained the tagger but never returned it.
    return trigram_tagger
def __init__(self, train_sents, load=False):
    """Build (or load) a trigram backoff tagger chain.

    train_sents: tagged sentences used for training when load is False.
    load: when True, restore a previously saved tagger via self.load().
    """
    if load:
        print 'Loading saved tagger...',
        self.load()
        print 'done.'
    else:
        time_start = time.time()
        print 'Training the tagger...'
        # The most frequent tag in the training data becomes the default tag.
        tag_counts = Counter([t for s in train_sents for w, t in s])
        default_tag = argmax(tag_counts)
        def_tgr = DefaultTagger(default_tag)
        # Backoff chain: trigram -> bigram -> unigram -> 3-char suffix -> default.
        af_tgr = AffixTagger(train_sents, affix_length=-3, backoff=def_tgr)
        uni_tgr = UnigramTagger(train_sents, backoff=af_tgr)
        bi_tgr = BigramTagger(train_sents, backoff=uni_tgr)
        tri_tgr = TrigramTagger(train_sents, backoff=bi_tgr)
        self.tgr = tri_tgr
        print 'Done.'
        time_stop = time.time()
        print 'Training time: {0:.2f}s'.format(time_stop - time_start)
def __init__(self, modelpath, candidates):
    """Load candidate/reference JSON files and a noun list, train a
    Brown-corpus unigram tagger, and initialise evaluation accumulators."""
    self.modelpath = modelpath
    self.bus_counter = 0
    # Candidate expressions: top-N probability lists keyed per item.
    candidates_file = modelpath + 'all_highest_probs_' + str(candidates) + '.json'
    with open(candidates_file, 'r') as f:
        self.candidates = json.load(f)
    refs_file = (modelpath +
                 'inject_refcoco_refrnn_compositional_3_512_1/4eval_greedy.json')
    with open(refs_file, 'r') as f:  # 'restoredmodel_refs_greedy.json') as f: restoredmodel_refs_greedy/4eval_greedy
        self.refs = json.load(f)
    # One noun per line; stripped of surrounding whitespace.
    with open("./noun_list_long.txt", 'r') as f:
        self.words_that_are_names = [row.strip() for row in f.readlines()]
    self.unigram_tagger = UnigramTagger(brown.tagged_sents())
    self.zero_shot_refs = defaultdict()
    self.non_noun_counter = 0
    self.baseline_top_1 = defaultdict()
    self.baseline_top_5 = defaultdict()
    self.baseline_top_10 = defaultdict()
def train(self, corpus: Corpus, evaluate: bool = True, config: dict = None) -> Union[None, Dict[str, Dict[str, float]]]:
    """Train the unigram tagger on *corpus*.

    Args:
        corpus: Corpus providing train.sentences.
        evaluate: Flag to return evaluation of the model.
        config: Training config dict (not used for this model).

    Returns:
        Model evaluation metrics when *evaluate* is True, else None.
    """
    # Lazily build the model scaffold before (re)training.
    if self.model is None:
        self._model_definition()
    self.model = UnigramTagger(corpus.train.sentences,
                               backoff=DefaultTagger('NOUN'))
    return self.evaluate(corpus) if evaluate else None
def prepare_toolset():
    """Assemble text-processing tools: a regexp -> trigram backoff tagger
    trained on three Brown sections, English stopwords, a WordNet
    lemmatizer, and a universal-tag -> WordNet-POS mapping."""
    patterns = [(r'^[\.1-9]+$', 'NUM'), (r'^[^a-zA-Z]+$', '.'),
                (r'^[^a-zA-Z]*[a-zA-Z]+[-\'][a-zA-Z]+[^a-zA-Z]*$', 'NOUN'),
                (r'^.*[a-zA-Z]+[^-a-zA-Z]+[a-zA-Z]+.*$', '.')]
    train_set = (brown.tagged_sents(categories='learned', tagset='universal')
                 + brown.tagged_sents(categories='news', tagset='universal')
                 + brown.tagged_sents(categories='reviews',
                                      tagset='universal'))
    # Backoff chain: regexp patterns -> trigram -> bigram -> unigram -> 'NN'.
    unigram = UnigramTagger(train=train_set, backoff=DefaultTagger('NN'))
    bigram = BigramTagger(train=train_set, backoff=unigram)
    trigram = TrigramTagger(train=train_set, backoff=bigram)
    toolset = {
        'tgr': RegexpTagger(regexps=patterns, backoff=trigram),
        'sw': stopwords.words('english'),
        'lr': WordNetLemmatizer(),
        'wntg': {'NOUN': wordnet.NOUN, 'VERB': wordnet.VERB,
                 'ADJ': wordnet.ADJ, 'ADV': wordnet.ADV,
                 'X': wordnet.NOUN},
    }
    print('Tools Ready')
    return toolset