""" from docopt import docopt import pickle from tagging.ancora import SimpleAncoraCorpusReader from tagging.baseline import BaselineTagger, BadBaselineTagger from tagging.hmm import MLHMM models = {'badbase': BadBaselineTagger, 'base': BaselineTagger, 'mlhmm': MLHMM} if __name__ == '__main__': opts = docopt(__doc__) # load the data files = 'CESS-CAST-(A|AA|P)/.*\.tbf\.xml' corpus = SimpleAncoraCorpusReader('ancora/ancora-3.0.1es/', files) sents = corpus.tagged_sents() # train the model model_class = models[opts['-m']] print(opts['-m']) if opts['-m'] == 'mlhmm': model = model_class(int(opts['-n']), sents) else: model = model_class(sents) # save it filename = opts['-o'] f = open(filename, 'wb') pickle.dump(model, f) f.close()
return len(self.tags()) def tag_freq(self, t): """Frequency of tag t.""" return self.tagsAppearances[t] def tag_word_dict(self, t): """Dictionary of words and their counts for tag t.""" return dict(self.tagDict[t]) if __name__ == '__main__': opts = docopt(__doc__) # load the data corpus = SimpleAncoraCorpusReader(opts['<path>']) sents = corpus.tagged_sents() # compute the statistics stats = POSStats(sents) print('Basic Statistics') print('================') print('sents: {}'.format(stats.sent_count())) token_count = stats.token_count() print('tokens: {}'.format(token_count)) word_count = stats.word_count() print('words: {}'.format(word_count)) print('tags: {}'.format(stats.tag_count())) print('')
def tag_freq(self, t): """Frequency of tag t.""" # WORK HERE!! return self.tags_count[t] def tag_word_dict(self, t): """Dictionary of words and their counts for tag t.""" return dict(self.words_per_tag[t]) if __name__ == '__main__': opts = docopt(__doc__) # load the data corpus = SimpleAncoraCorpusReader('ancora/ancora-3.0.1es/') sents = list(corpus.tagged_sents()) # compute the statistics stats = POSStats(sents) print('Basic Statistics') print('================') print('sents: {}'.format(stats.sent_count())) token_count = stats.token_count() print('tokens: {}'.format(token_count)) word_count = stats.word_count() print('words: {}'.format(word_count)) print('tags: {}'.format(stats.tag_count())) print('')
== ground_truth[unknown]).sum() / unknown.sum() * 100 print("Accuracy for unknown words: {:2.2f}%".format(unknown_acc)) if show_confusion_matrix: top = 5 top_tags = np.argsort(-counts)[:top] labels = labels[top_tags] cm = cm.astype('float') / cm.sum() cm = cm[top_tags][:, top_tags] plot_confusion_matrix(cm, labels) if __name__ == '__main__': opts = docopt(__doc__) # load the model filename = opts['-i'] f = open(filename, 'rb') model = pickle.load(f) f.close() # load the data files = '3LB-CAST/.*\.tbf\.xml' corpus = SimpleAncoraCorpusReader(ANCORA_CORPUS_PATH, files) sents = list(corpus.tagged_sents()) # tag and evaluate print_results(model, sents, opts['-c'])
def progress(msg, width):
    """Overwrite the current console line with msg.

    NOTE(review): only the final two statements were visible in the
    collapsed source; the header was reconstructed -- the original may
    default ``width``. Confirm against the original file.
    """
    print('\b' * width + msg, end='')
    sys.stdout.flush()


if __name__ == '__main__':
    opts = docopt(__doc__)

    # Load the trained model; with-statement closes the file on any path.
    filename = opts['-i']
    with open(filename, 'rb') as f:
        model = pickle.load(f)

    # Load the evaluation data (raw string: the file pattern is a regex).
    files = r'3LB-CAST/.*\.tbf\.xml'
    corpus = SimpleAncoraCorpusReader(opts['-c'], files)
    sents = list(corpus.tagged_sents())

    # Tag and evaluate.
    hits, total = 0, 0
    unk_hits, unk_total = 0, 0
    # Presumably gold tag -> model tag -> count / sentence ids; confirm.
    error_count = defaultdict(lambda: defaultdict(int))
    error_sents = defaultdict(lambda: defaultdict(set))
    n = len(sents)
    for i, sent in enumerate(sents):
        word_sent, gold_tag_sent = zip(*sent)

        model_tag_sent = model.tag(word_sent)
        assert len(model_tag_sent) == len(gold_tag_sent), i

        # Global score: per-token hit flags for this sentence.
        hits_sent = [m == g for m, g in zip(model_tag_sent, gold_tag_sent)]
        # NOTE(review): chunk ends here; the rest of the loop body follows
        # in the original file.
def progress(msg, width):
    """Overwrite the current console line with msg.

    NOTE(review): only the final two statements were visible in the
    collapsed source; the header was reconstructed -- the original may
    default ``width``. Confirm against the original file.
    """
    print('\b' * width + msg, end='')
    sys.stdout.flush()


if __name__ == '__main__':
    opts = docopt(__doc__)

    # Load the trained model; with-statement closes the file on any path.
    filename = opts['-i']
    with open(filename, 'rb') as f:
        model = pickle.load(f)

    # Load the evaluation data (raw string: the file pattern is a regex).
    files = r'3LB-CAST/.*\.tbf\.xml'
    corpus = SimpleAncoraCorpusReader('ancora-dataset/ancora-3.0.1es/', files)
    sents = list(corpus.tagged_sents())

    # tag and evaluate
    # WORK HERE!!
    hits, total = 0, 0          # global accuracy counters
    knw_hits, knw_total = 0, 0  # known-word accuracy counters
    unk_hits, unk_total = 0, 0  # unknown-word accuracy counters
    # Presumably gold tag -> model tag -> count / sentence ids; confirm.
    err_count = defaultdict(lambda: defaultdict(int))
    err_sent = defaultdict(lambda: defaultdict(set))
    n = len(sents)
    # NOTE(review): chunk ends here; the evaluation loop follows in the
    # original file.
return len(self.tags()) def tag_freq(self, t): """Frequency of tag t.""" return sum([count for tag, count in self._tag_word_dict[t].items()]) def tag_word_dict(self, t): """Dictionary of words and their counts for tag t.""" return dict(self._tag_word_dict[t]) if __name__ == '__main__': opts = docopt(__doc__) # load the data corpus = SimpleAncoraCorpusReader('./ancora-dataset/ancora-3.0.1es/') sents = corpus.tagged_sents() # compute the statistics stats = POSStats(sents) print('Basic Statistics') print('================') print('sents: {}'.format(stats.sent_count())) token_count = stats.token_count() print('tokens: {}'.format(token_count)) word_count = stats.word_count() print('words: {}'.format(word_count)) print('tags: {}'.format(stats.tag_count())) print('')
"""Frequency of tag t.""" return(self._countTag[t]) def tag_word_dict(self, t): """Dictionary of words and their counts for tag t.""" return dict(self._tcount[t]) if __name__ == '__main__': opts = docopt(__doc__) # load the data #corpus = SimpleAncoraCorpusReader(opts['-c']) #No se porque no me esta cargando asi # corpus = SimpleAncoraCorpusReader("ancora-3.0.1es") corpus = SimpleAncoraCorpusReader(opts['<path>']) #por la documentacion que encontre, lo cambie a '<path>' para no hardcodearlo sents = corpus.tagged_sents() count = defaultdict(int) # compute the statistics stats = POSStats(sents) print('Basic Statistics') print('================') print('sents: {}'.format(stats.sent_count())) token_count = stats.token_count() print('tokens: {}'.format(token_count))
def tag_freq(self, t): """Frequency of tag t.""" # return self.tag_dict[t] / self.tokenCount return self.freq_tag_dict[t] def tag_word_dict(self, t): """Dictionary of words and their counts for tag t.""" return dict(self.tag_dict[t]) #Esto corre el programa if __name__ == '__main__': opts = docopt(__doc__) # load the data corpus = SimpleAncoraCorpusReader( opts['<path>']) #Modified -c with path. Otherwise it doesn't work sents = corpus.tagged_sents() # compute the statistics stats = POSStats(sents) print('Basic Statistics') print('================') print('sents: {}'.format(stats.sent_count())) token_count = stats.token_count() print('tokens: {}'.format(token_count)) word_count = stats.word_count() print('words: {}'.format(word_count)) print('tags: {}'.format(stats.tag_count())) print('') print('Example of word frequency')
return len(self._tags) def tag_freq(self, t): """Frequency of tag t.""" return self._tag_freq[t] def tag_word_dict(self, t): """Dictionary of words and their counts for tag t.""" return dict(self._tcount[t]) if __name__ == '__main__': opts = docopt(__doc__) # load the data corpus = SimpleAncoraCorpusReader('corpus/ancora-3.0.1es/') sents = corpus.tagged_sents() # compute the statistics stats = POSStats(sents) print('Basic Statistics') print('================') print('sents: {}'.format(stats.sent_count())) token_count = stats.token_count() print('tokens: {}'.format(token_count)) word_count = stats.word_count() print('words: {}'.format(word_count)) print('tags: {}'.format(stats.tag_count())) print('')
from tagging.classifier import *
import time

# Registry of selectable tagger classes.
models = {
    'badbase': BadBaselineTagger,
    'base': BaselineTagger,
    'classifier': ClassifierTagger,
}

# Hard-coded configuration (this script does no CLI parsing).
path = 'ancora-3.0.1es'    # path to the corpus
filename = 'classifierLR'  # name of the pickle file to write
selectedModel = 'classifier'

# Load the data (raw string: the file pattern is a regex).
files = r'CESS-CAST-(A|AA|P)/.*\.tbf\.xml'
corpus = SimpleAncoraCorpusReader(path, files)
sents = list(corpus.tagged_sents())

# Train the model, timing how long it takes (seconds, then minutes).
model_class = models[selectedModel]
start = time.time()
# 'lr' presumably selects a logistic-regression classifier -- confirm
# against tagging.classifier.
model = model_class(sents, 'lr')
end = time.time()
print(end - start)
print((end - start) / 60)

# save it
# NOTE(review): chunk ends here; the pickling code follows in the
# original file.