sys.stdout.flush() if __name__ == '__main__': opts = docopt(__doc__) # load the model filename = opts['-i'] f = open(filename, 'rb') model = pickle.load(f) f.close() # Load the data files = '3LB-CAST/.*\.tbf\.xml' PATH = "./../../ancora-3.0.1es" corpus = SimpleAncoraCorpusReader(PATH, files) sents = list(corpus.tagged_sents()) # Tag hits = 0 total = 0 # Hits Palabras conocidas hits_known_word = 0 total_known_word = 0 # Hits Palabras desconocidas hits_unknown_word = 0 total_unknown_word = 0 # Para Matriz de Confusion
print('\b' * width + msg, end='') sys.stdout.flush() if __name__ == '__main__': opts = docopt(__doc__) # load the model filename = opts['-i'] f = open(filename, 'rb') model = pickle.load(f) f.close() # load the data files = '3LB-CAST/.*\.tbf\.xml' corpus = SimpleAncoraCorpusReader('ancora/ancora-3.0.1es/', files) sents = list(corpus.tagged_sents()) # tag hits, total = 0, 0 n = len(sents) for i, sent in enumerate(sents): word_sent, gold_tag_sent = zip(*sent) model_tag_sent = model.tag(word_sent) assert len(model_tag_sent) == len(gold_tag_sent), i # global score hits_sent = [m == g for m, g in zip(model_tag_sent, gold_tag_sent)] hits += sum(hits_sent) total += len(sent)
if __name__ == '__main__': opts = docopt(__doc__) # load the model print("\nLoading the model...") filename = opts['-i'] f = open(filename, 'rb') model = pickle.load(f) f.close() print("Model type: %s" % type(model)) # load the data print("Loading corpus data...") files = '3LB-CAST/.*\.tbf\.xml' corpus = SimpleAncoraCorpusReader('corpus/ancora-2.0/', files) sents = list(corpus.tagged_sents()) # compute statistics print("Computing results...") # Compute Accuracy # Global accuracy of the model (percentage of right tagging) acc, hits, total = 0.0, 0, 0 # Accuracy over known(k) and unknowns(u) words for the model hits_k, total_k, hits_u, total_u = 0, 0, 0, 0 y_true, y_pred = [], [] # Data for Confusion Matrix tagset = set() for t_sent in sents: for _, tag in t_sent:
print('\b' * width + msg, end='') sys.stdout.flush() if __name__ == '__main__': opts = docopt(__doc__) print('Loading model...') filename = opts['-i'] f = open(filename, 'rb') model = pickle.load(f) f.close() print('Loading corpus...') files = '3LB-CAST/.*\.tbf\.xml' corpus = SimpleAncoraCorpusReader('ancora/ancora-2.0/', files) parsed_sents = list(corpus.parsed_sents()) print('Parsing...') hits, total_gold, total_model = 0, 0, 0 n = len(parsed_sents) format_str = '{:3.1f}% ({}/{}) (P={:2.2f}%, R={:2.2f}%, F1={:2.2f}%)' progress(format_str.format(0.0, 0, n, 0.0, 0.0, 0.0)) for i, gold_parsed_sent in enumerate(parsed_sents): tagged_sent = gold_parsed_sent.pos() # parse model_parsed_sent = model.parse(tagged_sent) # compute labeled scores gold_spans = spans(gold_parsed_sent, unary=False)
models = { 'flat': Flat, 'rbranch': RBranch, 'lbranch': LBranch, 'upcfg': UPCFG } if __name__ == '__main__': opts = docopt(__doc__) print('Loading corpus ...') PATH = "./../../ancora-3.0.1es" files = 'CESS-CAST-(A|AA|P)/.*\.tbf\.xml' corpus = SimpleAncoraCorpusReader(PATH, files) print('Training model ...') # x = list(corpus.parsed_sents())[:10] m = opts['-m'] # Modelo Elegido n = opts['-n'] # Orden Markovizacion Horizontal if (n is not None) and (m == "upcfg"): model = models[opts['-m']](corpus.parsed_sents(), horzMarkov=int(n)) else: model = models[opts['-m']](corpus.parsed_sents()) # model = models[opts['-m']](corpus.parsed_sents()) # x = corpus.parsed_sents() # model = models[opts['-m']](x) print('Saving ...') filename = opts['-o']
from docopt import docopt
import pickle

from corpus.ancora import SimpleAncoraCorpusReader
from parsing.baselines import Flat, RBranch, LBranch


# Maps each value accepted by the -m option to the class it selects.
models = {
    'flat': Flat,
    'rbranch': RBranch,
    'lbranch': LBranch,
}


if __name__ == '__main__':
    opts = docopt(__doc__)

    print('Loading corpus...')
    # Raw string: '\.' is a regex escape, not a Python string escape, and a
    # plain literal triggers an invalid-escape warning on recent Pythons.
    # The resulting pattern is character-for-character the same.
    files = r'CESS-CAST-(A|AA|P)/.*\.tbf\.xml'
    corpus = SimpleAncoraCorpusReader('ancora/ancora-2.0/', files)

    print('Training model...')
    # The selected class is instantiated directly on the parsed sentences.
    model = models[opts['-m']](corpus.parsed_sents())

    print('Saving...')
    filename = opts['-o']
    # The context manager closes the file even if pickling fails midway.
    with open(filename, 'wb') as f:
        pickle.dump(model, f)
models = { 'flat': Flat, 'rbranch': RBranch, 'lbranch': LBranch, 'upcfg': UPCFG } if __name__ == '__main__': opts = docopt(__doc__) print('\nLoading corpus...') files = 'CESS-CAST-(A|AA|P)/.*\.tbf\.xml' corpus = SimpleAncoraCorpusReader('corpus/ancora-2.0/', files) print('Training model...') om, on = opts['-m'], opts['-n'] if om == 'upcfg': n = None if on is None else int(on) print('UPCFG model selected n={}.'.format(n)) model = models[om](corpus.parsed_sents(), horzMarkov=n) elif om in ['flat', 'rbranch', 'lbranch']: print(om + ' model selected.') model = models[om](corpus.parsed_sents()) else: print('Bad model type.') exit() print('Saving...\n')
"""Print corpus statistics. Usage: stats.py stats.py -h | --help Options: -h --help Show this screen. """ from docopt import docopt from corpus.ancora import SimpleAncoraCorpusReader if __name__ == '__main__': opts = docopt(__doc__) # load the data corpus = SimpleAncoraCorpusReader('ancora/ancora-2.0/') sents = list(corpus.tagged_sents()) # compute the statistics print('sents: {}'.format(len(sents)))
Options: -h --help Show this screen. """ from operator import itemgetter as elem from docopt import docopt from collections import Counter, defaultdict from corpus.ancora import SimpleAncoraCorpusReader if __name__ == '__main__': opts = docopt(__doc__) # load the data path = '/home/alangb/Escritorio/ancora-3.0.1es/' corpus = SimpleAncoraCorpusReader(path) sents = list(corpus.tagged_sents()) # compute the statistics # get words and tags words_tags = [word_tag for sent in sents for word_tag in sent] words, tags = zip(*words_tags) word_types = set(words) tag_types = set(tags) # calculate 10 most common tags common_tags = Counter(tags).most_common(10) # calculate 5 most common words # for each one of the most common tags
'ct': ClassifierTagger, } clasifiers = { 'multinomial': MultinomialNB, 'linear': LinearSVC, 'LogisticRegression': LogisticRegression } if __name__ == '__main__': opts = docopt(__doc__) # load the data files = 'CESS-CAST-(A|AA|P)/.*\.tbf\.xml' actual_dir = os.path.dirname(os.path.abspath(__file__)) corpus = SimpleAncoraCorpusReader(actual_dir + '/corpus/ancora/', files) sents = list(corpus.tagged_sents()) # train the model if opts['-n'] is not None: n = int(opts['-n']) m = opts['-m'] c = opts['-c'] wv_file = opts['-i'] b = opts['-b'] == 'y' if m == 'ct': print("Model", m, "Training") model = models[m](wv_file=wv_file, is_bin=b) model.fit(sents) # save it
if __name__ == '__main__': opts = docopt(__doc__) print('Loading model...') filename = opts['-i'] f = open(filename, 'rb') model = pickle.load(f) f.close() m = opts['-m'] n = opts['-n'] print('Loading corpus...') files = '3LB-CAST/.*\.tbf\.xml' corpus = SimpleAncoraCorpusReader('ancora-3.0.1es/', files) parsed_sents = list(corpus.parsed_sents()) if n is not None: n = int(n) parsed_sents = parsed_sents[:n] if m is not None: m = int(m) parsed_sents = [tree for tree in parsed_sents if len(tree.leaves()) <= m] print('Parsing...') hits, total_gold, total_model = 0, 0, 0 un_hits, un_total_gold, un_total_model = 0, 0, 0 n = len(parsed_sents) format_str = '{:3.1f}% ({}/{}) (P={:2.2f}%, R={:2.2f}%, F1={:2.2f}%)' progress(format_str.format(0.0, 0, n, 0.0, 0.0, 0.0))
return (2 * precision * recall) / (precision + recall) if __name__ == '__main__': opts = docopt(__doc__) print('Loading model ...') filename = opts['-i'] f = open(filename, 'rb') model = pickle.load(f) f.close() print('Loading corpus ...') PATH = "./../../ancora-3.0.1es" files = '3LB-CAST/.*\.tbf\.xml' corpus = SimpleAncoraCorpusReader(PATH, files) parsed_sents = list(corpus.parsed_sents()) # Opcion para seleccionar las primeras n oraciones n = opts["-n"] if n is not None: n = int(n) parsed_sents = parsed_sents[:n] # Opcion para seleccionar las oraciones de largo <= m m = opts["-m"] if m is not None: m = int(m) new_parsed_sents = [] for parsed_sent in parsed_sents: if len(parsed_sent.leaves()) <= m:
def _accuracy(hits, total):
    '''
    Return hits / total as a float, or 0.0 when total is zero.
    '''
    if not total:
        return 0.0
    return float(hits) / total


def evaluate(model=None, matrix='n'):
    '''
    Tag the 3LB-CAST sentences with a model and print its accuracy: overall,
    over the words the model reports as known and over the ones it reports
    as unknown. Optionally plot the confusion matrix.

    model -- The trained model to evaluate. When None, the command line is
             parsed with docopt and the model is unpickled from
             'Models/' + the -i option.
    matrix -- If you want to generate the confusion matrix ('y' or True) or
              not (any other value). Overridden by the -m option when model
              is None.
    '''
    start = time()

    # Only 'y' (or a real True) asks for the plot. Testing the raw argument
    # for truthiness would treat the default 'n' as "yes".
    want_matrix = matrix is True or matrix == 'y'
    # Base name for the confusion matrix image when the model is handed in
    # directly and there is no model file to name the plot after.
    filename = 'confusion_matrix'

    if model is None:
        opts = docopt(__doc__)
        want_matrix = opts['-m'] == 'y'

        # load the model
        filename = 'Models/' + opts['-i']
        # NOTE(review): pickle.load executes arbitrary code embedded in the
        # file; only evaluate model files that come from a trusted source.
        with open(filename, 'rb') as f:
            model = pickle.load(f)

    # load the data
    files = r'3LB-CAST/.*\.tbf\.xml'
    actual_dir = os.path.dirname(os.path.abspath(__file__))
    corpus = SimpleAncoraCorpusReader(actual_dir + '/corpus/ancora/', files)
    sents = list(corpus.tagged_sents())
    n = len(sents)

    # tag
    hits, total = 0, 0
    hits_known, hits_unknown = 0, 0
    total_known, total_unknown = 0, 0

    # confusion matrix: gold tags and predicted tags, in corpus order
    test = []
    prediction = []

    for i, sent in enumerate(sents):
        word_sent, gold_tag_sent = zip(*sent)
        model_tag_sent = model.tag(word_sent).tolist()
        assert len(model_tag_sent) == len(gold_tag_sent), i

        # For confusion matrix
        test += list(gold_tag_sent)
        prediction += model_tag_sent

        # global score
        hits_sent = [m == g for m, g in zip(model_tag_sent, gold_tag_sent)]
        hits += sum(hits_sent)
        total += len(sent)
        total_acc = _accuracy(hits, total)

        # known / unknown words score; a bool hit adds 0 or 1
        for word, hit in zip(word_sent, hits_sent):
            if model.unknown(word):
                total_unknown += 1
                hits_unknown += hit
            else:
                total_known += 1
                hits_known += hit

        progress('{:3.1f}% (Total: {:2.2f}%)'.format(
            float(i) * 100 / n, total_acc * 100))

    # Compute accuracy. A category with no words (for instance a model that
    # knows every word of the corpus) reports 0 instead of dividing by zero.
    total_acc = _accuracy(hits, total)
    known_acc = _accuracy(hits_known, total_known)
    unknown_acc = _accuracy(hits_unknown, total_unknown)

    finish = time() - start

    print('')
    print('Total accuracy: {:2.2f}%'.format(total_acc * 100))
    print('Known accuracy: {:2.2f}%'.format(known_acc * 100))
    print('Unknown accuracy: {:2.2f}%'.format(unknown_acc * 100))
    print('Time running: {:2.2f}seconds'.format(finish))

    if want_matrix:
        conf_matrix = confusion_matrix(test, prediction)
        classes = sorted(set(test) | set(prediction))
        # The image is saved next to the model, with its extension replaced.
        plot_confusion_matrix(conf_matrix, classes,
                              filename.split('.')[0] + '.png')
from docopt import docopt import pickle from corpus.ancora import SimpleAncoraCorpusReader from tagging.baseline import BaselineTagger from tagging.hmm import MLHMM from tagging.memm import MEMM if __name__ == '__main__': opts = docopt(__doc__) # load the data print("Loading corpus data...") files = 'CESS-CAST-(A|AA|P)/.*\.tbf\.xml' corpus = SimpleAncoraCorpusReader('corpus/ancora-2.0/', files) sents = list(corpus.tagged_sents()) # order of the model m = str(opts['-m']) # train the model filename = opts['-o'] if m == "base": print("Baseline Model selected") model = BaselineTagger(tagged_sents=sents) elif m == "mlhmm": n = int(opts['-n']) print("Maximum Likelihood Hidden Markov Model selected, n=%d" % n) model = MLHMM(n=n, tagged_sents=sents, addone=True) elif m == 'memm':