def main(language, train_loc, dev_loc, model_dir, n_sents=0, n_iter=15,
         out_loc="", verbose=False, debug=False, corruption_level=0.0,
         gold_preproc=False, eval_only=False, pseudoprojective=False,
         L1=1e-6):
    """Optionally train a pipeline, then evaluate it on the dev set.

    Scores (TOK, POS, UAS, LAS, NER P/R/F) are printed, one per line.

    language: code handed to ``spacy.util.get_lang_class``.
    train_loc, dev_loc: JSON files read with ``read_json_file``.
    model_dir: forwarded to ``train``, ``write_parses`` and ``evaluate``.
    n_sents: when > 0, only the first ``n_sents`` training items are kept;
        the value is also forwarded to ``train``.
    n_iter, corruption_level, gold_preproc: forwarded to ``train``
        (``gold_preproc`` also to ``evaluate``).
    out_loc: when non-empty, parses of the dev set are written there.
    verbose: forwarded to ``evaluate``.
    eval_only: skip training entirely.
    debug, pseudoprojective, L1: not read directly; they only reach ``train``
        through the cfg dictionaries built from ``locals()``.
    """
    # NOTE: these snapshots are taken with locals(), so each one holds every
    # argument of this function *plus* any cfg dict created before it
    # (tagger_cfg has a 'parser_cfg' key, entity_cfg has both).  The local
    # names and the order of these three lines are therefore behaviour.
    parser_cfg = dict(locals())
    tagger_cfg = dict(locals())
    entity_cfg = dict(locals())

    lang = spacy.util.get_lang_class(language)
    parser_cfg['features'] = lang.Defaults.parser_features
    entity_cfg['features'] = lang.Defaults.entity_features

    if not eval_only:
        gold_train = list(read_json_file(train_loc))
        gold_dev = list(read_json_file(dev_loc))
        if n_sents > 0:
            gold_train = gold_train[:n_sents]
        train(
            lang, gold_train, gold_dev, model_dir,
            tagger_cfg, parser_cfg, entity_cfg,
            n_sents=n_sents,
            gold_preproc=gold_preproc,
            corruption_level=corruption_level,
            n_iter=n_iter,
        )

    if out_loc:
        write_parses(lang, dev_loc, model_dir, out_loc)

    dev_examples = list(read_json_file(dev_loc))
    scorer = evaluate(lang, dev_examples, model_dir,
                      gold_preproc=gold_preproc, verbose=verbose)

    # (label, Scorer attribute) pairs, reported in this order.
    report = (
        ('TOK', 'token_acc'),
        ('POS', 'tags_acc'),
        ('UAS', 'uas'),
        ('LAS', 'las'),
        ('NER P', 'ents_p'),
        ('NER R', 'ents_r'),
        ('NER F', 'ents_f'),
    )
    for label, attr in report:
        print(label, getattr(scorer, attr))
def main(language, train_loc, dev_loc, model_dir, n_sents=0, n_iter=15,
         out_loc="", verbose=False, debug=False, corruption_level=0.0,
         gold_preproc=False, eval_only=False, pseudoprojective=False):
    """Optionally train a pipeline, then evaluate it on the dev set.

    Scores (TOK, POS, UAS, LAS, NER P/R/F) are printed, one per line.

    language: code handed to ``spacy.util.get_lang_class``.
    train_loc, dev_loc: JSON files read with ``read_json_file``.
    model_dir: forwarded to ``train``, ``write_parses`` and ``evaluate``.
    n_sents, n_iter, corruption_level, gold_preproc: forwarded to ``train``
        (``gold_preproc`` also to ``evaluate``).
    out_loc: when non-empty, parses of the dev set are written there.
    verbose: forwarded to ``evaluate``.
    eval_only: skip training entirely.
    debug, pseudoprojective: not read directly; they only reach ``train``
        through the cfg dictionaries built from ``locals()``.
    """
    # NOTE: these snapshots are taken with locals(), so each one holds every
    # argument of this function *plus* any cfg dict created before it
    # (tagger_cfg has a 'parser_cfg' key, entity_cfg has both).  The local
    # names and the order of these three lines are therefore behaviour.
    parser_cfg = dict(locals())
    tagger_cfg = dict(locals())
    entity_cfg = dict(locals())

    lang = spacy.util.get_lang_class(language)
    parser_cfg['features'] = lang.Defaults.parser_features
    entity_cfg['features'] = lang.Defaults.entity_features

    if not eval_only:
        gold_train = list(read_json_file(train_loc))
        gold_dev = list(read_json_file(dev_loc))
        train(
            lang, gold_train, gold_dev, model_dir,
            tagger_cfg, parser_cfg, entity_cfg,
            n_sents=n_sents,
            gold_preproc=gold_preproc,
            corruption_level=corruption_level,
            n_iter=n_iter,
        )

    if out_loc:
        write_parses(lang, dev_loc, model_dir, out_loc)

    dev_examples = list(read_json_file(dev_loc))
    scorer = evaluate(lang, dev_examples, model_dir,
                      gold_preproc=gold_preproc, verbose=verbose)

    # (label, Scorer attribute) pairs, reported in this order.
    report = (
        ('TOK', 'token_acc'),
        ('POS', 'tags_acc'),
        ('UAS', 'uas'),
        ('LAS', 'las'),
        ('NER P', 'ents_p'),
        ('NER R', 'ents_r'),
        ('NER F', 'ents_f'),
    )
    for label, attr in report:
        print(label, getattr(scorer, attr))
def main(train_loc, dev_loc, model_dir, n_sents=0, n_iter=15, verbose=False,
         nv_word=10, nv_tag=10, nv_label=10, nv_hidden=10, eta=0.1, mu=0.9,
         eval_only=False):
    """Train an English model with the 'embed' feature set and print scores.

    train_loc: JSON training file; only documents whose ``id`` contains
        'wsj' are requested from ``read_json_file``.
    dev_loc: JSON file the trained model is evaluated on.
    model_dir, n_sents, n_iter, verbose, nv_word, nv_tag, nv_label,
        nv_hidden, eta, mu: forwarded unchanged to ``train``.
    eval_only: accepted but never consulted -- training always runs.

    The first line printed is labelled TOK but shows
    ``100 - scorer.token_acc``.
    """
    gold_train = list(
        read_json_file(train_loc, lambda doc: 'wsj' in doc['id'])
    )
    nlp = train(
        English, gold_train, model_dir,
        feat_set='embed',
        eta=eta, mu=mu,
        nv_word=nv_word, nv_tag=nv_tag, nv_label=nv_label,
        nv_hidden=nv_hidden,
        n_sents=n_sents, n_iter=n_iter,
        verbose=verbose,
    )
    scorer = evaluate(nlp, list(read_json_file(dev_loc)))

    # A single pre-formatted string prints the same text whether ``print`` is
    # the Python 2 statement or the Python 3 function; the former
    # ``print 'TOK', x`` statements were a SyntaxError on Python 3.
    print('%s %s' % ('TOK', 100 - scorer.token_acc))
    print('%s %s' % ('POS', scorer.tags_acc))
    print('%s %s' % ('UAS', scorer.uas))
    print('%s %s' % ('LAS', scorer.las))
    print('%s %s' % ('NER P', scorer.ents_p))
    print('%s %s' % ('NER R', scorer.ents_r))
    print('%s %s' % ('NER F', scorer.ents_f))
def main(train_loc, dev_loc, model_dir, n_sents=0, n_iter=15, out_loc="",
         verbose=False, debug=False, corruption_level=0.0,
         gold_preproc=False, beam_width=1, eval_only=False,
         use_orig_arc_eager=False):
    """Optionally train an English model, then evaluate it and print scores.

    train_loc, dev_loc: JSON files read with ``read_json_file``.
    model_dir: forwarded to ``train`` and ``evaluate``.
    n_sents, n_iter, corruption_level, gold_preproc, beam_width, verbose:
        forwarded to ``train``; ``gold_preproc``, ``verbose`` and
        ``beam_width`` are also forwarded to ``evaluate``.
    debug: selects ``feat_set='debug'`` instead of ``'basic'``.
    eval_only: skip training entirely.
    use_orig_arc_eager: when true, ``English.ParserTransitionSystem`` is
        rebound to ``TreeArcEager`` (a class-level assignment) and the flag
        is forwarded to ``train``.
    out_loc: currently unused -- the call that wrote parses to this path is
        disabled.
    """
    if use_orig_arc_eager:
        English.ParserTransitionSystem = TreeArcEager

    if not eval_only:
        gold_train = list(read_json_file(train_loc))
        train(
            English, gold_train, model_dir,
            feat_set='basic' if not debug else 'debug',
            gold_preproc=gold_preproc,
            n_sents=n_sents,
            corruption_level=corruption_level,
            n_iter=n_iter,
            beam_width=beam_width,
            verbose=verbose,
            use_orig_arc_eager=use_orig_arc_eager,
        )

    scorer = evaluate(
        English, list(read_json_file(dev_loc)), model_dir,
        gold_preproc=gold_preproc, verbose=verbose, beam_width=beam_width,
    )

    # A single pre-formatted string prints the same text whether ``print`` is
    # the Python 2 statement or the Python 3 function; the former
    # ``print 'TOK', x`` statements were a SyntaxError on Python 3.
    print('%s %s' % ('TOK', scorer.token_acc))
    print('%s %s' % ('POS', scorer.tags_acc))
    print('%s %s' % ('UAS', scorer.uas))
    print('%s %s' % ('LAS', scorer.las))
    print('%s %s' % ('NER P', scorer.ents_p))
    print('%s %s' % ('NER R', scorer.ents_r))
    print('%s %s' % ('NER F', scorer.ents_f))
def main(language, train_loc, dev_loc, model_dir, n_sents=0, n_iter=15,
         out_loc="", verbose=False, debug=False, corruption_level=0.0,
         gold_preproc=False, eval_only=False, pseudoprojective=False):
    """Optionally train an English or German model, then evaluate it.

    Scores (TOK, POS, UAS, LAS, NER P/R/F) are printed, one per line.

    language: 'en' or 'de'; selects ``English`` or ``German``.
    train_loc, dev_loc: JSON files read with ``read_json_file``.
    model_dir: forwarded to ``train``, ``write_parses`` and ``evaluate``.
    n_sents, n_iter, corruption_level, gold_preproc, verbose,
        pseudoprojective: forwarded to ``train`` (``gold_preproc`` and
        ``verbose`` also to ``evaluate``).
    debug: selects ``feat_set='debug'`` instead of ``'basic'``.
    out_loc: when non-empty, parses of the dev set are written there.
    eval_only: skip training entirely.

    Raises:
        ValueError: if ``language`` is not one of the supported codes.
    """
    languages = {'en': English, 'de': German}
    if language not in languages:
        # Previously an unknown code silently produced ``lang = None`` and
        # only failed later, far from the cause.
        raise ValueError(
            "Unsupported language %r; expected one of %s"
            % (language, sorted(languages))
        )
    lang = languages[language]

    if not eval_only:
        gold_train = list(read_json_file(train_loc))
        train(
            lang, gold_train, model_dir,
            feat_set='basic' if not debug else 'debug',
            gold_preproc=gold_preproc,
            n_sents=n_sents,
            corruption_level=corruption_level,
            n_iter=n_iter,
            verbose=verbose,
            pseudoprojective=pseudoprojective,
        )

    if out_loc:
        write_parses(lang, dev_loc, model_dir, out_loc)

    scorer = evaluate(lang, list(read_json_file(dev_loc)), model_dir,
                      gold_preproc=gold_preproc, verbose=verbose)
    print('TOK', scorer.token_acc)
    print('POS', scorer.tags_acc)
    print('UAS', scorer.uas)
    print('LAS', scorer.las)
    print('NER P', scorer.ents_p)
    print('NER R', scorer.ents_r)
    print('NER F', scorer.ents_f)
def main(train_loc, dev_loc, model_dir, n_sents=0, n_iter=15, verbose=False,
         nv_word=10, nv_tag=10, nv_label=10, nv_hidden=10, eta=0.1, mu=0.9,
         eval_only=False):
    """Train an English model with the 'embed' feature set and print scores.

    train_loc: JSON training file; only documents whose ``id`` contains
        'wsj' are requested from ``read_json_file``.
    dev_loc: JSON file the trained model is evaluated on.
    model_dir, n_sents, n_iter, verbose, nv_word, nv_tag, nv_label,
        nv_hidden, eta, mu: forwarded unchanged to ``train``.
    eval_only: accepted but never consulted -- training always runs.

    The first line printed is labelled TOK but shows
    ``100 - scorer.token_acc``.
    """
    gold_train = list(
        read_json_file(train_loc, lambda doc: 'wsj' in doc['id'])
    )
    nlp = train(
        English, gold_train, model_dir,
        feat_set='embed',
        eta=eta, mu=mu,
        nv_word=nv_word, nv_tag=nv_tag, nv_label=nv_label,
        nv_hidden=nv_hidden,
        n_sents=n_sents, n_iter=n_iter,
        verbose=verbose,
    )
    scorer = evaluate(nlp, list(read_json_file(dev_loc)))

    # A single pre-formatted string prints the same text whether ``print`` is
    # the Python 2 statement or the Python 3 function; the former
    # ``print 'TOK', x`` statements were a SyntaxError on Python 3.
    print('%s %s' % ('TOK', 100 - scorer.token_acc))
    print('%s %s' % ('POS', scorer.tags_acc))
    print('%s %s' % ('UAS', scorer.uas))
    print('%s %s' % ('LAS', scorer.las))
    print('%s %s' % ('NER P', scorer.ents_p))
    print('%s %s' % ('NER R', scorer.ents_r))
    print('%s %s' % ('NER F', scorer.ents_f))
def main(train_loc, dev_loc, model_dir, n_sents=0, n_iter=15, out_loc="",
         verbose=False, debug=False, corruption_level=0.0,
         gold_preproc=False, eval_only=False):
    """Optionally train an English model, then evaluate it and print scores.

    train_loc, dev_loc: JSON files read with ``read_json_file``.
    model_dir: forwarded to ``train`` and ``evaluate``.
    n_sents, n_iter, corruption_level, gold_preproc, verbose: forwarded to
        ``train``; ``gold_preproc`` and ``verbose`` also to ``evaluate``.
    debug: selects ``feat_set='debug'`` instead of ``'basic'``.
    eval_only: skip training entirely.
    out_loc: currently unused -- the call that wrote parses to this path is
        disabled.
    """
    if not eval_only:
        feature_set = 'debug' if debug else 'basic'
        gold_train = list(read_json_file(train_loc))
        train(
            English, gold_train, model_dir,
            feat_set=feature_set,
            gold_preproc=gold_preproc,
            n_sents=n_sents,
            corruption_level=corruption_level,
            n_iter=n_iter,
            verbose=verbose,
        )

    dev_examples = list(read_json_file(dev_loc))
    scorer = evaluate(English, dev_examples, model_dir,
                      gold_preproc=gold_preproc, verbose=verbose)

    # (label, Scorer attribute) pairs, reported in this order.
    report = (
        ('TOK', 'token_acc'),
        ('POS', 'tags_acc'),
        ('UAS', 'uas'),
        ('LAS', 'las'),
        ('NER P', 'ents_p'),
        ('NER R', 'ents_r'),
        ('NER F', 'ents_f'),
    )
    for label, attr in report:
        print(label, getattr(scorer, attr))
def main(language, train_loc, dev_loc, model_dir, n_sents=0, n_iter=15,
         out_loc="", verbose=False, debug=False, corruption_level=0.0,
         gold_preproc=False, eval_only=False, pseudoprojective=False):
    """Optionally train an English or German model, then evaluate it.

    Scores (TOK, POS, UAS, LAS, NER P/R/F) are printed, one per line.

    language: 'en' or 'de'; selects ``English`` or ``German``.
    train_loc, dev_loc: JSON files read with ``read_json_file``.
    model_dir: forwarded to ``train``, ``write_parses`` and ``evaluate``.
    n_sents, n_iter, corruption_level, gold_preproc, verbose,
        pseudoprojective: forwarded to ``train`` (``gold_preproc`` and
        ``verbose`` also to ``evaluate``).
    debug: selects ``feat_set='debug'`` instead of ``'basic'``.
    out_loc: when non-empty, parses of the dev set are written there.
    eval_only: skip training entirely.

    Raises:
        ValueError: if ``language`` is not one of the supported codes.
    """
    languages = {'en': English, 'de': German}
    if language not in languages:
        # Previously an unknown code silently produced ``lang = None`` and
        # only failed later, far from the cause.
        raise ValueError(
            "Unsupported language %r; expected one of %s"
            % (language, sorted(languages))
        )
    lang = languages[language]

    if not eval_only:
        gold_train = list(read_json_file(train_loc))
        train(
            lang, gold_train, model_dir,
            feat_set='basic' if not debug else 'debug',
            gold_preproc=gold_preproc,
            n_sents=n_sents,
            corruption_level=corruption_level,
            n_iter=n_iter,
            verbose=verbose,
            pseudoprojective=pseudoprojective,
        )

    if out_loc:
        write_parses(lang, dev_loc, model_dir, out_loc)

    scorer = evaluate(lang, list(read_json_file(dev_loc)), model_dir,
                      gold_preproc=gold_preproc, verbose=verbose)
    print('TOK', scorer.token_acc)
    print('POS', scorer.tags_acc)
    print('UAS', scorer.uas)
    print('LAS', scorer.las)
    print('NER P', scorer.ents_p)
    print('NER R', scorer.ents_r)
    print('NER F', scorer.ents_f)
def write_parses(Language, dev_loc, model_dir, out_loc, beam_width=None):
    """Parse the dev set, score it, and dump the parses to ``out_loc``.

    One tab-separated line is written per token:
    ``orth, tag, head orth, dependency label``.

    Language: callable invoked as ``Language(data_dir=model_dir)`` to load
        the pipeline.
    dev_loc: JSON file read with ``read_json_file``.
    model_dir: directory the pipeline is loaded from.
    out_loc: path of the UTF-8 output file (overwritten).
    beam_width: when not None, assigned to ``nlp.parser.cfg.beam_width``.

    Returns the ``Scorer`` that was fed every (prediction, gold) pair.
    """
    nlp = Language(data_dir=model_dir)
    if beam_width is not None:
        nlp.parser.cfg.beam_width = beam_width
    gold_tuples = read_json_file(dev_loc)
    scorer = Scorer()
    # ``with`` guarantees the output is flushed and closed, even if parsing
    # raises part-way through; the handle used to be left open.
    with codecs.open(out_loc, "w", "utf8") as out_file:
        for raw_text, sents in gold_tuples:
            sents = _merge_sents(sents)
            for annot_tuples, _brackets in sents:
                if raw_text is None:
                    # No raw text: build the doc from the gold annotation
                    # (index 1 is presumably the word list -- TODO confirm)
                    # and run each component by hand.
                    tokens = nlp.tokenizer.tokens_from_list(annot_tuples[1])
                    nlp.tagger(tokens)
                    nlp.entity(tokens)
                    nlp.parser(tokens)
                else:
                    tokens = nlp(raw_text, merge_mwes=False)
                gold = GoldParse(tokens, annot_tuples)
                scorer.score(tokens, gold, verbose=False)
                for t in tokens:
                    out_file.write(
                        "%s\t%s\t%s\t%s\n"
                        % (t.orth_, t.tag_, t.head.orth_, t.dep_)
                    )
    return scorer
def write_parses(Language, dev_loc, model_dir, out_loc, beam_width=None):
    """Parse the dev set, score it, and dump the parses to ``out_loc``.

    One tab-separated line is written per token:
    ``orth, tag, head orth, dependency label``.

    Language: callable invoked as ``Language(data_dir=model_dir)`` to load
        the pipeline.
    dev_loc: JSON file read with ``read_json_file``.
    model_dir: directory the pipeline is loaded from.
    out_loc: path of the UTF-8 output file (overwritten).
    beam_width: when not None, assigned to ``nlp.parser.cfg.beam_width``.

    Returns the ``Scorer`` that was fed every (prediction, gold) pair.
    """
    nlp = Language(data_dir=model_dir)
    if beam_width is not None:
        nlp.parser.cfg.beam_width = beam_width
    gold_tuples = read_json_file(dev_loc)
    scorer = Scorer()
    # ``with`` guarantees the output is flushed and closed, even if parsing
    # raises part-way through; the handle used to be left open.
    with codecs.open(out_loc, 'w', 'utf8') as out_file:
        for raw_text, sents in gold_tuples:
            sents = _merge_sents(sents)
            for annot_tuples, _brackets in sents:
                if raw_text is None:
                    # No raw text: build the doc from the gold annotation
                    # (index 1 is presumably the word list -- TODO confirm)
                    # and run each component by hand.
                    tokens = nlp.tokenizer.tokens_from_list(annot_tuples[1])
                    nlp.tagger(tokens)
                    nlp.entity(tokens)
                    nlp.parser(tokens)
                else:
                    tokens = nlp(raw_text, merge_mwes=False)
                gold = GoldParse(tokens, annot_tuples)
                scorer.score(tokens, gold, verbose=False)
                for t in tokens:
                    out_file.write(
                        '%s\t%s\t%s\t%s\n'
                        % (t.orth_, t.tag_, t.head.orth_, t.dep_)
                    )
    return scorer
def write_parses(Language, dev_loc, model_dir, out_loc):
    """Parse the dev set and dump the parses to ``out_loc``.

    For every non-space token one tab-separated line is written:
    ``token index, orth, tag, head orth, dependency label``.  A blank line
    follows each sentence of ``tokens.sents``.

    Language: callable invoked as ``Language(data_dir=model_dir)`` to load
        the pipeline.
    dev_loc: JSON file read with ``read_json_file``.
    model_dir: directory the pipeline is loaded from.
    out_loc: path of the UTF-8 output file (overwritten).

    Returns None; scoring against the gold parse is not performed here.
    """
    nlp = Language(data_dir=model_dir)
    gold_tuples = read_json_file(dev_loc)
    # ``encoding`` must be passed by keyword: the third positional parameter
    # of io.open is ``buffering``, so io.open(path, 'w', 'utf8') raised a
    # TypeError.  ``with`` also makes sure the file is flushed and closed.
    with io.open(out_loc, 'w', encoding='utf8') as out_file:
        for raw_text, sents in gold_tuples:
            sents = _merge_sents(sents)
            for annot_tuples, _brackets in sents:
                if raw_text is None:
                    # No raw text: build the doc from the gold annotation
                    # (index 1 is presumably the word list -- TODO confirm)
                    # and run each component by hand.
                    tokens = nlp.tokenizer.tokens_from_list(annot_tuples[1])
                    nlp.tagger(tokens)
                    nlp.entity(tokens)
                    nlp.parser(tokens)
                else:
                    tokens = nlp(raw_text)
                for sent in tokens.sents:
                    for t in sent:
                        if not t.is_space:
                            out_file.write(
                                '%d\t%s\t%s\t%s\t%s\n'
                                % (t.i, t.orth_, t.tag_, t.head.orth_, t.dep_)
                            )
                    out_file.write('\n')
def write_parses(Language, dev_loc, model_dir, out_loc):
    """Parse the dev set and dump the parses to ``out_loc``.

    For every non-space token one tab-separated line is written:
    ``token index, orth, tag, head orth, dependency label``.  A blank line
    follows each sentence of ``tokens.sents``.

    Language: callable invoked as ``Language(data_dir=model_dir)`` to load
        the pipeline.
    dev_loc: JSON file read with ``read_json_file``.
    model_dir: directory the pipeline is loaded from.
    out_loc: path of the UTF-8 output file (overwritten).

    Returns None; scoring against the gold parse is not performed here.
    """
    nlp = Language(data_dir=model_dir)
    gold_tuples = read_json_file(dev_loc)
    # ``encoding`` must be passed by keyword: the third positional parameter
    # of io.open is ``buffering``, so io.open(path, 'w', 'utf8') raised a
    # TypeError.  ``with`` also makes sure the file is flushed and closed.
    with io.open(out_loc, 'w', encoding='utf8') as out_file:
        for raw_text, sents in gold_tuples:
            sents = _merge_sents(sents)
            for annot_tuples, _brackets in sents:
                if raw_text is None:
                    # No raw text: build the doc from the gold annotation
                    # (index 1 is presumably the word list -- TODO confirm)
                    # and run each component by hand.
                    tokens = nlp.tokenizer.tokens_from_list(annot_tuples[1])
                    nlp.tagger(tokens)
                    nlp.entity(tokens)
                    nlp.parser(tokens)
                else:
                    tokens = nlp(raw_text)
                for sent in tokens.sents:
                    for t in sent:
                        if not t.is_space:
                            out_file.write(
                                '%d\t%s\t%s\t%s\t%s\n'
                                % (t.i, t.orth_, t.tag_, t.head.orth_, t.dep_)
                            )
                    out_file.write('\n')
Developed and tested for spaCy 2.0.6. Updated for v2.2.2 """ import random import plac import spacy import os.path from spacy.tokens import Doc from spacy.gold import read_json_file, GoldParse random.seed(0) PWD = os.path.dirname(__file__) TRAIN_DATA = list( read_json_file( os.path.join(PWD, "ner_example_data", "ner-sent-per-line.json"))) def get_position_label(i, words, tags, heads, labels, ents): """Return labels indicating the position of the word in the document. """ if len(words) < 20: return "short-doc" elif i == 0: return "first-word" elif i < 10: return "early-word" elif i < 20: return "mid-word" elif i == len(words) - 1: return "last-word"
The specific example here is not necessarily a good idea --- but it shows how an arbitrary objective function for some word can be used. Developed and tested for spaCy 2.0.6 ''' import random import plac import spacy import os.path from spacy.gold import read_json_file, GoldParse random.seed(0) PWD = os.path.dirname(__file__) TRAIN_DATA = list(read_json_file(os.path.join(PWD, 'training-data.json'))) def get_position_label(i, words, tags, heads, labels, ents): '''Return labels indicating the position of the word in the document. ''' if len(words) < 20: return 'short-doc' elif i == 0: return 'first-word' elif i < 10: return 'early-word' elif i < 20: return 'mid-word' elif i == len(words) - 1: return 'last-word'
The specific example here is not necessarily a good idea --- but it shows how an arbitrary objective function for some word can be used. Developed and tested for spaCy 2.0.6 ''' import random import plac import spacy import os.path from spacy.gold import read_json_file, GoldParse random.seed(0) PWD = os.path.dirname(__file__) TRAIN_DATA = list(read_json_file(os.path.join(PWD, 'training-data.json'))) def get_position_label(i, words, tags, heads, labels, ents): '''Return labels indicating the position of the word in the document. ''' if len(words) < 20: return 'short-doc' elif i == 0: return 'first-word' elif i < 10: return 'early-word' elif i < 20: return 'mid-word' elif i == len(words)-1: