def evaluate(Language, gold_tuples, model_dir, gold_preproc=False, verbose=False,
             beam_width=None, cand_preproc=None):
    """Load a trained pipeline from *model_dir* and score it on *gold_tuples*.

    Parameters:
        Language: pipeline class used to load the model from *model_dir*.
        gold_tuples: iterable of ``(raw_text, sents)`` gold annotations, where
            each sentence is an ``(annot_tuples, brackets)`` pair.
        model_dir: path passed to ``Language(path=...)``.
        gold_preproc: if True, ignore raw text and run each pipe on the gold
            tokenization instead of the full pipeline.
        verbose: forwarded to ``Scorer.score``.
        beam_width: if given, overrides the parser's configured beam width.
        cand_preproc: unused; kept for interface compatibility with callers.

    Returns:
        The populated ``Scorer`` instance.
    """
    print("Load parser", model_dir)
    nlp = Language(path=model_dir)
    if nlp.lang == 'de':
        # German lemmatizer stub: lemma of a string is just the string itself.
        # (Idiom fix: set literal instead of set([...]).)
        nlp.vocab.morphology.lemmatizer = lambda string, pos: {string}
    if beam_width is not None:
        nlp.parser.cfg.beam_width = beam_width
    scorer = Scorer()
    for raw_text, sents in gold_tuples:
        if gold_preproc:
            # Force the gold-tokenization branch below.
            raw_text = None
        else:
            sents = merge_sents(sents)
        for annot_tuples, brackets in sents:
            if raw_text is None:
                # Use the gold tokens (annot_tuples[1] holds the word list)
                # and run each pipeline component explicitly.
                tokens = nlp.tokenizer.tokens_from_list(annot_tuples[1])
                nlp.tagger(tokens)
                nlp.parser(tokens)
                nlp.entity(tokens)
            else:
                tokens = nlp(raw_text)
            gold = GoldParse.from_annot_tuples(tokens, annot_tuples)
            scorer.score(tokens, gold, verbose=verbose)
    return scorer
def main(n_iter=10):
    """Train an NER model with an extra position-prediction objective.

    Builds a blank English pipeline, attaches a multitask objective to the
    NER component, trains for *n_iter* epochs over TRAIN_DATA, then prints
    the entities predicted for each training text.
    """
    nlp = spacy.blank('en')
    ner = nlp.create_pipe('ner')
    ner.add_multitask_objective(get_position_label)
    nlp.add_pipe(ner)
    print("Create data", len(TRAIN_DATA))
    optimizer = nlp.begin_training(get_gold_tuples=lambda: TRAIN_DATA)
    for epoch in range(n_iter):
        random.shuffle(TRAIN_DATA)
        losses = {}
        for text, annot_brackets in TRAIN_DATA:
            annotations, _ = annot_brackets
            doc = nlp.make_doc(text)
            # First annotation tuple set for this example.
            gold = GoldParse.from_annot_tuples(doc, annotations[0])
            nlp.update(
                [doc],            # batch of texts
                [gold],           # batch of annotations
                drop=0.2,         # dropout - make it harder to memorise data
                sgd=optimizer,    # callable to update weights
                losses=losses,
            )
        print(losses.get('nn_labeller', 0.0), losses['ner'])
    # test the trained model
    for text, _ in TRAIN_DATA:
        doc = nlp(text)
        print('Entities', [(ent.text, ent.label_) for ent in doc.ents])
        print('Tokens', [(t.text, t.ent_type_, t.ent_iob) for t in doc])
def main(n_iter=10):
    """Train an NER model with a position-prediction multitask objective.

    Creates a blank English pipeline, adds NER plus the auxiliary objective,
    trains for *n_iter* epochs over every annotation set in TRAIN_DATA using
    gold tokenization, then prints predictions for each non-None text.
    """
    nlp = spacy.blank("en")
    ner = nlp.create_pipe("ner")
    ner.add_multitask_objective(get_position_label)
    nlp.add_pipe(ner)
    print(nlp.pipeline)
    print("Create data", len(TRAIN_DATA))
    optimizer = nlp.begin_training(get_gold_tuples=lambda: TRAIN_DATA)
    for epoch in range(n_iter):
        random.shuffle(TRAIN_DATA)
        losses = {}
        for text, annot_brackets in TRAIN_DATA:
            # Train on every annotation set attached to this example.
            for annotations, _ in annot_brackets:
                # Build the doc from the gold word list (annotations[1]).
                doc = Doc(nlp.vocab, words=annotations[1])
                gold = GoldParse.from_annot_tuples(doc, annotations)
                nlp.update(
                    [doc],            # batch of texts
                    [gold],           # batch of annotations
                    drop=0.2,         # dropout - make it harder to memorise data
                    sgd=optimizer,    # callable to update weights
                    losses=losses,
                )
        print(losses.get("nn_labeller", 0.0), losses["ner"])
    # test the trained model
    for text, _ in TRAIN_DATA:
        if text is None:
            continue
        doc = nlp(text)
        print("Entities", [(ent.text, ent.label_) for ent in doc.ents])
        print("Tokens", [(t.text, t.ent_type_, t.ent_iob) for t in doc])