Example #1
# Assumed imports for the spaCy 1.x-era API this example targets:
from spacy.gold import GoldParse, merge_sents
from spacy.scorer import Scorer


def evaluate(Language, gold_tuples, model_dir, gold_preproc=False, verbose=False,
             beam_width=None, cand_preproc=None):
    print("Load parser", model_dir)
    nlp = Language(path=model_dir)
    if nlp.lang == 'de':
        # Fall back to identity lemmatization for German
        nlp.vocab.morphology.lemmatizer = lambda string, pos: set([string])
    if beam_width is not None:
        nlp.parser.cfg.beam_width = beam_width
    scorer = Scorer()
    for raw_text, sents in gold_tuples:
        if gold_preproc:
            # Score against the gold tokenization rather than the raw text
            raw_text = None
        else:
            sents = merge_sents(sents)
        for annot_tuples, brackets in sents:
            if raw_text is None:
                # Build the doc from the gold tokens and run each pipe manually
                tokens = nlp.tokenizer.tokens_from_list(annot_tuples[1])
                nlp.tagger(tokens)
                nlp.parser(tokens)
                nlp.entity(tokens)
            else:
                tokens = nlp(raw_text)
            gold = GoldParse.from_annot_tuples(tokens, annot_tuples)
            scorer.score(tokens, gold, verbose=verbose)
    return scorer
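
For context, a call to this function might look like the sketch below. The `English` entry point, the model path, and the exact layout of `gold_tuples` are assumptions inferred from how the function indexes its arguments (words sit at index 1 of each annotation tuple), not something the example itself confirms.

# Minimal invocation sketch, assuming the spaCy 1.x English class and the
# (ids, words, tags, heads, dep_labels, ner_tags) annotation-tuple layout
# implied by annot_tuples[1] above.
from spacy.en import English

gold_tuples = [
    (
        "London is big.",                             # raw_text
        [(                                            # one sentence
            (
                [0, 1, 2, 3],                         # ids
                ["London", "is", "big", "."],         # words
                ["NNP", "VBZ", "JJ", "."],            # tags
                [1, 1, 1, 1],                         # heads
                ["nsubj", "ROOT", "acomp", "punct"],  # dep labels
                ["U-GPE", "O", "O", "O"],             # NER tags (BILUO)
            ),
            [],                                       # brackets
        )],
    ),
]
scorer = evaluate(English, gold_tuples, "/path/to/model")
print(scorer.uas, scorer.las, scorer.ents_f)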
Example #2
import random

import spacy
from spacy.gold import GoldParse

# TRAIN_DATA (gold-tuple training data) and get_position_label (the multitask
# objective callback) are assumed to be defined elsewhere in the original script.


def main(n_iter=10):
    nlp = spacy.blank('en')
    ner = nlp.create_pipe('ner')
    # Train an auxiliary token-position objective alongside the NER objective
    ner.add_multitask_objective(get_position_label)
    nlp.add_pipe(ner)

    print("Create data", len(TRAIN_DATA))
    optimizer = nlp.begin_training(get_gold_tuples=lambda: TRAIN_DATA)
    for itn in range(n_iter):
        random.shuffle(TRAIN_DATA)
        losses = {}
        for text, annot_brackets in TRAIN_DATA:
            annotations, _ = annot_brackets
            doc = nlp.make_doc(text)
            gold = GoldParse.from_annot_tuples(doc, annotations[0])
            nlp.update(
                [doc],  # batch of texts
                [gold],  # batch of annotations
                drop=0.2,  # dropout - make it harder to memorise data
                sgd=optimizer,  # callable to update weights
                losses=losses)
        print(losses.get('nn_labeller', 0.0), losses['ner'])

    # test the trained model
    for text, _ in TRAIN_DATA:
        doc = nlp(text)
        print('Entities', [(ent.text, ent.label_) for ent in doc.ents])
        print('Tokens', [(t.text, t.ent_type_, t.ent_iob) for t in doc])
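
`get_position_label` is not shown in the example. In spaCy's multitask-objective examples, the callback receives per-token annotation fields and returns a string label for the auxiliary task; a minimal sketch under that assumed signature:

# Sketch of a multitask objective callback, assuming the
# (i, words, tags, heads, labels, ents) signature used by spaCy's
# add_multitask_objective examples. The label scheme is illustrative.
def get_position_label(i, words, tags, heads, labels, ents):
    """Return a label describing where token i sits in the document."""
    if i == 0:
        return "first-word"
    elif i == len(words) - 1:
        return "last-word"
    else:
        return "mid-word"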
Example #3
import random

import spacy
from spacy.gold import GoldParse
from spacy.tokens import Doc

# As in Example #2, TRAIN_DATA and get_position_label are assumed to be
# defined elsewhere in the original script.


def main(n_iter=10):
    nlp = spacy.blank("en")
    ner = nlp.create_pipe("ner")
    # Train an auxiliary token-position objective alongside the NER objective
    ner.add_multitask_objective(get_position_label)
    nlp.add_pipe(ner)
    print(nlp.pipeline)

    print("Create data", len(TRAIN_DATA))
    optimizer = nlp.begin_training(get_gold_tuples=lambda: TRAIN_DATA)
    for itn in range(n_iter):
        random.shuffle(TRAIN_DATA)
        losses = {}
        for text, annot_brackets in TRAIN_DATA:
            for annotations, _ in annot_brackets:
                # Build the Doc straight from the gold tokens (index 1 of the
                # annotation tuple) instead of re-tokenizing raw text
                doc = Doc(nlp.vocab, words=annotations[1])
                gold = GoldParse.from_annot_tuples(doc, annotations)
                nlp.update(
                    [doc],  # batch of texts
                    [gold],  # batch of annotations
                    drop=0.2,  # dropout - make it harder to memorise data
                    sgd=optimizer,  # callable to update weights
                    losses=losses,
                )
        print(losses.get("nn_labeller", 0.0), losses["ner"])

    # test the trained model
    for text, _ in TRAIN_DATA:
        if text is not None:  # skip gold-tuple items that carry no raw text
            doc = nlp(text)
            print("Entities", [(ent.text, ent.label_) for ent in doc.ents])
            print("Tokens", [(t.text, t.ent_type_, t.ent_iob) for t in doc])