예제 #1
0
def main(language, train_loc, dev_loc, model_dir, n_sents=0, n_iter=15, out_loc="", verbose=False,
         debug=False, corruption_level=0.0, gold_preproc=False, eval_only=False, pseudoprojective=False,
         L1=1e-6):
    """Train a pipeline (unless ``eval_only``), evaluate it on the dev data and print scores.

    Args:
        language: Passed to ``spacy.util.get_lang_class`` to obtain the language class.
        train_loc: Training corpus location, read with ``read_json_file``.
        dev_loc: Development corpus location, read with ``read_json_file``.
        model_dir: Model directory handed to ``train``, ``write_parses`` and ``evaluate``.
        n_sents: When positive, only the first ``n_sents`` training items are used;
            the value is also forwarded to ``train``.
        n_iter: Forwarded to ``train``.
        out_loc: When non-empty, ``write_parses`` is called with this location.
        verbose: Forwarded to ``evaluate``.
        corruption_level: Forwarded to ``train``.
        gold_preproc: Forwarded to both ``train`` and ``evaluate``.
        eval_only: When true, training is skipped entirely.
        debug, pseudoprojective, L1: Not read directly here; they only reach
            ``train`` as entries of the config dicts built from ``locals()``.
    """
    # Keep these three snapshots first and in this order: each call captures
    # every local bound so far, so ``tagger_cfg`` also contains ``parser_cfg``
    # and ``entity_cfg`` contains both earlier dicts.
    parser_cfg = dict(locals())
    tagger_cfg = dict(locals())
    entity_cfg = dict(locals())

    lang = spacy.util.get_lang_class(language)
    parser_cfg['features'] = lang.Defaults.parser_features
    entity_cfg['features'] = lang.Defaults.entity_features

    if not eval_only:
        train_docs = list(read_json_file(train_loc))
        dev_docs = list(read_json_file(dev_loc))
        # A slice bound of None keeps every item, i.e. no truncation.
        limit = n_sents if n_sents > 0 else None
        training_options = dict(n_sents=n_sents, gold_preproc=gold_preproc,
                                corruption_level=corruption_level, n_iter=n_iter)
        train(lang, train_docs[:limit], dev_docs, model_dir,
              tagger_cfg, parser_cfg, entity_cfg, **training_options)

    if out_loc:
        write_parses(lang, dev_loc, model_dir, out_loc)

    eval_docs = list(read_json_file(dev_loc))
    scorer = evaluate(lang, eval_docs, model_dir,
                      gold_preproc=gold_preproc, verbose=verbose)

    # Attributes are looked up one at a time, immediately before printing.
    score_fields = (
        ('TOK', 'token_acc'),
        ('POS', 'tags_acc'),
        ('UAS', 'uas'),
        ('LAS', 'las'),
        ('NER P', 'ents_p'),
        ('NER R', 'ents_r'),
        ('NER F', 'ents_f'),
    )
    for label, attr in score_fields:
        print(label, getattr(scorer, attr))
예제 #2
0
파일: train.py 프로젝트: paolodedios/spaCy
def main(language, train_loc, dev_loc, model_dir, n_sents=0, n_iter=15, out_loc="", verbose=False,
         debug=False, corruption_level=0.0, gold_preproc=False, eval_only=False, pseudoprojective=False):
    """Train a pipeline (unless ``eval_only``), evaluate it on the dev data and print scores.

    Args:
        language: Passed to ``spacy.util.get_lang_class`` to obtain the language class.
        train_loc: Training corpus location, read with ``read_json_file``.
        dev_loc: Development corpus location, read with ``read_json_file``.
        model_dir: Model directory handed to ``train``, ``write_parses`` and ``evaluate``.
        n_sents, n_iter, corruption_level: Forwarded to ``train``.
        out_loc: When non-empty, ``write_parses`` is called with this location.
        verbose: Forwarded to ``evaluate``.
        gold_preproc: Forwarded to both ``train`` and ``evaluate``.
        eval_only: When true, training is skipped entirely.
        debug, pseudoprojective: Not read directly here; they only reach
            ``train`` as entries of the config dicts built from ``locals()``.
    """
    # Keep these three snapshots first and in this order: each call captures
    # every local bound so far, so the later dicts also contain the earlier ones.
    parser_cfg = dict(locals())
    tagger_cfg = dict(locals())
    entity_cfg = dict(locals())

    lang = spacy.util.get_lang_class(language)
    parser_cfg['features'] = lang.Defaults.parser_features
    entity_cfg['features'] = lang.Defaults.entity_features

    if not eval_only:
        train_docs = list(read_json_file(train_loc))
        dev_docs = list(read_json_file(dev_loc))
        training_options = dict(n_sents=n_sents, gold_preproc=gold_preproc,
                                corruption_level=corruption_level, n_iter=n_iter)
        train(lang, train_docs, dev_docs, model_dir,
              tagger_cfg, parser_cfg, entity_cfg, **training_options)

    if out_loc:
        write_parses(lang, dev_loc, model_dir, out_loc)

    eval_docs = list(read_json_file(dev_loc))
    scorer = evaluate(lang, eval_docs, model_dir,
                      gold_preproc=gold_preproc, verbose=verbose)

    # Attributes are looked up one at a time, immediately before printing.
    score_fields = (
        ('TOK', 'token_acc'),
        ('POS', 'tags_acc'),
        ('UAS', 'uas'),
        ('LAS', 'las'),
        ('NER P', 'ents_p'),
        ('NER R', 'ents_r'),
        ('NER F', 'ents_f'),
    )
    for label, attr in score_fields:
        print(label, getattr(scorer, attr))
예제 #3
0
파일: nn_train.py 프로젝트: Arttii/spaCy
def main(train_loc, dev_loc, model_dir, n_sents=0, n_iter=15, verbose=False,
         nv_word=10, nv_tag=10, nv_label=10, nv_hidden=10,
         eta=0.1, mu=0.9, eval_only=False):
    """Train an English model with the 'embed' feature set and print dev scores.

    Args:
        train_loc: Training corpus location; only documents whose ``id``
            contains ``'wsj'`` are read.
        dev_loc: Development corpus location, read with ``read_json_file``.
        model_dir: Model directory handed to ``train``.
        n_sents, n_iter, verbose: Forwarded to ``train``.
        nv_word, nv_tag, nv_label, nv_hidden: Forwarded to ``train``.
        eta, mu: Forwarded to ``train``.
        eval_only: NOTE(review): accepted but never read in this function,
            so training always runs.

    The TOK line reports ``100 - scorer.token_acc``; the other lines report
    the scorer attributes unchanged.
    """
    gold_train = list(read_json_file(train_loc, lambda doc: 'wsj' in doc['id']))

    nlp = train(English, gold_train, model_dir,
                feat_set='embed',
                eta=eta, mu=mu,
                nv_word=nv_word, nv_tag=nv_tag, nv_label=nv_label, nv_hidden=nv_hidden,
                n_sents=n_sents, n_iter=n_iter,
                verbose=verbose)

    scorer = evaluate(nlp, list(read_json_file(dev_loc)))

    # Single-argument print calls with %-formatting emit the same
    # "LABEL value" text under both Python 2 and Python 3, whereas the
    # print statement is a syntax error on Python 3.
    print('TOK %s' % (100 - scorer.token_acc,))
    print('POS %s' % (scorer.tags_acc,))
    print('UAS %s' % (scorer.uas,))
    print('LAS %s' % (scorer.las,))

    print('NER P %s' % (scorer.ents_p,))
    print('NER R %s' % (scorer.ents_r,))
    print('NER F %s' % (scorer.ents_f,))
예제 #4
0
파일: train.py 프로젝트: domsooch/spaCy
def main(train_loc,
         dev_loc,
         model_dir,
         n_sents=0,
         n_iter=15,
         out_loc="",
         verbose=False,
         debug=False,
         corruption_level=0.0,
         gold_preproc=False,
         beam_width=1,
         eval_only=False,
         use_orig_arc_eager=False):
    """Train an English model (unless ``eval_only``), evaluate it and print scores.

    Args:
        train_loc: Training corpus location, read with ``read_json_file``.
        dev_loc: Development corpus location, read with ``read_json_file``.
        model_dir: Model directory handed to ``train`` and ``evaluate``.
        n_sents, n_iter, corruption_level: Forwarded to ``train``.
        out_loc: Currently unused; the ``write_parses`` call is commented out.
        verbose, gold_preproc, beam_width: Forwarded to ``train`` and ``evaluate``.
        debug: Selects the ``'debug'`` feature set instead of ``'basic'``.
        eval_only: When true, training is skipped entirely.
        use_orig_arc_eager: When true, ``English.ParserTransitionSystem`` is
            reassigned to ``TreeArcEager`` (a class-level change) and the flag
            is forwarded to ``train``.
    """
    if use_orig_arc_eager:
        English.ParserTransitionSystem = TreeArcEager
    if not eval_only:
        gold_train = list(read_json_file(train_loc))
        train(English,
              gold_train,
              model_dir,
              feat_set='basic' if not debug else 'debug',
              gold_preproc=gold_preproc,
              n_sents=n_sents,
              corruption_level=corruption_level,
              n_iter=n_iter,
              beam_width=beam_width,
              verbose=verbose,
              use_orig_arc_eager=use_orig_arc_eager)
    #if out_loc:
    #    write_parses(English, dev_loc, model_dir, out_loc, beam_width=beam_width)
    scorer = evaluate(English,
                      list(read_json_file(dev_loc)),
                      model_dir,
                      gold_preproc=gold_preproc,
                      verbose=verbose,
                      beam_width=beam_width)
    # Single-argument print calls with %-formatting emit the same
    # "LABEL value" text under both Python 2 and Python 3, whereas the
    # print statement is a syntax error on Python 3.
    print('TOK %s' % (scorer.token_acc,))
    print('POS %s' % (scorer.tags_acc,))
    print('UAS %s' % (scorer.uas,))
    print('LAS %s' % (scorer.las,))

    print('NER P %s' % (scorer.ents_p,))
    print('NER R %s' % (scorer.ents_r,))
    print('NER F %s' % (scorer.ents_f,))
예제 #5
0
파일: train.py 프로젝트: anukat2015/spaCy
def main(language,
         train_loc,
         dev_loc,
         model_dir,
         n_sents=0,
         n_iter=15,
         out_loc="",
         verbose=False,
         debug=False,
         corruption_level=0.0,
         gold_preproc=False,
         eval_only=False,
         pseudoprojective=False):
    """Train a model for ``language`` (unless ``eval_only``), evaluate it and print scores.

    Args:
        language: Language code; ``'en'`` selects ``English`` and ``'de'``
            selects ``German``.
        train_loc: Training corpus location, read with ``read_json_file``.
        dev_loc: Development corpus location, read with ``read_json_file``.
        model_dir: Model directory handed to ``train``, ``write_parses`` and ``evaluate``.
        n_sents, n_iter, corruption_level, pseudoprojective: Forwarded to ``train``.
        out_loc: When non-empty, ``write_parses`` is called with this location.
        verbose, gold_preproc: Forwarded to ``train`` and ``evaluate``.
        debug: Selects the ``'debug'`` feature set instead of ``'basic'``.
        eval_only: When true, training is skipped entirely.

    Raises:
        ValueError: If ``language`` is not one of the supported codes.
    """
    languages = {'en': English, 'de': German}
    lang = languages.get(language)
    if lang is None:
        # Fail here with a clear message instead of passing None on to
        # train/evaluate, where it would surface as an unrelated error.
        raise ValueError('Unsupported language %r; expected one of: %s'
                         % (language, ', '.join(sorted(languages))))

    if not eval_only:
        gold_train = list(read_json_file(train_loc))
        train(lang,
              gold_train,
              model_dir,
              feat_set='basic' if not debug else 'debug',
              gold_preproc=gold_preproc,
              n_sents=n_sents,
              corruption_level=corruption_level,
              n_iter=n_iter,
              verbose=verbose,
              pseudoprojective=pseudoprojective)
    if out_loc:
        write_parses(lang, dev_loc, model_dir, out_loc)
    scorer = evaluate(lang,
                      list(read_json_file(dev_loc)),
                      model_dir,
                      gold_preproc=gold_preproc,
                      verbose=verbose)
    print('TOK', scorer.token_acc)
    print('POS', scorer.tags_acc)
    print('UAS', scorer.uas)
    print('LAS', scorer.las)

    print('NER P', scorer.ents_p)
    print('NER R', scorer.ents_r)
    print('NER F', scorer.ents_f)
예제 #6
0
def main(train_loc,
         dev_loc,
         model_dir,
         n_sents=0,
         n_iter=15,
         verbose=False,
         nv_word=10,
         nv_tag=10,
         nv_label=10,
         nv_hidden=10,
         eta=0.1,
         mu=0.9,
         eval_only=False):
    """Train an English model with the 'embed' feature set and print dev scores.

    Args:
        train_loc: Training corpus location; only documents whose ``id``
            contains ``'wsj'`` are read.
        dev_loc: Development corpus location, read with ``read_json_file``.
        model_dir: Model directory handed to ``train``.
        n_sents, n_iter, verbose: Forwarded to ``train``.
        nv_word, nv_tag, nv_label, nv_hidden: Forwarded to ``train``.
        eta, mu: Forwarded to ``train``.
        eval_only: NOTE(review): accepted but never read in this function,
            so training always runs.

    The TOK line reports ``100 - scorer.token_acc``; the other lines report
    the scorer attributes unchanged.
    """
    gold_train = list(read_json_file(train_loc,
                                     lambda doc: 'wsj' in doc['id']))

    nlp = train(English,
                gold_train,
                model_dir,
                feat_set='embed',
                eta=eta,
                mu=mu,
                nv_word=nv_word,
                nv_tag=nv_tag,
                nv_label=nv_label,
                nv_hidden=nv_hidden,
                n_sents=n_sents,
                n_iter=n_iter,
                verbose=verbose)

    scorer = evaluate(nlp, list(read_json_file(dev_loc)))

    # Single-argument print calls with %-formatting emit the same
    # "LABEL value" text under both Python 2 and Python 3, whereas the
    # print statement is a syntax error on Python 3.
    print('TOK %s' % (100 - scorer.token_acc,))
    print('POS %s' % (scorer.tags_acc,))
    print('UAS %s' % (scorer.uas,))
    print('LAS %s' % (scorer.las,))

    print('NER P %s' % (scorer.ents_p,))
    print('NER R %s' % (scorer.ents_r,))
    print('NER F %s' % (scorer.ents_f,))
예제 #7
0
파일: train.py 프로젝트: michigan-com/spaCy
def main(train_loc, dev_loc, model_dir, n_sents=0, n_iter=15, out_loc="", verbose=False,
         debug=False, corruption_level=0.0, gold_preproc=False, eval_only=False):
    """Train an English model (unless ``eval_only``), evaluate it and print scores.

    Args:
        train_loc: Training corpus location, read with ``read_json_file``.
        dev_loc: Development corpus location, read with ``read_json_file``.
        model_dir: Model directory handed to ``train`` and ``evaluate``.
        n_sents, n_iter, corruption_level: Forwarded to ``train``.
        out_loc: Currently unused; writing parses is disabled in this function.
        verbose, gold_preproc: Forwarded to ``train`` and ``evaluate``.
        debug: Selects the ``'debug'`` feature set instead of ``'basic'``.
        eval_only: When true, training is skipped entirely.
    """
    if not eval_only:
        feature_set = 'debug' if debug else 'basic'
        train_docs = list(read_json_file(train_loc))
        train(English, train_docs, model_dir,
              feat_set=feature_set,
              gold_preproc=gold_preproc,
              n_sents=n_sents,
              corruption_level=corruption_level,
              n_iter=n_iter,
              verbose=verbose)

    # Writing parses to ``out_loc`` is intentionally not performed here.
    eval_docs = list(read_json_file(dev_loc))
    scorer = evaluate(English, eval_docs, model_dir,
                      gold_preproc=gold_preproc, verbose=verbose)

    # Attributes are looked up one at a time, immediately before printing.
    score_fields = (
        ('TOK', 'token_acc'),
        ('POS', 'tags_acc'),
        ('UAS', 'uas'),
        ('LAS', 'las'),
        ('NER P', 'ents_p'),
        ('NER R', 'ents_r'),
        ('NER F', 'ents_f'),
    )
    for label, attr in score_fields:
        print(label, getattr(scorer, attr))
예제 #8
0
def main(train_loc, dev_loc, model_dir, n_sents=0, n_iter=15, out_loc="", verbose=False,
         debug=False, corruption_level=0.0, gold_preproc=False, eval_only=False):
    """Optionally train an English model, then score it on the dev corpus.

    Args:
        train_loc: Training corpus location, read with ``read_json_file``.
        dev_loc: Development corpus location, read with ``read_json_file``.
        model_dir: Model directory handed to ``train`` and ``evaluate``.
        n_sents, n_iter, corruption_level: Forwarded to ``train``.
        out_loc: Currently unused; writing parses is disabled in this function.
        verbose, gold_preproc: Forwarded to ``train`` and ``evaluate``.
        debug: Selects the ``'debug'`` feature set instead of ``'basic'``.
        eval_only: When true, training is skipped entirely.
    """
    if not eval_only:
        train_kwargs = dict(
            feat_set='debug' if debug else 'basic',
            gold_preproc=gold_preproc,
            n_sents=n_sents,
            corruption_level=corruption_level,
            n_iter=n_iter,
            verbose=verbose,
        )
        train_docs = list(read_json_file(train_loc))
        train(English, train_docs, model_dir, **train_kwargs)

    # Writing parses to ``out_loc`` is intentionally not performed here.
    dev_docs = list(read_json_file(dev_loc))
    scorer = evaluate(English, dev_docs, model_dir,
                      gold_preproc=gold_preproc, verbose=verbose)

    # Each attribute is read right before it is printed, in report order.
    for label, attr in (('TOK', 'token_acc'), ('POS', 'tags_acc'),
                        ('UAS', 'uas'), ('LAS', 'las'),
                        ('NER P', 'ents_p'), ('NER R', 'ents_r'),
                        ('NER F', 'ents_f')):
        print(label, getattr(scorer, attr))
예제 #9
0
파일: train.py 프로젝트: Develer/spaCy
def main(language, train_loc, dev_loc, model_dir, n_sents=0, n_iter=15, out_loc="", verbose=False,
         debug=False, corruption_level=0.0, gold_preproc=False, eval_only=False, pseudoprojective=False):
    """Train a model for ``language`` (unless ``eval_only``), evaluate it and print scores.

    Args:
        language: Language code; ``'en'`` selects ``English`` and ``'de'``
            selects ``German``.
        train_loc: Training corpus location, read with ``read_json_file``.
        dev_loc: Development corpus location, read with ``read_json_file``.
        model_dir: Model directory handed to ``train``, ``write_parses`` and ``evaluate``.
        n_sents, n_iter, corruption_level, pseudoprojective: Forwarded to ``train``.
        out_loc: When non-empty, ``write_parses`` is called with this location.
        verbose, gold_preproc: Forwarded to ``train`` and ``evaluate``.
        debug: Selects the ``'debug'`` feature set instead of ``'basic'``.
        eval_only: When true, training is skipped entirely.

    Raises:
        ValueError: If ``language`` is not one of the supported codes.
    """
    languages = {'en': English, 'de': German}
    lang = languages.get(language)
    if lang is None:
        # Fail here with a clear message instead of passing None on to
        # train/evaluate, where it would surface as an unrelated error.
        raise ValueError('Unsupported language %r; expected one of: %s'
                         % (language, ', '.join(sorted(languages))))

    if not eval_only:
        gold_train = list(read_json_file(train_loc))
        train(lang, gold_train, model_dir,
              feat_set='basic' if not debug else 'debug',
              gold_preproc=gold_preproc, n_sents=n_sents,
              corruption_level=corruption_level, n_iter=n_iter,
              verbose=verbose, pseudoprojective=pseudoprojective)
    if out_loc:
        write_parses(lang, dev_loc, model_dir, out_loc)
    scorer = evaluate(lang, list(read_json_file(dev_loc)),
                      model_dir, gold_preproc=gold_preproc, verbose=verbose)
    print('TOK', scorer.token_acc)
    print('POS', scorer.tags_acc)
    print('UAS', scorer.uas)
    print('LAS', scorer.las)

    print('NER P', scorer.ents_p)
    print('NER R', scorer.ents_r)
    print('NER F', scorer.ents_f)
예제 #10
0
파일: train.py 프로젝트: nournia/spaCy
def write_parses(Language, dev_loc, model_dir, out_loc, beam_width=None):
    """Parse the dev corpus, score the result and write one line per token.

    Each output line holds the token text, tag, head text and dependency
    label, separated by tabs, encoded as UTF-8.

    Args:
        Language: Callable invoked as ``Language(data_dir=model_dir)`` to
            build the pipeline.
        dev_loc: Corpus location, read with ``read_json_file``.
        model_dir: Model directory passed to ``Language``.
        out_loc: Path of the output file (overwritten).
        beam_width: When not ``None``, assigned to ``nlp.parser.cfg.beam_width``.

    Returns:
        The ``Scorer`` that was fed every (tokens, gold) pair.
    """
    nlp = Language(data_dir=model_dir)
    if beam_width is not None:
        nlp.parser.cfg.beam_width = beam_width
    gold_tuples = read_json_file(dev_loc)
    scorer = Scorer()
    # The context manager flushes and closes the file even if parsing raises.
    with codecs.open(out_loc, "w", "utf8") as out_file:
        for raw_text, sents in gold_tuples:
            sents = _merge_sents(sents)
            for annot_tuples, brackets in sents:
                if raw_text is None:
                    # No raw text: start from the gold tokenisation and run
                    # each pipeline component explicitly.
                    tokens = nlp.tokenizer.tokens_from_list(annot_tuples[1])
                    nlp.tagger(tokens)
                    nlp.entity(tokens)
                    nlp.parser(tokens)
                else:
                    tokens = nlp(raw_text, merge_mwes=False)
                gold = GoldParse(tokens, annot_tuples)
                scorer.score(tokens, gold, verbose=False)
                for t in tokens:
                    out_file.write("%s\t%s\t%s\t%s\n" % (t.orth_, t.tag_, t.head.orth_, t.dep_))
    return scorer
예제 #11
0
파일: train.py 프로젝트: domsooch/spaCy
def write_parses(Language, dev_loc, model_dir, out_loc, beam_width=None):
    """Parse the dev corpus, score the result and write one line per token.

    Each output line holds the token text, tag, head text and dependency
    label, separated by tabs, encoded as UTF-8.

    Args:
        Language: Callable invoked as ``Language(data_dir=model_dir)`` to
            build the pipeline.
        dev_loc: Corpus location, read with ``read_json_file``.
        model_dir: Model directory passed to ``Language``.
        out_loc: Path of the output file (overwritten).
        beam_width: When not ``None``, assigned to ``nlp.parser.cfg.beam_width``.

    Returns:
        The ``Scorer`` that was fed every (tokens, gold) pair.
    """
    nlp = Language(data_dir=model_dir)
    if beam_width is not None:
        nlp.parser.cfg.beam_width = beam_width
    gold_tuples = read_json_file(dev_loc)
    scorer = Scorer()
    # The context manager flushes and closes the file even if parsing raises.
    with codecs.open(out_loc, 'w', 'utf8') as out_file:
        for raw_text, sents in gold_tuples:
            sents = _merge_sents(sents)
            for annot_tuples, brackets in sents:
                if raw_text is None:
                    # No raw text: start from the gold tokenisation and run
                    # each pipeline component explicitly.
                    tokens = nlp.tokenizer.tokens_from_list(annot_tuples[1])
                    nlp.tagger(tokens)
                    nlp.entity(tokens)
                    nlp.parser(tokens)
                else:
                    tokens = nlp(raw_text, merge_mwes=False)
                gold = GoldParse(tokens, annot_tuples)
                scorer.score(tokens, gold, verbose=False)
                for t in tokens:
                    out_file.write('%s\t%s\t%s\t%s\n' %
                                   (t.orth_, t.tag_, t.head.orth_, t.dep_))
    return scorer
예제 #12
0
파일: train.py 프로젝트: anukat2015/spaCy
def write_parses(Language, dev_loc, model_dir, out_loc):
    """Parse the dev corpus and write the parses, one token per line.

    For every sentence in each parsed document, each non-space token is
    written as ``index<TAB>text<TAB>tag<TAB>head text<TAB>dep label``; a blank
    line follows each sentence. No scoring is performed and nothing is
    returned.

    Args:
        Language: Callable invoked as ``Language(data_dir=model_dir)`` to
            build the pipeline.
        dev_loc: Corpus location, read with ``read_json_file``.
        model_dir: Model directory passed to ``Language``.
        out_loc: Path of the UTF-8 output file (overwritten).
    """
    nlp = Language(data_dir=model_dir)
    gold_tuples = read_json_file(dev_loc)
    # ``encoding`` must be passed by keyword: the third positional parameter
    # of io.open is ``buffering``, so a positional 'utf8' raises TypeError.
    # The context manager also closes the file if parsing raises.
    with io.open(out_loc, 'w', encoding='utf8') as out_file:
        for raw_text, sents in gold_tuples:
            sents = _merge_sents(sents)
            for annot_tuples, brackets in sents:
                if raw_text is None:
                    # No raw text: start from the gold tokenisation and run
                    # each pipeline component explicitly.
                    tokens = nlp.tokenizer.tokens_from_list(annot_tuples[1])
                    nlp.tagger(tokens)
                    nlp.entity(tokens)
                    nlp.parser(tokens)
                else:
                    tokens = nlp(raw_text)
                for sent in tokens.sents:
                    for t in sent:
                        if not t.is_space:
                            out_file.write(
                                '%d\t%s\t%s\t%s\t%s\n' %
                                (t.i, t.orth_, t.tag_, t.head.orth_, t.dep_))
                    out_file.write('\n')
예제 #13
0
파일: train.py 프로젝트: Develer/spaCy
def write_parses(Language, dev_loc, model_dir, out_loc):
    """Parse the dev corpus and write the parses, one token per line.

    For every sentence in each parsed document, each non-space token is
    written as ``index<TAB>text<TAB>tag<TAB>head text<TAB>dep label``; a blank
    line follows each sentence. No scoring is performed and nothing is
    returned.

    Args:
        Language: Callable invoked as ``Language(data_dir=model_dir)`` to
            build the pipeline.
        dev_loc: Corpus location, read with ``read_json_file``.
        model_dir: Model directory passed to ``Language``.
        out_loc: Path of the UTF-8 output file (overwritten).
    """
    nlp = Language(data_dir=model_dir)
    gold_tuples = read_json_file(dev_loc)
    # ``encoding`` must be passed by keyword: the third positional parameter
    # of io.open is ``buffering``, so a positional 'utf8' raises TypeError.
    # The context manager also closes the file if parsing raises.
    with io.open(out_loc, 'w', encoding='utf8') as out_file:
        for raw_text, sents in gold_tuples:
            sents = _merge_sents(sents)
            for annot_tuples, brackets in sents:
                if raw_text is None:
                    # No raw text: start from the gold tokenisation and run
                    # each pipeline component explicitly.
                    tokens = nlp.tokenizer.tokens_from_list(annot_tuples[1])
                    nlp.tagger(tokens)
                    nlp.entity(tokens)
                    nlp.parser(tokens)
                else:
                    tokens = nlp(raw_text)
                for sent in tokens.sents:
                    for t in sent:
                        if not t.is_space:
                            out_file.write(
                                '%d\t%s\t%s\t%s\t%s\n' % (t.i, t.orth_, t.tag_, t.head.orth_, t.dep_)
                            )
                    out_file.write('\n')
예제 #14
0
Developed and tested for spaCy 2.0.6. Updated for v2.2.2
"""
import random
import plac
import spacy
import os.path
from spacy.tokens import Doc
from spacy.gold import read_json_file, GoldParse

# Fix the seed of the ``random`` module so its output is the same on every run.
random.seed(0)

# Directory containing this script; data files are resolved relative to it.
PWD = os.path.dirname(__file__)

# Everything yielded by read_json_file for the bundled example file,
# materialised into a list at import time.
TRAIN_DATA = list(
    read_json_file(
        os.path.join(PWD, "ner_example_data", "ner-sent-per-line.json")))


def get_position_label(i, words, tags, heads, labels, ents):
    """Return a label describing where word ``i`` sits in the document.

    Only ``i`` and ``len(words)`` are consulted; ``tags``, ``heads``,
    ``labels`` and ``ents`` are accepted but unused.

    Returns:
        "short-doc" when the document has fewer than 20 words; otherwise
        "first-word" (``i == 0``), "early-word" (``i < 10``), "mid-word"
        (``i < 20``), "last-word" (final index), or "late-word" for every
        remaining position.
    """
    if len(words) < 20:
        return "short-doc"
    if i == 0:
        return "first-word"
    if i < 10:
        return "early-word"
    if i < 20:
        return "mid-word"
    if i == len(words) - 1:
        return "last-word"
    # Explicit fallback so every position in a long document gets a string
    # label instead of an implicit None.
    return "late-word"
예제 #15
0
The specific example here is not necessarily a good idea --- but it shows
how an arbitrary objective function for some word can be used.

Developed and tested for spaCy 2.0.6
'''
import random
import plac
import spacy
import os.path
from spacy.gold import read_json_file, GoldParse

# Fix the seed of the ``random`` module so its output is the same on every run.
random.seed(0)

# Directory containing this script; data files are resolved relative to it.
PWD = os.path.dirname(__file__)

# Everything yielded by read_json_file for 'training-data.json' next to this
# script, materialised into a list at import time.
TRAIN_DATA = list(read_json_file(os.path.join(PWD, 'training-data.json')))


def get_position_label(i, words, tags, heads, labels, ents):
    '''Return a label describing where word ``i`` sits in the document.

    Only ``i`` and ``len(words)`` are consulted; ``tags``, ``heads``,
    ``labels`` and ``ents`` are accepted but unused.

    Returns:
        'short-doc' when the document has fewer than 20 words; otherwise
        'first-word' (``i == 0``), 'early-word' (``i < 10``), 'mid-word'
        (``i < 20``), 'last-word' (final index), or 'late-word' for every
        remaining position.
    '''
    if len(words) < 20:
        return 'short-doc'
    if i == 0:
        return 'first-word'
    if i < 10:
        return 'early-word'
    if i < 20:
        return 'mid-word'
    if i == len(words) - 1:
        return 'last-word'
    # Explicit fallback so every position in a long document gets a string
    # label instead of an implicit None.
    return 'late-word'
The specific example here is not necessarily a good idea --- but it shows
how an arbitrary objective function for some word can be used.

Developed and tested for spaCy 2.0.6
'''
import random
import plac
import spacy
import os.path
from spacy.gold import read_json_file, GoldParse

# Fix the seed of the ``random`` module so its output is the same on every run.
random.seed(0)

# Directory containing this script; data files are resolved relative to it.
PWD = os.path.dirname(__file__)

# Everything yielded by read_json_file for 'training-data.json' next to this
# script, materialised into a list at import time.
TRAIN_DATA = list(read_json_file(os.path.join(PWD, 'training-data.json')))



def get_position_label(i, words, tags, heads, labels, ents):
    '''Return labels indicating the position of the word in the document.
    '''
    if len(words) < 20:
        return 'short-doc'
    elif i == 0:
        return 'first-word'
    elif i < 10:
        return 'early-word'
    elif i < 20:
        return 'mid-word'
    elif i == len(words)-1: