Пример #1
0
def train(Language,
          gold_tuples,
          model_dir,
          n_iter=15,
          feat_set=u'basic',
          seed=0,
          gold_preproc=False,
          force_gold=False):
    dep_model_dir = path.join(model_dir, 'deps')
    pos_model_dir = path.join(model_dir, 'pos')
    if path.exists(dep_model_dir):
        shutil.rmtree(dep_model_dir)
    if path.exists(pos_model_dir):
        shutil.rmtree(pos_model_dir)
    os.mkdir(dep_model_dir)
    os.mkdir(pos_model_dir)

    Config.write(dep_model_dir,
                 'config',
                 features=feat_set,
                 seed=seed,
                 labels=ArcEager.get_labels(gold_tuples))

    nlp = Language(data_dir=model_dir,
                   tagger=False,
                   parser=False,
                   entity=False)
    nlp.tagger = Tagger.blank(nlp.vocab, Tagger.default_templates())
    nlp.parser = Parser.from_dir(dep_model_dir, nlp.vocab.strings, ArcEager)

    print("Itn.\tP.Loss\tUAS\tNER F.\tTag %\tToken %")
    for itn in range(n_iter):
        scorer = Scorer()
        loss = 0
        for _, sents in gold_tuples:
            for annot_tuples, _ in sents:
                if len(annot_tuples[1]) == 1:
                    continue

                score_model(scorer, nlp, None, annot_tuples, verbose=False)

                tokens = nlp.tokenizer.tokens_from_list(annot_tuples[1])
                nlp.tagger(tokens)
                gold = GoldParse(tokens, annot_tuples, make_projective=True)
                if not gold.is_projective:
                    raise Exception(
                        "Non-projective sentence in training, after we should "
                        "have enforced projectivity: %s" % annot_tuples)

                loss += nlp.parser.train(tokens, gold)
                nlp.tagger.train(tokens, gold.tags)
        random.shuffle(gold_tuples)
        print('%d:\t%d\t%.3f\t%.3f\t%.3f' %
              (itn, loss, scorer.uas, scorer.tags_acc, scorer.token_acc))
    print('end training')
    nlp.end_training(model_dir)
    print('done')
Пример #2
0
def train(Language, train_loc, model_dir, n_iter=15, feat_set=u'basic', seed=0,
          gold_preproc=False, n_sents=0):
    dep_model_dir = path.join(model_dir, 'deps')
    pos_model_dir = path.join(model_dir, 'pos')
    ner_model_dir = path.join(model_dir, 'ner')
    if path.exists(dep_model_dir):
        shutil.rmtree(dep_model_dir)
    if path.exists(pos_model_dir):
        shutil.rmtree(pos_model_dir)
    if path.exists(ner_model_dir):
        shutil.rmtree(ner_model_dir)
    os.mkdir(dep_model_dir)
    os.mkdir(pos_model_dir)
    os.mkdir(ner_model_dir)

    setup_model_dir(sorted(POS_TAGS.keys()), POS_TAGS, POS_TEMPLATES, pos_model_dir)

    gold_tuples = read_docparse_file(train_loc)

    Config.write(dep_model_dir, 'config', features=feat_set, seed=seed,
                 labels=Language.ParserTransitionSystem.get_labels(gold_tuples))
    Config.write(ner_model_dir, 'config', features='ner', seed=seed,
                 labels=Language.EntityTransitionSystem.get_labels(gold_tuples))

    if n_sents > 0:
        gold_tuples = gold_tuples[:n_sents]
    nlp = Language(data_dir=model_dir)

    print "Itn.\tUAS\tNER F.\tTag %"
    for itn in range(n_iter):
        scorer = Scorer()
        for raw_text, segmented_text, annot_tuples in gold_tuples:
            # Eval before train
            tokens = nlp(raw_text, merge_mwes=False)
            gold = GoldParse(tokens, annot_tuples)
            scorer.score(tokens, gold, verbose=False)

            if gold_preproc:
                sents = [nlp.tokenizer.tokens_from_list(s) for s in segmented_text]
            else:
                sents = [nlp.tokenizer(raw_text)]
            for tokens in sents:
                gold = GoldParse(tokens, annot_tuples)
                nlp.tagger(tokens)
                nlp.parser.train(tokens, gold)
                if gold.ents:
                    nlp.entity.train(tokens, gold)
                nlp.tagger.train(tokens, gold.tags)

        print '%d:\t%.3f\t%.3f\t%.3f' % (itn, scorer.uas, scorer.ents_f, scorer.tags_acc)
        random.shuffle(gold_tuples)
    nlp.parser.model.end_training()
    nlp.entity.model.end_training()
    nlp.tagger.model.end_training()
    nlp.vocab.strings.dump(path.join(model_dir, 'vocab', 'strings.txt'))
Пример #3
0
    def setup_model_dir(model_dir, labels, templates, feat_set='basic', seed=0):
        dep_model_dir = path.join(model_dir, 'deps')
        pos_model_dir = path.join(model_dir, 'pos')
        if path.exists(dep_model_dir):
            shutil.rmtree(dep_model_dir)
        if path.exists(pos_model_dir):
            shutil.rmtree(pos_model_dir)
        os.mkdir(dep_model_dir)
        os.mkdir(pos_model_dir)

        Config.write(dep_model_dir, 'config', features=feat_set, seed=seed,
                     labels=labels)
Пример #4
0
def train(Language,
          sentences,
          model_dir,
          n_iter=15,
          feat_set=u'basic',
          seed=0,
          gold_preproc=False,
          force_gold=False):
    dep_model_dir = path.join(model_dir, 'deps')
    pos_model_dir = path.join(model_dir, 'pos')
    if path.exists(dep_model_dir):
        shutil.rmtree(dep_model_dir)
    if path.exists(pos_model_dir):
        shutil.rmtree(pos_model_dir)
    os.mkdir(dep_model_dir)
    os.mkdir(pos_model_dir)
    setup_model_dir(sorted(POS_TAGS.keys()), POS_TAGS, POS_TEMPLATES,
                    pos_model_dir)

    Config.write(dep_model_dir,
                 'config',
                 features=feat_set,
                 seed=seed,
                 labels=Language.ParserTransitionSystem.get_labels(sentences))

    nlp = Language(data_dir=model_dir)

    for itn in range(n_iter):
        scorer = Scorer()
        for _1, _2, (_3, words, tags, heads, labels, ner) in sentences:
            tokens = nlp.tokenizer.tokens_from_list(words)
            assert len(words) == len(tokens) == len(heads)
            string_indices = [token.idx for token in tokens]
            heads = [string_indices[head] for head in heads]
            annot_tuples = (string_indices, words, tags, heads, labels, ner)
            nlp.tagger.tag_from_strings(tokens, tags)
            # Eval before train
            nlp.parser(tokens)
            scorer.score(tokens,
                         GoldParse(tokens, annot_tuples),
                         verbose=False)
            # Make fresh tokens, and train
            tokens = nlp.tokenizer.tokens_from_list(words)
            nlp.tagger.tag_from_strings(tokens, tags)
            try:
                nlp.parser.train(tokens, GoldParse(tokens, annot_tuples))
            except AssertionError:
                continue
        print '%d:\t%.3f\t%.3f' % (itn, scorer.uas, scorer.las)
        random.shuffle(sentences)
    nlp.parser.model.end_training()
    nlp.tagger.model.end_training()
    nlp.vocab.strings.dump(path.join(model_dir, 'vocab', 'strings.txt'))
Пример #5
0
def train(Language,
          paragraphs,
          model_dir,
          n_iter=15,
          feat_set=u'basic',
          seed=0,
          gold_preproc=False,
          force_gold=False):
    dep_model_dir = path.join(model_dir, 'deps')
    pos_model_dir = path.join(model_dir, 'pos')
    if path.exists(dep_model_dir):
        shutil.rmtree(dep_model_dir)
    if path.exists(pos_model_dir):
        shutil.rmtree(pos_model_dir)
    os.mkdir(dep_model_dir)
    os.mkdir(pos_model_dir)
    setup_model_dir(sorted(POS_TAGS.keys()), POS_TAGS, POS_TEMPLATES,
                    pos_model_dir)

    left_labels, right_labels = get_labels(paragraphs)
    Config.write(dep_model_dir,
                 'config',
                 features=feat_set,
                 seed=seed,
                 left_labels=left_labels,
                 right_labels=right_labels)

    nlp = Language()

    for itn in range(n_iter):
        heads_corr = 0
        pos_corr = 0
        n_tokens = 0
        for tokens, tag_strs, heads, labels in iter_data(
                paragraphs, nlp.tokenizer, gold_preproc=gold_preproc):
            nlp.tagger(tokens)
            try:
                heads_corr += nlp.parser.train_sent(tokens,
                                                    heads,
                                                    labels,
                                                    force_gold=force_gold)
            except OracleError:
                continue
            pos_corr += nlp.tagger.train(tokens, tag_strs)
            n_tokens += len(tokens)
        acc = float(heads_corr) / n_tokens
        pos_acc = float(pos_corr) / n_tokens
        print '%d: ' % itn, '%.3f' % acc, '%.3f' % pos_acc
        random.shuffle(paragraphs)
    nlp.parser.model.end_training()
    nlp.tagger.model.end_training()
    return acc
Пример #6
0
def train(Language, gold_tuples, model_dir, n_iter=15, feat_set=u'basic', seed=0,
          gold_preproc=False, force_gold=False):
    dep_model_dir = path.join(model_dir, 'deps')
    pos_model_dir = path.join(model_dir, 'pos')
    if path.exists(dep_model_dir):
        shutil.rmtree(dep_model_dir)
    if path.exists(pos_model_dir):
        shutil.rmtree(pos_model_dir)
    os.mkdir(dep_model_dir)
    os.mkdir(pos_model_dir)

    Config.write(dep_model_dir, 'config', features=feat_set, seed=seed,
                 labels=ArcEager.get_labels(gold_tuples))

    nlp = Language(data_dir=model_dir, tagger=False, parser=False, entity=False)
    nlp.tagger = Tagger.blank(nlp.vocab, Tagger.default_templates())
    nlp.parser = Parser.from_dir(dep_model_dir, nlp.vocab.strings, ArcEager)
 
    print("Itn.\tP.Loss\tUAS\tNER F.\tTag %\tToken %")
    for itn in range(n_iter):
        scorer = Scorer()
        loss = 0
        for _, sents in gold_tuples:
            for annot_tuples, _ in sents:
                if len(annot_tuples[1]) == 1:
                    continue

                score_model(scorer, nlp, None, annot_tuples, verbose=False)

                tokens = nlp.tokenizer.tokens_from_list(annot_tuples[1])
                nlp.tagger(tokens)
                gold = GoldParse(tokens, annot_tuples, make_projective=True)
                if not gold.is_projective:
                    raise Exception(
                        "Non-projective sentence in training, after we should "
                        "have enforced projectivity: %s" % annot_tuples
                    )
 
                loss += nlp.parser.train(tokens, gold)
                nlp.tagger.train(tokens, gold.tags)
        random.shuffle(gold_tuples)
        print('%d:\t%d\t%.3f\t%.3f\t%.3f' % (itn, loss, scorer.uas,
                                             scorer.tags_acc, scorer.token_acc))
    print('end training')
    nlp.end_training(model_dir)
    print('done')
Пример #7
0
    def from_dir(cls, tag_map, model_dir):
        vocab = Vocab(tag_map=tag_map, get_lex_attr=Language.default_lex_attrs())
        tokenizer = Tokenizer(vocab, {}, None, None, None)
        tagger = Tagger.blank(vocab, TAGGER_TEMPLATES)

        cfg = Config.read(path.join(model_dir, 'deps'), 'config')
        parser = Parser.from_dir(path.join(model_dir, 'deps'), vocab.strings, ArcEager)
        return cls(vocab, tokenizer, tagger, parser)
Пример #8
0
    def from_dir(cls, tag_map, model_dir):
        vocab = Vocab(tag_map=tag_map,
                      get_lex_attr=Language.default_lex_attrs())
        tokenizer = Tokenizer(vocab, {}, None, None, None)
        tagger = Tagger.blank(vocab, TAGGER_TEMPLATES)

        cfg = Config.read(path.join(model_dir, 'deps'), 'config')
        parser = Parser.from_dir(path.join(model_dir, 'deps'), vocab.strings,
                                 ArcEager)
        return cls(vocab, tokenizer, tagger, parser)
Пример #9
0
    def setup_model_dir(model_dir,
                        labels,
                        templates,
                        feat_set='basic',
                        seed=0):
        dep_model_dir = path.join(model_dir, 'deps')
        pos_model_dir = path.join(model_dir, 'pos')
        if path.exists(dep_model_dir):
            shutil.rmtree(dep_model_dir)
        if path.exists(pos_model_dir):
            shutil.rmtree(pos_model_dir)
        os.mkdir(dep_model_dir)
        os.mkdir(pos_model_dir)

        Config.write(dep_model_dir,
                     'config',
                     features=feat_set,
                     seed=seed,
                     labels=labels)
Пример #10
0
def train(Language, sentences, model_dir, n_iter=15, feat_set=u'basic', seed=0,
          gold_preproc=False, force_gold=False):
    dep_model_dir = path.join(model_dir, 'deps')
    pos_model_dir = path.join(model_dir, 'pos')
    if path.exists(dep_model_dir):
        shutil.rmtree(dep_model_dir)
    if path.exists(pos_model_dir):
        shutil.rmtree(pos_model_dir)
    os.mkdir(dep_model_dir)
    os.mkdir(pos_model_dir)
    setup_model_dir(sorted(POS_TAGS.keys()), POS_TAGS, POS_TEMPLATES,
                    pos_model_dir)

    Config.write(dep_model_dir, 'config', features=feat_set, seed=seed,
                 labels=Language.ParserTransitionSystem.get_labels(sentences))

    nlp = Language(data_dir=model_dir)
    
    for itn in range(n_iter):
        scorer = Scorer()
        for _1, _2, (_3, words, tags, heads, labels, ner) in sentences:
            tokens = nlp.tokenizer.tokens_from_list(words)
            assert len(words) == len(tokens) == len(heads)
            string_indices = [token.idx for token in tokens]
            heads = [string_indices[head] for head in heads]
            annot_tuples = (string_indices, words, tags, heads, labels, ner)
            nlp.tagger.tag_from_strings(tokens, tags)
            # Eval before train
            nlp.parser(tokens)
            scorer.score(tokens, GoldParse(tokens, annot_tuples), verbose=False)
            # Make fresh tokens, and train
            tokens = nlp.tokenizer.tokens_from_list(words)
            nlp.tagger.tag_from_strings(tokens, tags)
            try:
                nlp.parser.train(tokens, GoldParse(tokens, annot_tuples))
            except AssertionError:
                continue
	print '%d:\t%.3f\t%.3f' % (itn, scorer.uas, scorer.las)
        random.shuffle(sentences)
    nlp.parser.model.end_training()
    nlp.tagger.model.end_training()
    nlp.vocab.strings.dump(path.join(model_dir, 'vocab', 'strings.txt'))
Пример #11
0
def train(Language, sents, model_dir, n_iter=15, feat_set=u'basic', seed=0):
    dep_model_dir = path.join(model_dir, 'deps')
    pos_model_dir = path.join(model_dir, 'pos')
    if path.exists(dep_model_dir):
        shutil.rmtree(dep_model_dir)
    if path.exists(pos_model_dir):
        shutil.rmtree(pos_model_dir)
    os.mkdir(dep_model_dir)
    os.mkdir(pos_model_dir)
    setup_model_dir(sorted(POS_TAGS.keys()), POS_TAGS, POS_TEMPLATES,
                    pos_model_dir)

    left_labels, right_labels = get_labels(sents)
    Config.write(dep_model_dir,
                 'config',
                 features=feat_set,
                 seed=seed,
                 left_labels=left_labels,
                 right_labels=right_labels)

    nlp = Language()

    for itn in range(n_iter):
        heads_corr = 0
        pos_corr = 0
        n_tokens = 0
        for words, heads, labels, tags in sents:
            tags = [nlp.tagger.tag_names.index(tag) for tag in tags]
            tokens = nlp.tokenizer.tokens_from_list(words)
            nlp.tagger(tokens)
            heads_corr += nlp.parser.train_sent(tokens, heads, labels)
            pos_corr += nlp.tagger.train(tokens, tags)
            n_tokens += len(tokens)
        acc = float(heads_corr) / n_tokens
        pos_acc = float(pos_corr) / n_tokens
        print '%d: ' % itn, '%.3f' % acc, '%.3f' % pos_acc
        random.shuffle(sents)
    nlp.parser.model.end_training()
    nlp.tagger.model.end_training()
    #nlp.parser.model.dump(path.join(dep_model_dir, 'model'), freq_thresh=0)
    return acc
Пример #12
0
def train(Language, paragraphs, model_dir, n_iter=15, feat_set=u'basic', seed=0,
          gold_preproc=False, force_gold=False):
    dep_model_dir = path.join(model_dir, 'deps')
    pos_model_dir = path.join(model_dir, 'pos')
    if path.exists(dep_model_dir):
        shutil.rmtree(dep_model_dir)
    if path.exists(pos_model_dir):
        shutil.rmtree(pos_model_dir)
    os.mkdir(dep_model_dir)
    os.mkdir(pos_model_dir)
    setup_model_dir(sorted(POS_TAGS.keys()), POS_TAGS, POS_TEMPLATES,
                    pos_model_dir)

    left_labels, right_labels = get_labels(paragraphs)
    Config.write(dep_model_dir, 'config', features=feat_set, seed=seed,
                 left_labels=left_labels, right_labels=right_labels)

    nlp = Language()
    
    for itn in range(n_iter):
        heads_corr = 0
        pos_corr = 0
        n_tokens = 0
        for tokens, tag_strs, heads, labels in iter_data(paragraphs, nlp.tokenizer,
                                                         gold_preproc=gold_preproc):
            nlp.tagger(tokens)
            try:
                heads_corr += nlp.parser.train_sent(tokens, heads, labels, force_gold=force_gold)
            except OracleError:
                continue
            pos_corr += nlp.tagger.train(tokens, tag_strs)
            n_tokens += len(tokens)
        acc = float(heads_corr) / n_tokens
        pos_acc = float(pos_corr) / n_tokens
        print '%d: ' % itn, '%.3f' % acc, '%.3f' % pos_acc
        random.shuffle(paragraphs)
    nlp.parser.model.end_training()
    nlp.tagger.model.end_training()
    return acc
Пример #13
0
def train(Language, gold_tuples, model_dir, n_iter=15, feat_set=u'basic',
          seed=0, gold_preproc=False, n_sents=0, corruption_level=0,
          beam_width=1, verbose=False,
          use_orig_arc_eager=False):
    dep_model_dir = path.join(model_dir, 'deps')
    ner_model_dir = path.join(model_dir, 'ner')
    pos_model_dir = path.join(model_dir, 'pos')
    if path.exists(dep_model_dir):
        shutil.rmtree(dep_model_dir)
    if path.exists(ner_model_dir):
        shutil.rmtree(ner_model_dir)
    if path.exists(pos_model_dir):
        shutil.rmtree(pos_model_dir)
    os.mkdir(dep_model_dir)
    os.mkdir(ner_model_dir)
    os.mkdir(pos_model_dir)

    Config.write(dep_model_dir, 'config', features=feat_set, seed=seed,
                 labels=ArcEager.get_labels(gold_tuples),
                 beam_width=beam_width)
    Config.write(ner_model_dir, 'config', features='ner', seed=seed,
                 labels=BiluoPushDown.get_labels(gold_tuples),
                 beam_width=0)

    if n_sents > 0:
        gold_tuples = gold_tuples[:n_sents]

    nlp = Language(data_dir=model_dir, tagger=False, parser=False, entity=False)
    nlp.tagger = Tagger.blank(nlp.vocab, Tagger.default_templates())
    nlp.parser = Parser.from_dir(dep_model_dir, nlp.vocab.strings, ArcEager)
    nlp.entity = Parser.from_dir(ner_model_dir, nlp.vocab.strings, BiluoPushDown)
    print("Itn.\tP.Loss\tUAS\tNER F.\tTag %\tToken %")
    for itn in range(n_iter):
        scorer = Scorer()
        loss = 0
        for raw_text, sents in gold_tuples:
            if gold_preproc:
                raw_text = None
            else:
                sents = _merge_sents(sents)
            for annot_tuples, ctnt in sents:
                if len(annot_tuples[1]) == 1:
                    continue
                score_model(scorer, nlp, raw_text, annot_tuples,
                            verbose=verbose if itn >= 2 else False)
                if raw_text is None:
                    words = add_noise(annot_tuples[1], corruption_level)
                    tokens = nlp.tokenizer.tokens_from_list(words)
                else:
                    raw_text = add_noise(raw_text, corruption_level)
                    tokens = nlp.tokenizer(raw_text)
                nlp.tagger(tokens)
                gold = GoldParse(tokens, annot_tuples, make_projective=True)
                if not gold.is_projective:
                    raise Exception(
                        "Non-projective sentence in training, after we should "
                        "have enforced projectivity: %s" % annot_tuples
                    )
                loss += nlp.parser.train(tokens, gold)
                nlp.entity.train(tokens, gold)
                nlp.tagger.train(tokens, gold.tags)
        random.shuffle(gold_tuples)
        print('%d:\t%d\t%.3f\t%.3f\t%.3f\t%.3f' % (itn, loss, scorer.uas, scorer.ents_f,
                                                   scorer.tags_acc,
                                                   scorer.token_acc))
    print('end training')
    nlp.end_training(model_dir)
    print('done')
Пример #14
0
def train(Language, gold_tuples, model_dir, n_iter=15, feat_set=u'basic',
          seed=0, gold_preproc=False, n_sents=0, corruption_level=0,
          beam_width=1, verbose=False,
          use_orig_arc_eager=False):
    dep_model_dir = path.join(model_dir, 'deps')
    pos_model_dir = path.join(model_dir, 'pos')
    ner_model_dir = path.join(model_dir, 'ner')
    if path.exists(dep_model_dir):
        shutil.rmtree(dep_model_dir)
    if path.exists(pos_model_dir):
        shutil.rmtree(pos_model_dir)
    if path.exists(ner_model_dir):
        shutil.rmtree(ner_model_dir)
    os.mkdir(dep_model_dir)
    os.mkdir(pos_model_dir)
    os.mkdir(ner_model_dir)

    setup_model_dir(sorted(POS_TAGS.keys()), POS_TAGS, POS_TEMPLATES, pos_model_dir)

    Config.write(dep_model_dir, 'config', features=feat_set, seed=seed,
                 labels=Language.ParserTransitionSystem.get_labels(gold_tuples),
                 beam_width=beam_width)
    Config.write(ner_model_dir, 'config', features='ner', seed=seed,
                 labels=Language.EntityTransitionSystem.get_labels(gold_tuples),
                 beam_width=0)

    if n_sents > 0:
        gold_tuples = gold_tuples[:n_sents]

    nlp = Language(data_dir=model_dir)

    print("Itn.\tP.Loss\tUAS\tNER F.\tTag %\tToken %")
    for itn in range(n_iter):
        scorer = Scorer()
        loss = 0
        for raw_text, sents in gold_tuples:
            if gold_preproc:
                raw_text = None
            else:
                sents = _merge_sents(sents)
            for annot_tuples, ctnt in sents:
                if len(annot_tuples[1]) == 1:
                    continue
                score_model(scorer, nlp, raw_text, annot_tuples,
                            verbose=verbose if itn >= 2 else False)
                if raw_text is None:
                    words = add_noise(annot_tuples[1], corruption_level)
                    tokens = nlp.tokenizer.tokens_from_list(words)
                else:
                    raw_text = add_noise(raw_text, corruption_level)
                    tokens = nlp.tokenizer(raw_text)
                nlp.tagger(tokens)
                gold = GoldParse(tokens, annot_tuples, make_projective=True)
                if not gold.is_projective:
                    raise Exception(
                        "Non-projective sentence in training, after we should "
                        "have enforced projectivity: %s" % annot_tuples
                    )
                loss += nlp.parser.train(tokens, gold)
                nlp.entity.train(tokens, gold)
                nlp.tagger.train(tokens, gold.tags)
        random.shuffle(gold_tuples)
        print('%d:\t%d\t%.3f\t%.3f\t%.3f\t%.3f' % (itn, loss, scorer.uas, scorer.ents_f,
                                                   scorer.tags_acc,
                                                   scorer.token_acc))
    nlp.end_training()
Пример #15
0
def train(Language,
          gold_tuples,
          model_dir,
          n_iter=15,
          feat_set=u'basic',
          eta=0.01,
          mu=0.9,
          nv_hidden=100,
          nv_word=10,
          nv_tag=10,
          nv_label=10,
          seed=0,
          n_sents=0,
          verbose=False):

    dep_model_dir = path.join(model_dir, 'deps')
    pos_model_dir = path.join(model_dir, 'pos')
    if path.exists(dep_model_dir):
        shutil.rmtree(dep_model_dir)
    if path.exists(pos_model_dir):
        shutil.rmtree(pos_model_dir)
    os.mkdir(dep_model_dir)
    os.mkdir(pos_model_dir)
    setup_model_dir(sorted(POS_TAGS.keys()), POS_TAGS, POS_TEMPLATES,
                    pos_model_dir)

    Config.write(
        dep_model_dir,
        'config',
        seed=seed,
        templates=tuple(),
        labels=Language.ParserTransitionSystem.get_labels(gold_tuples),
        vector_lengths=(nv_word, nv_tag, nv_label),
        hidden_nodes=nv_hidden,
        eta=eta,
        mu=mu)

    # Bake-in hyper-parameters
    optimizer = lambda loss, params: rms_prop(
        loss, params, eta=eta, rho=rho, eps=eps)
    nlp = Language(data_dir=model_dir)
    n_classes = nlp.parser.model.n_classes
    train, predict = compile_model(n_classes, nv_hidden, n_in, optimizer)
    nlp.parser.model = TheanoModel(n_classes, input_spec, train, predict,
                                   model_loc)

    if n_sents > 0:
        gold_tuples = gold_tuples[:n_sents]
    print "Itn.\tP.Loss\tUAS\tTag %\tToken %"
    log_loc = path.join(model_dir, 'job.log')
    for itn in range(n_iter):
        scorer = Scorer()
        loss = 0
        for _, sents in gold_tuples:
            for annot_tuples, ctnt in sents:
                if len(annot_tuples[1]) == 1:
                    continue
                score_model(scorer, nlp, annot_tuples)
                tokens = nlp.tokenizer.tokens_from_list(annot_tuples[1])
                nlp.tagger(tokens)
                gold = GoldParse(tokens, annot_tuples, make_projective=True)
                assert gold.is_projective
                loss += nlp.parser.train(tokens, gold)
                nlp.tagger.train(tokens, gold.tags)
        random.shuffle(gold_tuples)
        logline = '%d:\t%d\t%.3f\t%.3f\t%.3f' % (
            itn, loss, scorer.uas, scorer.tags_acc, scorer.token_acc)
        print logline
        with open(log_loc, 'aw') as file_:
            file_.write(logline + '\n')
    nlp.parser.model.end_training()
    nlp.tagger.model.end_training()
    nlp.vocab.strings.dump(path.join(model_dir, 'vocab', 'strings.txt'))
    return nlp
Пример #16
0
def train(Language, gold_tuples, model_dir, n_iter=15, feat_set=u'basic',
          seed=0, gold_preproc=False, n_sents=0, corruption_level=0,
          beam_width=1, verbose=False,
          use_orig_arc_eager=False, pseudoprojective=False):
    dep_model_dir = path.join(model_dir, 'deps')
    ner_model_dir = path.join(model_dir, 'ner')
    pos_model_dir = path.join(model_dir, 'pos')
    if path.exists(dep_model_dir):
        shutil.rmtree(dep_model_dir)
    if path.exists(ner_model_dir):
        shutil.rmtree(ner_model_dir)
    if path.exists(pos_model_dir):
        shutil.rmtree(pos_model_dir)
    os.mkdir(dep_model_dir)
    os.mkdir(ner_model_dir)
    os.mkdir(pos_model_dir)

    if pseudoprojective:
        # preprocess training data here before ArcEager.get_labels() is called
        gold_tuples = PseudoProjectivity.preprocess_training_data(gold_tuples)

    Config.write(dep_model_dir, 'config', features=feat_set, seed=seed,
                 labels=ArcEager.get_labels(gold_tuples),
                 beam_width=beam_width,projectivize=pseudoprojective)
    Config.write(ner_model_dir, 'config', features='ner', seed=seed,
                 labels=BiluoPushDown.get_labels(gold_tuples),
                 beam_width=0)

    if n_sents > 0:
        gold_tuples = gold_tuples[:n_sents]

    nlp = Language(data_dir=model_dir, tagger=False, parser=False, entity=False)
    if nlp.lang == 'de':
        nlp.vocab.morphology.lemmatizer = lambda string,pos: set([string])
    nlp.tagger = Tagger.blank(nlp.vocab, Tagger.default_templates())
    nlp.parser = Parser.from_dir(dep_model_dir, nlp.vocab.strings, ArcEager)
    nlp.entity = Parser.from_dir(ner_model_dir, nlp.vocab.strings, BiluoPushDown)
    print("Itn.\tP.Loss\tUAS\tNER F.\tTag %\tToken %")
    for itn in range(n_iter):
        scorer = Scorer()
        loss = 0
        for raw_text, sents in gold_tuples:
            if gold_preproc:
                raw_text = None
            else:
                sents = _merge_sents(sents)
            for annot_tuples, ctnt in sents:
                if len(annot_tuples[1]) == 1:
                    continue
                score_model(scorer, nlp, raw_text, annot_tuples,
                            verbose=verbose if itn >= 2 else False)
                if raw_text is None:
                    words = add_noise(annot_tuples[1], corruption_level)
                    tokens = nlp.tokenizer.tokens_from_list(words)
                else:
                    raw_text = add_noise(raw_text, corruption_level)
                    tokens = nlp.tokenizer(raw_text)
                nlp.tagger(tokens)
                gold = GoldParse(tokens, annot_tuples)
                if not gold.is_projective:
                    raise Exception("Non-projective sentence in training: %s" % annot_tuples[1])
                loss += nlp.parser.train(tokens, gold)
                nlp.entity.train(tokens, gold)
                nlp.tagger.train(tokens, gold.tags)
        random.shuffle(gold_tuples)
        print('%d:\t%d\t%.3f\t%.3f\t%.3f\t%.3f' % (itn, loss, scorer.uas, scorer.ents_f,
                                                   scorer.tags_acc,
                                                   scorer.token_acc))
    print('end training')
    nlp.end_training(model_dir)
    print('done')
Пример #17
0
def train(Language,
          train_loc,
          model_dir,
          n_iter=15,
          feat_set=u'basic',
          seed=0,
          gold_preproc=False,
          n_sents=0):
    dep_model_dir = path.join(model_dir, 'deps')
    pos_model_dir = path.join(model_dir, 'pos')
    ner_model_dir = path.join(model_dir, 'ner')
    if path.exists(dep_model_dir):
        shutil.rmtree(dep_model_dir)
    if path.exists(pos_model_dir):
        shutil.rmtree(pos_model_dir)
    if path.exists(ner_model_dir):
        shutil.rmtree(ner_model_dir)
    os.mkdir(dep_model_dir)
    os.mkdir(pos_model_dir)
    os.mkdir(ner_model_dir)

    setup_model_dir(sorted(POS_TAGS.keys()), POS_TAGS, POS_TEMPLATES,
                    pos_model_dir)

    gold_tuples = read_docparse_file(train_loc)

    Config.write(
        dep_model_dir,
        'config',
        features=feat_set,
        seed=seed,
        labels=Language.ParserTransitionSystem.get_labels(gold_tuples))
    Config.write(
        ner_model_dir,
        'config',
        features='ner',
        seed=seed,
        labels=Language.EntityTransitionSystem.get_labels(gold_tuples))

    if n_sents > 0:
        gold_tuples = gold_tuples[:n_sents]
    nlp = Language(data_dir=model_dir)

    print "Itn.\tUAS\tNER F.\tTag %"
    for itn in range(n_iter):
        scorer = Scorer()
        for raw_text, segmented_text, annot_tuples in gold_tuples:
            # Eval before train
            tokens = nlp(raw_text, merge_mwes=False)
            gold = GoldParse(tokens, annot_tuples)
            scorer.score(tokens, gold, verbose=False)

            if gold_preproc:
                sents = [
                    nlp.tokenizer.tokens_from_list(s) for s in segmented_text
                ]
            else:
                sents = [nlp.tokenizer(raw_text)]
            for tokens in sents:
                gold = GoldParse(tokens, annot_tuples)
                nlp.tagger(tokens)
                nlp.parser.train(tokens, gold)
                if gold.ents:
                    nlp.entity.train(tokens, gold)
                nlp.tagger.train(tokens, gold.tags)

        print '%d:\t%.3f\t%.3f\t%.3f' % (itn, scorer.uas, scorer.ents_f,
                                         scorer.tags_acc)
        random.shuffle(gold_tuples)
    nlp.parser.model.end_training()
    nlp.entity.model.end_training()
    nlp.tagger.model.end_training()
    nlp.vocab.strings.dump(path.join(model_dir, 'vocab', 'strings.txt'))
Пример #18
0
def train(Language, gold_tuples, model_dir, n_iter=15, feat_set=u'basic',
          eta=0.01, mu=0.9, nv_hidden=100, nv_word=10, nv_tag=10, nv_label=10,
          seed=0, n_sents=0,  verbose=False):

    dep_model_dir = path.join(model_dir, 'deps')
    pos_model_dir = path.join(model_dir, 'pos')
    if path.exists(dep_model_dir):
        shutil.rmtree(dep_model_dir)
    if path.exists(pos_model_dir):
        shutil.rmtree(pos_model_dir)
    os.mkdir(dep_model_dir)
    os.mkdir(pos_model_dir)
    setup_model_dir(sorted(POS_TAGS.keys()), POS_TAGS, POS_TEMPLATES, pos_model_dir)

    Config.write(dep_model_dir, 'config',
        seed=seed,
        templates=tuple(),
        labels=Language.ParserTransitionSystem.get_labels(gold_tuples),
        vector_lengths=(nv_word, nv_tag, nv_label),
        hidden_nodes=nv_hidden,
        eta=eta,
        mu=mu
    )
  
    # Bake-in hyper-parameters
    optimizer = lambda loss, params: rms_prop(loss, params, eta=eta, rho=rho, eps=eps)
    nlp = Language(data_dir=model_dir)
    n_classes = nlp.parser.model.n_classes
    train, predict = compile_model(n_classes, nv_hidden, n_in, optimizer)
    nlp.parser.model = TheanoModel(n_classes, input_spec, train,
                                   predict, model_loc)
 
    if n_sents > 0:
        gold_tuples = gold_tuples[:n_sents]
    print "Itn.\tP.Loss\tUAS\tTag %\tToken %"
    log_loc = path.join(model_dir, 'job.log')
    for itn in range(n_iter):
        scorer = Scorer()
        loss = 0
        for _, sents in gold_tuples:
            for annot_tuples, ctnt in sents:
                if len(annot_tuples[1]) == 1:
                    continue
                score_model(scorer, nlp, annot_tuples)
                tokens = nlp.tokenizer.tokens_from_list(annot_tuples[1])
                nlp.tagger(tokens)
                gold = GoldParse(tokens, annot_tuples, make_projective=True)
                assert gold.is_projective
                loss += nlp.parser.train(tokens, gold)
                nlp.tagger.train(tokens, gold.tags)
        random.shuffle(gold_tuples)
        logline = '%d:\t%d\t%.3f\t%.3f\t%.3f' % (itn, loss, scorer.uas,
                                                 scorer.tags_acc,
                                                 scorer.token_acc)
        print logline
        with open(log_loc, 'aw') as file_:
            file_.write(logline + '\n')
    nlp.parser.model.end_training()
    nlp.tagger.model.end_training()
    nlp.vocab.strings.dump(path.join(model_dir, 'vocab', 'strings.txt'))
    return nlp