Пример #1
0
def main(train_loc, dev_loc, model_dir, tag_map_loc=None):
    if tag_map_loc:
        with open(tag_map_loc) as file_:
            tag_map = json.loads(file_.read())
    else:
        tag_map = DEFAULT_TAG_MAP
    train_sents = list(read_conllx(train_loc))
    train_sents = PseudoProjectivity.preprocess_training_data(train_sents)

    actions = ArcEager.get_actions(gold_parses=train_sents)
    features = get_templates('basic')

    model_dir = pathlib.Path(model_dir)
    if not (model_dir / 'deps').exists():
        (model_dir / 'deps').mkdir()
    with (model_dir / 'deps' / 'config.json').open('wb') as file_:
        file_.write(
            json.dumps(
                {'pseudoprojective': True, 'labels': actions, 'features': features}).encode('utf8'))
    vocab = Vocab(lex_attr_getters=Language.Defaults.lex_attr_getters, tag_map=tag_map)
    # Populate vocab
    for _, doc_sents in train_sents:
        for (ids, words, tags, heads, deps, ner), _ in doc_sents:
            for word in words:
                _ = vocab[word]
            for dep in deps:
                _ = vocab[dep]
            for tag in tags:
                _ = vocab[tag]
            if tag_map:
                for tag in tags:
                    assert tag in tag_map, repr(tag)
    tagger = Tagger(vocab, tag_map=tag_map)
    parser = DependencyParser(vocab, actions=actions, features=features, L1=0.0)

    for itn in range(15):
        loss = 0.
        for _, doc_sents in train_sents:
            for (ids, words, tags, heads, deps, ner), _ in doc_sents:
                doc = Doc(vocab, words=words)
                gold = GoldParse(doc, tags=tags, heads=heads, deps=deps)
                tagger(doc)
                loss += parser.update(doc, gold, itn=itn)
                doc = Doc(vocab, words=words)
                tagger.update(doc, gold)
        random.shuffle(train_sents)
        scorer = score_model(vocab, tagger, parser, read_conllx(dev_loc))
        print('%d:\t%.3f\t%.3f\t%.3f' % (itn, loss, scorer.uas, scorer.tags_acc))
    nlp = Language(vocab=vocab, tagger=tagger, parser=parser)
    nlp.end_training(model_dir)
    scorer = score_model(vocab, tagger, parser, read_conllx(dev_loc))
    print('%d:\t%.3f\t%.3f\t%.3f' % (itn, scorer.uas, scorer.las, scorer.tags_acc))
Пример #2
0
def main(train_loc, dev_loc, model_dir, tag_map_loc):
    with open(tag_map_loc) as file_:
        tag_map = json.loads(file_.read())
    train_sents = list(read_conllx(train_loc))
    train_sents = PseudoProjectivity.preprocess_training_data(train_sents)

    actions = ArcEager.get_actions(gold_parses=train_sents)
    features = get_templates('basic')
    
    model_dir = pathlib.Path(model_dir)
    with (model_dir / 'deps' / 'config.json').open('w') as file_:
        json.dump({'pseudoprojective': True, 'labels': actions, 'features': features}, file_)

    vocab = Vocab(lex_attr_getters=Language.Defaults.lex_attr_getters, tag_map=tag_map)
    # Populate vocab
    for _, doc_sents in train_sents:
        for (ids, words, tags, heads, deps, ner), _ in doc_sents:
            for word in words:
                _ = vocab[word]
            for dep in deps:
                _ = vocab[dep]
            for tag in tags:
                _ = vocab[tag]
            for tag in tags:
                assert tag in tag_map, repr(tag)
    tagger = Tagger(vocab, tag_map=tag_map)
    parser = DependencyParser(vocab, actions=actions, features=features)
    
    for itn in range(15):
        for _, doc_sents in train_sents:
            for (ids, words, tags, heads, deps, ner), _ in doc_sents:
                doc = Doc(vocab, words=words)
                gold = GoldParse(doc, tags=tags, heads=heads, deps=deps)
                tagger(doc)
                parser.update(doc, gold)
                doc = Doc(vocab, words=words)
                tagger.update(doc, gold)
        random.shuffle(train_sents)
        scorer = score_model(vocab, tagger, parser, read_conllx(dev_loc))
        print('%d:\t%.3f\t%.3f' % (itn, scorer.uas, scorer.tags_acc))
    nlp = Language(vocab=vocab, tagger=tagger, parser=parser)
    nlp.end_training(model_dir)
    scorer = score_model(vocab, tagger, parser, read_conllx(dev_loc))
    print('%d:\t%.3f\t%.3f\t%.3f' % (itn, scorer.uas, scorer.las, scorer.tags_acc))
Пример #3
0
def main(lang_name, train_loc, dev_loc, model_dir, clusters_loc=None):
    LangClass = spacy.util.get_lang_class(lang_name)
    train_sents = list(read_conllx(train_loc))
    train_sents = PseudoProjectivity.preprocess_training_data(train_sents)

    actions = ArcEager.get_actions(gold_parses=train_sents)
    features = get_templates('basic')

    model_dir = pathlib.Path(model_dir)
    if not model_dir.exists():
        model_dir.mkdir()
    if not (model_dir / 'deps').exists():
        (model_dir / 'deps').mkdir()
    if not (model_dir / 'pos').exists():
        (model_dir / 'pos').mkdir()
    with (model_dir / 'deps' / 'config.json').open('wb') as file_:
        file_.write(
            json.dumps(
                {'pseudoprojective': True, 'labels': actions, 'features': features}).encode('utf8'))

    vocab = LangClass.Defaults.create_vocab()
    if not (model_dir / 'vocab').exists():
        (model_dir / 'vocab').mkdir()
    else:
        if (model_dir / 'vocab' / 'strings.json').exists():
            with (model_dir / 'vocab' / 'strings.json').open() as file_:
                vocab.strings.load(file_)
            if (model_dir / 'vocab' / 'lexemes.bin').exists():
                vocab.load_lexemes(model_dir / 'vocab' / 'lexemes.bin')

    if clusters_loc is not None:
        clusters_loc = pathlib.Path(clusters_loc)
        with clusters_loc.open() as file_:
            for line in file_:
                try:
                    cluster, word, freq = line.split()
                except ValueError:
                    continue
                lex = vocab[word]
                lex.cluster = int(cluster[::-1], 2)
    # Populate vocab
    for _, doc_sents in train_sents:
        for (ids, words, tags, heads, deps, ner), _ in doc_sents:
            for word in words:
                _ = vocab[word]
            for dep in deps:
                _ = vocab[dep]
            for tag in tags:
                _ = vocab[tag]
            if vocab.morphology.tag_map:
                for tag in tags:
                    assert tag in vocab.morphology.tag_map, repr(tag)
    tagger = Tagger(vocab)
    parser = DependencyParser(vocab, actions=actions, features=features, L1=0.0)

    for itn in range(30):
        loss = 0.
        for _, doc_sents in train_sents:
            for (ids, words, tags, heads, deps, ner), _ in doc_sents:
                doc = Doc(vocab, words=words)
                gold = GoldParse(doc, tags=tags, heads=heads, deps=deps)
                tagger(doc)
                loss += parser.update(doc, gold, itn=itn)
                doc = Doc(vocab, words=words)
                tagger.update(doc, gold)
        random.shuffle(train_sents)
        scorer = score_model(vocab, tagger, parser, read_conllx(dev_loc))
        print('%d:\t%.3f\t%.3f\t%.3f' % (itn, loss, scorer.uas, scorer.tags_acc))
    nlp = LangClass(vocab=vocab, tagger=tagger, parser=parser)
    nlp.end_training(model_dir)
    scorer = score_model(vocab, tagger, parser, read_conllx(dev_loc))
    print('%d:\t%.3f\t%.3f\t%.3f' % (itn, scorer.uas, scorer.las, scorer.tags_acc))
Пример #4
0
def train(Language,
          gold_tuples,
          model_dir,
          n_iter=15,
          feat_set=u'basic',
          seed=0,
          gold_preproc=False,
          n_sents=0,
          corruption_level=0,
          beam_width=1,
          verbose=False,
          use_orig_arc_eager=False,
          pseudoprojective=False):
    dep_model_dir = path.join(model_dir, 'deps')
    ner_model_dir = path.join(model_dir, 'ner')
    pos_model_dir = path.join(model_dir, 'pos')
    if path.exists(dep_model_dir):
        shutil.rmtree(dep_model_dir)
    if path.exists(ner_model_dir):
        shutil.rmtree(ner_model_dir)
    if path.exists(pos_model_dir):
        shutil.rmtree(pos_model_dir)
    os.mkdir(dep_model_dir)
    os.mkdir(ner_model_dir)
    os.mkdir(pos_model_dir)

    if pseudoprojective:
        # preprocess training data here before ArcEager.get_labels() is called
        gold_tuples = PseudoProjectivity.preprocess_training_data(gold_tuples)

    Config.write(dep_model_dir,
                 'config',
                 features=feat_set,
                 seed=seed,
                 labels=ArcEager.get_labels(gold_tuples),
                 beam_width=beam_width,
                 projectivize=pseudoprojective)
    Config.write(ner_model_dir,
                 'config',
                 features='ner',
                 seed=seed,
                 labels=BiluoPushDown.get_labels(gold_tuples),
                 beam_width=0)

    if n_sents > 0:
        gold_tuples = gold_tuples[:n_sents]

    nlp = Language(data_dir=model_dir,
                   tagger=False,
                   parser=False,
                   entity=False)
    if nlp.lang == 'de':
        nlp.vocab.morphology.lemmatizer = lambda string, pos: set([string])
    nlp.tagger = Tagger.blank(nlp.vocab, Tagger.default_templates())
    nlp.parser = Parser.from_dir(dep_model_dir, nlp.vocab.strings, ArcEager)
    nlp.entity = Parser.from_dir(ner_model_dir, nlp.vocab.strings,
                                 BiluoPushDown)
    print("Itn.\tP.Loss\tUAS\tNER F.\tTag %\tToken %")
    for itn in range(n_iter):
        scorer = Scorer()
        loss = 0
        for raw_text, sents in gold_tuples:
            if gold_preproc:
                raw_text = None
            else:
                sents = _merge_sents(sents)
            for annot_tuples, ctnt in sents:
                if len(annot_tuples[1]) == 1:
                    continue
                score_model(scorer,
                            nlp,
                            raw_text,
                            annot_tuples,
                            verbose=verbose if itn >= 2 else False)
                if raw_text is None:
                    words = add_noise(annot_tuples[1], corruption_level)
                    tokens = nlp.tokenizer.tokens_from_list(words)
                else:
                    raw_text = add_noise(raw_text, corruption_level)
                    tokens = nlp.tokenizer(raw_text)
                nlp.tagger(tokens)
                gold = GoldParse(tokens, annot_tuples)
                if not gold.is_projective:
                    raise Exception("Non-projective sentence in training: %s" %
                                    annot_tuples)
                loss += nlp.parser.train(tokens, gold)
                nlp.entity.train(tokens, gold)
                nlp.tagger.train(tokens, gold.tags)
        random.shuffle(gold_tuples)
        print('%d:\t%d\t%.3f\t%.3f\t%.3f\t%.3f' %
              (itn, loss, scorer.uas, scorer.ents_f, scorer.tags_acc,
               scorer.token_acc))
    print('end training')
    nlp.end_training(model_dir)
    print('done')
Пример #5
0
def train(Language, gold_tuples, model_dir, n_iter=15, feat_set=u'basic',
          seed=0, gold_preproc=False, n_sents=0, corruption_level=0,
          beam_width=1, verbose=False,
          use_orig_arc_eager=False, pseudoprojective=False):
    dep_model_dir = path.join(model_dir, 'deps')
    ner_model_dir = path.join(model_dir, 'ner')
    pos_model_dir = path.join(model_dir, 'pos')
    if path.exists(dep_model_dir):
        shutil.rmtree(dep_model_dir)
    if path.exists(ner_model_dir):
        shutil.rmtree(ner_model_dir)
    if path.exists(pos_model_dir):
        shutil.rmtree(pos_model_dir)
    os.mkdir(dep_model_dir)
    os.mkdir(ner_model_dir)
    os.mkdir(pos_model_dir)

    if pseudoprojective:
        # preprocess training data here before ArcEager.get_labels() is called
        gold_tuples = PseudoProjectivity.preprocess_training_data(gold_tuples)

    Config.write(dep_model_dir, 'config', features=feat_set, seed=seed,
                 labels=ArcEager.get_labels(gold_tuples),
                 beam_width=beam_width,projectivize=pseudoprojective)
    Config.write(ner_model_dir, 'config', features='ner', seed=seed,
                 labels=BiluoPushDown.get_labels(gold_tuples),
                 beam_width=0)

    if n_sents > 0:
        gold_tuples = gold_tuples[:n_sents]

    nlp = Language(data_dir=model_dir, tagger=False, parser=False, entity=False)
    if nlp.lang == 'de':
        nlp.vocab.morphology.lemmatizer = lambda string,pos: set([string])
    nlp.tagger = Tagger.blank(nlp.vocab, Tagger.default_templates())
    nlp.parser = Parser.from_dir(dep_model_dir, nlp.vocab.strings, ArcEager)
    nlp.entity = Parser.from_dir(ner_model_dir, nlp.vocab.strings, BiluoPushDown)
    print("Itn.\tP.Loss\tUAS\tNER F.\tTag %\tToken %")
    for itn in range(n_iter):
        scorer = Scorer()
        loss = 0
        for raw_text, sents in gold_tuples:
            if gold_preproc:
                raw_text = None
            else:
                sents = _merge_sents(sents)
            for annot_tuples, ctnt in sents:
                if len(annot_tuples[1]) == 1:
                    continue
                score_model(scorer, nlp, raw_text, annot_tuples,
                            verbose=verbose if itn >= 2 else False)
                if raw_text is None:
                    words = add_noise(annot_tuples[1], corruption_level)
                    tokens = nlp.tokenizer.tokens_from_list(words)
                else:
                    raw_text = add_noise(raw_text, corruption_level)
                    tokens = nlp.tokenizer(raw_text)
                nlp.tagger(tokens)
                gold = GoldParse(tokens, annot_tuples)
                if not gold.is_projective:
                    raise Exception("Non-projective sentence in training: %s" % annot_tuples[1])
                loss += nlp.parser.train(tokens, gold)
                nlp.entity.train(tokens, gold)
                nlp.tagger.train(tokens, gold.tags)
        random.shuffle(gold_tuples)
        print('%d:\t%d\t%.3f\t%.3f\t%.3f\t%.3f' % (itn, loss, scorer.uas, scorer.ents_f,
                                                   scorer.tags_acc,
                                                   scorer.token_acc))
    print('end training')
    nlp.end_training(model_dir)
    print('done')
Пример #6
0
def main(lang_name, train_loc, dev_loc, model_dir, clusters_loc=None):
    LangClass = spacy.util.get_lang_class(lang_name)
    train_sents = list(read_conllx(train_loc))
    train_sents = PseudoProjectivity.preprocess_training_data(train_sents)

    actions = ArcEager.get_actions(gold_parses=train_sents)
    features = get_templates('basic')

    model_dir = pathlib.Path(model_dir)
    if not model_dir.exists():
        model_dir.mkdir()
    if not (model_dir / 'deps').exists():
        (model_dir / 'deps').mkdir()
    if not (model_dir / 'pos').exists():
        (model_dir / 'pos').mkdir()
    with (model_dir / 'deps' / 'config.json').open('wb') as file_:
        file_.write(
            json.dumps(
                {'pseudoprojective': True, 'labels': actions, 'features': features}).encode('utf8'))

    vocab = LangClass.Defaults.create_vocab()
    if not (model_dir / 'vocab').exists():
        (model_dir / 'vocab').mkdir()
    else:
        if (model_dir / 'vocab' / 'strings.json').exists():
            with (model_dir / 'vocab' / 'strings.json').open() as file_:
                vocab.strings.load(file_)
            if (model_dir / 'vocab' / 'lexemes.bin').exists():
                vocab.load_lexemes(model_dir / 'vocab' / 'lexemes.bin')

    if clusters_loc is not None:
        clusters_loc = pathlib.Path(clusters_loc)
        with clusters_loc.open() as file_:
            for line in file_:
                try:
                    cluster, word, freq = line.split()
                except ValueError:
                    continue
                lex = vocab[word]
                lex.cluster = int(cluster[::-1], 2)
    # Populate vocab
    for _, doc_sents in train_sents:
        for (ids, words, tags, heads, deps, ner), _ in doc_sents:
            for word in words:
                _ = vocab[word]
            for dep in deps:
                _ = vocab[dep]
            for tag in tags:
                _ = vocab[tag]
            if vocab.morphology.tag_map:
                for tag in tags:
                    assert tag in vocab.morphology.tag_map, repr(tag)
    tagger = Tagger(vocab)
    parser = DependencyParser(vocab, actions=actions, features=features, L1=0.0)

    for itn in range(30):
        loss = 0.
        for _, doc_sents in train_sents:
            for (ids, words, tags, heads, deps, ner), _ in doc_sents:
                doc = Doc(vocab, words=words)
                gold = GoldParse(doc, tags=tags, heads=heads, deps=deps)
                tagger(doc)
                loss += parser.update(doc, gold, itn=itn)
                doc = Doc(vocab, words=words)
                tagger.update(doc, gold)
        random.shuffle(train_sents)
        scorer = score_model(vocab, tagger, parser, read_conllx(dev_loc))
        print('%d:\t%.3f\t%.3f\t%.3f' % (itn, loss, scorer.uas, scorer.tags_acc))
    nlp = Language(vocab=vocab, tagger=tagger, parser=parser)
    nlp.end_training(model_dir)
    scorer = score_model(vocab, tagger, parser, read_conllx(dev_loc))
    print('%d:\t%.3f\t%.3f\t%.3f' % (itn, scorer.uas, scorer.las, scorer.tags_acc))