Example #1
def main(args):
    logger.info('Args = {}'.format(args))
    corpus = CorpusLoader().load_corpus(CORPUS_PATH[args.corpus])
    tokenizer = TokenizerFactory().tokenizer(args.corpus)
    logger.info('Loaded corpus: {}'.format(corpus))

    model_dir = train_utils.model_dir(corpus.name, args.run_id)
    os.makedirs(model_dir, exist_ok=True)

    logger.info('Get sentences...')
    docs = list(itertools.chain(corpus.train, corpus.dev))
    sents, _ = tagging_utils.standoff_to_sents(docs, tokenizer, verbose=True)

    logger.info('Compute features...')
    feature_extractor, meta_sentence_filter = crf_util.FEATURE_EXTRACTOR[
        args.feature_extractor]
    X, y = crf_labeler.sents_to_features_and_labels(sents, feature_extractor)

    logger.info('len(X) = {}'.format(len(X)))
    logger.info('len(y) = {}'.format(len(y)))

    crf = crf_labeler.SentenceFilterCRF(ignore_sentence=meta_sentence_filter,
                                        ignored_label='O',
                                        algorithm='lbfgs',
                                        c1=0.1,
                                        c2=0.1,
                                        max_iterations=100,
                                        all_possible_transitions=True)

    logger.info('Start learning curve computation...')
    # scikit-learn changed its default parallel backend to 'loky' in 0.21, which
    # appears not to be supported by sklearn_crfsuite. Therefore, we fall back
    # to the legacy 'multiprocessing' backend.
    with parallel_backend('multiprocessing'):
        plot_learning_curve(crf,
                            'CRF learning curve (sentences: N={})'.format(
                                len(X)),
                            X,
                            y,
                            out_dir=model_dir,
                            cv=5,
                            n_jobs=12)
    logger.info('Done...')
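
plot_learning_curve is a helper local to this project. For context, here is a
minimal sketch of the underlying scikit-learn routine it presumably wraps,
using toy data and a toy estimator rather than the CRF above:

import numpy as np
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import learning_curve

rng = np.random.RandomState(0)
X = rng.randn(200, 5)
y = (X[:, 0] > 0).astype(int)

train_sizes, train_scores, val_scores = learning_curve(
    LogisticRegression(), X, y,
    train_sizes=np.linspace(0.1, 1.0, 5),  # fractions of the training set
    cv=5, n_jobs=2)

print(train_sizes)              # absolute training-set sizes
print(val_scores.mean(axis=1))  # mean validation score per training size
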
Example #2
def main(args):
    logger.info('Args = {}'.format(args))
    corpus = CorpusLoader().load_corpus(CORPUS_PATH[args.corpus])
    tokenizer = TokenizerFactory().tokenizer(args.corpus)
    logger.info('Loaded corpus: {}'.format(corpus))

    model_dir = train_utils.model_dir(corpus.name, args.run_id)
    os.makedirs(model_dir, exist_ok=True)

    logger.info('Get sentences...')
    train_sents, train_docs = tagging_utils.standoff_to_sents(corpus.train,
                                                              tokenizer,
                                                              verbose=True)
    dev_sents, dev_docs = tagging_utils.standoff_to_sents(corpus.dev,
                                                          tokenizer,
                                                          verbose=True)
    test_sents, test_docs = tagging_utils.standoff_to_sents(corpus.test,
                                                            tokenizer,
                                                            verbose=True)

    logger.info('Compute features...')
    feature_extractor, meta_sentence_filter = crf_util.FEATURE_EXTRACTOR[
        args.feature_extractor]
    X_train, y_train = crf_labeler.sents_to_features_and_labels(
        train_sents, feature_extractor)
    X_dev, y_dev = crf_labeler.sents_to_features_and_labels(
        dev_sents, feature_extractor)
    X_test, y_test = crf_labeler.sents_to_features_and_labels(
        test_sents, feature_extractor)

    logger.info('len(X_train) = {}'.format(len(X_train)))
    logger.info('len(y_train) = {}'.format(len(y_train)))
    logger.info('len(X_dev) = {}'.format(len(X_dev)))
    logger.info('len(X_test) = {}'.format(len(X_test)))

    X_train_combined = X_train + X_dev
    y_train_combined = y_train + y_dev

    train_indices = [-1] * len(X_train)
    dev_indices = [0] * len(X_dev)
    test_fold = train_indices + dev_indices

    labels = list(set(label for sent in y_train_combined for label in sent))
    labels.remove('O')
    logger.info('Labels: {}'.format(labels))
    f1_scorer = make_scorer(flat_f1_score, labels=labels, average='micro')

    crf = crf_labeler.SentenceFilterCRF(ignore_sentence=meta_sentence_filter,
                                        ignored_label='O',
                                        algorithm='lbfgs',
                                        max_iterations=100,
                                        all_possible_transitions=True)
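
    # PARAM_SPACE is defined elsewhere in this module. For sklearn-crfsuite,
    # such a space typically randomizes the L1/L2 penalties, e.g.
    # (illustrative values, not necessarily the project's actual ones):
    #   PARAM_SPACE = {'c1': scipy.stats.expon(scale=0.5),
    #                  'c2': scipy.stats.expon(scale=0.05)}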

    ps = PredefinedSplit(test_fold)
    rs = RandomizedSearchCV(crf,
                            PARAM_SPACE,
                            cv=ps,
                            verbose=1,
                            n_jobs=args.n_jobs,
                            n_iter=args.n_iter,
                            scoring=f1_scorer,
                            return_train_score=True)

    logger.info('Start RandomizedSearchCV... {}'.format(crf))
    with parallel_backend('multiprocessing'):
        rs.fit(X_train_combined, y_train_combined)

    logger.info('best params: {}'.format(rs.best_params_))
    logger.info('best CV score: {}'.format(rs.best_score_))
    logger.info('model size: {:0.2f}M'.format(rs.best_estimator_.size_ /
                                              1000000))

    logger.info('Make predictions...')
    crf = rs.best_estimator_
    y_pred_train = crf.predict(X_train)
    y_pred_dev = crf.predict(X_dev)
    y_pred_test = crf.predict(X_test)

    train_utils.save_predictions(
        corpus_name=corpus.name,
        run_id=args.run_id,
        train=tagging_utils.sents_to_standoff(y_pred_train, train_docs),
        dev=tagging_utils.sents_to_standoff(y_pred_dev, dev_docs),
        test=tagging_utils.sents_to_standoff(y_pred_test, test_docs))
    _save_model_aritfacts(rs, model_dir, y_test, y_pred_test)
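
The test_fold construction above is what pins RandomizedSearchCV to a single
train/dev split: entries of -1 are never held out, and entries with fold index
0 form the one validation fold. A minimal sketch of these semantics with toy
indices:

from sklearn.model_selection import PredefinedSplit

# -1: always in the training set; 0: held out in fold 0 (the 'dev' fold).
test_fold = [-1, -1, -1, 0, 0]
ps = PredefinedSplit(test_fold)
for train_idx, dev_idx in ps.split():
    print(train_idx, dev_idx)  # -> [0 1 2] [3 4]
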
Example #3
def arg_parser():
    parser = argparse.ArgumentParser()
    parser.add_argument("corpus",
                        choices=CORPUS_PATH.keys(),
                        help="Corpus identifier.")
    parser.add_argument("run_id", help="Run identifier")
    parser.add_argument("feature_extractor",
                        choices=crf_util.FEATURE_EXTRACTOR.keys(),
                        help="Feature extractor.")
    parser.add_argument("--n_iter",
                        help="Number of random search trials",
                        default=1,
                        type=int)
    parser.add_argument("--n_jobs",
                        help="Number of concurrent jobs",
                        default=1,
                        type=int)
    return parser.parse_args()


if __name__ == '__main__':
    ARGS = arg_parser()
    ARGS.run_id = 'crf_' + ARGS.run_id
    logger.add(
        join(train_utils.model_dir(ARGS.corpus, ARGS.run_id), 'training.log'))
    main(ARGS)
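
The logger.add(...) call matches loguru's API for attaching an extra sink;
assuming that is the logger in use here, the pattern boils down to:

from loguru import logger

# Attach a file sink next to the default stderr sink; subsequent logger.info()
# calls are written to both.
logger.add('training.log')
logger.info('This message goes to stderr and training.log')
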
Example #4
def arg_parser():
    parser = argparse.ArgumentParser()
    parser.add_argument("corpus",
                        choices=CORPUS_PATH.keys(),
                        help="Corpus identifier.")
    parser.add_argument("run_id", help="Run identifier")
    parser.add_argument("feature_extractor",
                        choices=crf_util.FEATURE_EXTRACTOR.keys(),
                        help="Feature extractor.")
    return parser.parse_args()


if __name__ == '__main__':
    ARGS = arg_parser()
    ARGS.run_id = 'crf_' + ARGS.run_id
    logger.add(
        join(train_utils.model_dir(ARGS.corpus, ARGS.run_id),
             'learning-curve.log'))
    main(ARGS)
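
Note that passing CORPUS_PATH.keys() as choices makes argparse reject any
corpus identifier that is not a known key. A minimal sketch with hypothetical
corpus names:

import argparse

CORPUS_PATH = {'corpusA': '/path/a', 'corpusB': '/path/b'}  # hypothetical

parser = argparse.ArgumentParser()
parser.add_argument('corpus', choices=CORPUS_PATH.keys(),
                    help='Corpus identifier.')

print(parser.parse_args(['corpusA']).corpus)  # 'corpusA'
# parser.parse_args(['unknown']) would exit with a usage error.
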
Example #5
def main(args, model_dir):
    logger.info('Args = {}'.format(args))
    corpus = CorpusLoader().load_corpus(CORPUS_PATH[args.corpus])
    tokenizer = TokenizerFactory().tokenizer(args.corpus)

    logger.info('Loaded corpus: {}'.format(corpus))
    # model_dir is created by the caller (see the __main__ block below).

    logger.info('Get sentences...')
    train_sents, train_docs = flair_utils.standoff_to_flair_sents(corpus.train,
                                                                  tokenizer,
                                                                  verbose=True)
    dev_sents, dev_docs = flair_utils.standoff_to_flair_sents(corpus.dev,
                                                              tokenizer,
                                                              verbose=True)
    test_sents, test_docs = flair_utils.standoff_to_flair_sents(corpus.test,
                                                                tokenizer,
                                                                verbose=True)

    flair_corpus = flair_utils.FilteredCorpus(train=train_sents,
                                              dev=dev_sents,
                                              test=test_sents,
                                              ignore_sentence=_ignore_sentence)
    logger.info(flair_corpus)

    if args.model_file:
        logger.info('Load existing model from {}'.format(args.model_file))
        tagger = SequenceTagger.load(args.model_file)
    else:
        logger.info('Train model...')
        tagger = get_model(
            flair_corpus,
            corpus_name=args.corpus,
            embedding_lang=args.embedding_lang,
            pooled_contextual_embeddings=args.pooled_contextual_embeddings,
            contextual_forward_path=args.contextual_forward_path,
            contextual_backward_path=args.contextual_backward_path)

    if args.fine_tune or not args.model_file:
        trainer = ModelTrainer(tagger, flair_corpus)
        trainer.train(join(model_dir, 'flair'),
                      max_epochs=150,
                      monitor_train=False,
                      train_with_dev=args.train_with_dev)

        if not args.train_with_dev:
            # Model performance is judged by dev data, so we also pick the best performing model
            # according to the dev score to make our final predictions.
            tagger = SequenceTagger.load(
                join(model_dir, 'flair', 'best-model.pt'))
        else:
            # Training is stopped if train loss converges - here, we do not have a "best model" and
            # use the final model to make predictions.
            pass

    logger.info('Make predictions...')
    make_predictions(tagger, flair_corpus)

    train_utils.save_predictions(
        corpus_name=corpus.name,
        run_id=args.run_id,
        train=flair_utils.flair_sents_to_standoff(train_sents, train_docs),
        dev=flair_utils.flair_sents_to_standoff(dev_sents, dev_docs),
        test=flair_utils.flair_sents_to_standoff(test_sents, test_docs))
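    # NOTE: evaluator is not constructed in this snippet; presumably it is
    # built elsewhere from the gold annotations and the predictions above.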
    evaluator.token_level_blind().to_csv(
        join(model_dir, 'scores_token_blind.csv'))
    logger.info('Done.')


def arg_parser():
    parser = argparse.ArgumentParser()
    parser.add_argument("corpus",
                        choices=CORPUS_PATH.keys(),
                        help="Corpus identifier.")
    parser.add_argument("run_id", help="Run identifier")
    parser.add_argument("--train_sample_frac",
                        help="Fraction of the training data to use.",
                        type=float,
                        default=0.1)
    parser.add_argument("--random_seed",
                        help="Seed for the training set sampler.",
                        type=int,
                        default=42)
    # The flags below are read in main() but were missing from this snippet's
    # parser; defaults are illustrative.
    parser.add_argument("--model_file",
                        help="Load an existing model instead of training.")
    parser.add_argument("--fine_tune", action="store_true",
                        help="Fine-tune a model loaded via --model_file.")
    parser.add_argument("--train_with_dev", action="store_true",
                        help="Train on train + dev data.")
    parser.add_argument("--embedding_lang", default="en",
                        help="Language of the word embeddings.")
    parser.add_argument("--pooled_contextual_embeddings", action="store_true")
    parser.add_argument("--contextual_forward_path", default=None)
    parser.add_argument("--contextual_backward_path", default=None)
    return parser.parse_args()


if __name__ == '__main__':
    ARGS = arg_parser()
    ARGS.run_id = 'bilstmcrf_{}_frac_{}_seed_{}'.format(
        ARGS.run_id, ARGS.train_sample_frac, ARGS.random_seed)
    MODEL_DIR = train_utils.model_dir(ARGS.corpus + '-subsets', ARGS.run_id)
    os.makedirs(MODEL_DIR, exist_ok=True)
    logger.add(join(MODEL_DIR, 'training.log'))
    main(ARGS, MODEL_DIR)
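
The standoff/flair conversion helpers (standoff_to_flair_sents,
FilteredCorpus, flair_sents_to_standoff) and get_model are project-local; the
training and prediction calls around them are flair's public API. A minimal,
generic sketch of that API, using a pretrained model for illustration rather
than the tagger trained above:

from flair.data import Sentence
from flair.models import SequenceTagger

# Load a pretrained NER tagger and tag a single sentence.
tagger = SequenceTagger.load('ner')
sentence = Sentence('George Washington went to Washington.')
tagger.predict(sentence)
for span in sentence.get_spans('ner'):
    print(span)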