def main(args):
    """Compute a cross-validated CRF learning curve on train+dev sentences.

    Loads the corpus selected by ``args.corpus``, extracts per-sentence
    features with the extractor named by ``args.feature_extractor``, fits a
    sentence-filtering CRF, and plots a 5-fold learning curve into the run's
    model directory.

    Args:
        args: argparse.Namespace with ``corpus``, ``run_id`` and
            ``feature_extractor`` attributes (see ``arg_parser``).
    """
    logger.info('Args = {}'.format(args))

    corpus = CorpusLoader().load_corpus(CORPUS_PATH[args.corpus])
    tokenizer = TokenizerFactory().tokenizer(args.corpus)
    logger.info('Loaded corpus: {}'.format(corpus))

    model_dir = train_utils.model_dir(corpus.name, args.run_id)
    os.makedirs(model_dir, exist_ok=True)

    logger.info('Get sentences...')
    # The learning curve is computed via cross-validation, so the train and
    # dev documents are pooled into a single dataset.
    docs = list(itertools.chain(corpus.train, corpus.dev))
    sents, _ = tagging_utils.standoff_to_sents(docs, tokenizer, verbose=True)

    logger.info('Compute features...')
    feature_extractor, meta_sentence_filter = crf_util.FEATURE_EXTRACTOR[
        args.feature_extractor]
    X, y = crf_labeler.sents_to_features_and_labels(sents, feature_extractor)
    logger.info('len(X) = {}'.format(len(X)))
    logger.info('len(y) = {}'.format(len(y)))

    crf = crf_labeler.SentenceFilterCRF(ignore_sentence=meta_sentence_filter,
                                        ignored_label='O',
                                        algorithm='lbfgs',
                                        c1=0.1,
                                        c2=0.1,
                                        max_iterations=100,
                                        all_possible_transitions=True)

    # Fixed typo in log message: "learing" -> "learning".
    logger.info('Start learning curve computation...')
    # scikit-learn changed the default multiprocessing to 'loky' in 0.21. It
    # appears that this is not supported by sklearn_crfsuite. Therefore, we
    # switch to the legacy 'multiprocessing' parallel backend.
    with parallel_backend('multiprocessing'):
        plot_learning_curve(crf,
                            'CRF learning curve (sentences: N={})'.format(
                                len(X)),
                            X,
                            y,
                            out_dir=model_dir,
                            cv=5,
                            n_jobs=12)
    logger.info('Done...')
def main(args):
    """Tune a sentence-filtering CRF with randomized hyperparameter search.

    The dev split acts as a predefined validation fold during the search.
    The best estimator is then used to predict all three corpus splits, and
    the predictions plus model artifacts are written to the run's directory.

    Args:
        args: argparse.Namespace with ``corpus``, ``run_id``,
            ``feature_extractor``, ``n_iter`` and ``n_jobs`` attributes
            (see ``arg_parser``).
    """
    logger.info('Args = {}'.format(args))

    corpus = CorpusLoader().load_corpus(CORPUS_PATH[args.corpus])
    tokenizer = TokenizerFactory().tokenizer(args.corpus)
    logger.info('Loaded corpus: {}'.format(corpus))

    model_dir = train_utils.model_dir(corpus.name, args.run_id)
    os.makedirs(model_dir, exist_ok=True)

    logger.info('Get sentences...')
    train_sents, train_docs = tagging_utils.standoff_to_sents(
        corpus.train, tokenizer, verbose=True)
    dev_sents, dev_docs = tagging_utils.standoff_to_sents(
        corpus.dev, tokenizer, verbose=True)
    test_sents, test_docs = tagging_utils.standoff_to_sents(
        corpus.test, tokenizer, verbose=True)

    logger.info('Compute features...')
    feature_extractor, meta_sentence_filter = crf_util.FEATURE_EXTRACTOR[
        args.feature_extractor]
    X_train, y_train = crf_labeler.sents_to_features_and_labels(
        train_sents, feature_extractor)
    X_dev, y_dev = crf_labeler.sents_to_features_and_labels(
        dev_sents, feature_extractor)
    X_test, y_test = crf_labeler.sents_to_features_and_labels(
        test_sents, feature_extractor)

    logger.info('len(X_train) = {}'.format(len(X_train)))
    logger.info('len(y_train) = {}'.format(len(y_train)))
    logger.info('len(X_dev) = {}'.format(len(X_dev)))
    logger.info('len(X_test) = {}'.format(len(X_test)))

    # Concatenate train and dev; the PredefinedSplit below keeps them apart
    # during the search (-1 = always in training, 0 = validation fold).
    X_train_combined = X_train + X_dev
    y_train_combined = y_train + y_dev
    test_fold = [-1] * len(X_train) + [0] * len(X_dev)

    # Score over all entity labels; the majority 'O' (outside) tag would
    # dominate the micro-average, so it is excluded.
    labels = list({label for sent in y_train_combined for label in sent})
    labels.remove('O')
    logger.info('Labels: {}'.format(labels))
    f1_scorer = make_scorer(flat_f1_score, labels=labels, average='micro')

    crf = crf_labeler.SentenceFilterCRF(ignore_sentence=meta_sentence_filter,
                                        ignored_label='O',
                                        algorithm='lbfgs',
                                        max_iterations=100,
                                        all_possible_transitions=True)

    predefined_split = PredefinedSplit(test_fold)
    search = RandomizedSearchCV(crf,
                                PARAM_SPACE,
                                cv=predefined_split,
                                verbose=1,
                                n_jobs=args.n_jobs,
                                n_iter=args.n_iter,
                                scoring=f1_scorer,
                                return_train_score=True)

    logger.info('Start RandomizedSearchCV... \n{}'.format(crf))
    # sklearn >= 0.21 defaults to the 'loky' backend, which sklearn_crfsuite
    # does not support; fall back to the legacy 'multiprocessing' backend.
    with parallel_backend('multiprocessing'):
        search.fit(X_train_combined, y_train_combined)

    logger.info('best params: {}'.format(search.best_params_))
    logger.info('best CV score: {}'.format(search.best_score_))
    logger.info('model size: {:0.2f}M'.format(search.best_estimator_.size_ /
                                              1000000))

    logger.info('Make predictions...')
    crf = search.best_estimator_
    y_pred_train = crf.predict(X_train)
    y_pred_dev = crf.predict(X_dev)
    y_pred_test = crf.predict(X_test)

    train_utils.save_predictions(
        corpus_name=corpus.name,
        run_id=args.run_id,
        train=tagging_utils.sents_to_standoff(y_pred_train, train_docs),
        dev=tagging_utils.sents_to_standoff(y_pred_dev, dev_docs),
        test=tagging_utils.sents_to_standoff(y_pred_test, test_docs))

    _save_model_aritfacts(search, model_dir, y_test, y_pred_test)
def arg_parser():
    """Build and evaluate the CLI parser for the CRF random-search script.

    Returns:
        argparse.Namespace with ``corpus``, ``run_id``,
        ``feature_extractor``, ``n_iter`` and ``n_jobs``.
    """
    # NOTE: the statements duplicated here from the tail of main()
    # (save_predictions / _save_model_aritfacts) were removed -- they already
    # execute at the end of main() and were dead repetition at module level.
    parser = argparse.ArgumentParser()
    parser.add_argument("corpus",
                        choices=CORPUS_PATH.keys(),
                        help="Corpus identifier.")
    parser.add_argument("run_id", help="Run identifier")
    parser.add_argument("feature_extractor",
                        choices=crf_util.FEATURE_EXTRACTOR.keys(),
                        help="Feature extractor.")
    parser.add_argument("--n_iter",
                        help="Number of random search trials",
                        default=1,
                        type=int)
    parser.add_argument("--n_jobs",
                        help="Number of concurrent jobs",
                        default=1,
                        type=int)
    return parser.parse_args()


if __name__ == '__main__':
    ARGS = arg_parser()
    # Prefix the run id so CRF runs are grouped in the models directory.
    ARGS.run_id = 'crf_' + ARGS.run_id
    logger.add(
        join(train_utils.model_dir(ARGS.corpus, ARGS.run_id), 'training.log'))
    main(ARGS)
def arg_parser():
    """Build and evaluate the CLI parser for the CRF learning-curve script.

    Returns:
        argparse.Namespace with ``corpus``, ``run_id`` and
        ``feature_extractor``.
    """
    # NOTE: the statements duplicated here from the tail of main()
    # (plot_learning_curve / 'Done...') were removed -- they already execute
    # at the end of main() and were dead repetition at module level.
    parser = argparse.ArgumentParser()
    parser.add_argument("corpus",
                        choices=CORPUS_PATH.keys(),
                        help="Corpus identifier.")
    parser.add_argument("run_id", help="Run identifier")
    parser.add_argument("feature_extractor",
                        choices=crf_util.FEATURE_EXTRACTOR.keys(),
                        help="Feature extractor.")
    return parser.parse_args()


if __name__ == '__main__':
    ARGS = arg_parser()
    # Prefix the run id so CRF runs are grouped in the models directory.
    ARGS.run_id = 'crf_' + ARGS.run_id
    logger.add(
        join(train_utils.model_dir(ARGS.corpus, ARGS.run_id),
             'learning-curve.log'))
    main(ARGS)
def main(args):
    """Train or load a flair sequence tagger and save its predictions.

    If ``args.model_file`` is set, an existing tagger is loaded from disk;
    otherwise a fresh model is built. Unless a pre-trained model is used
    without ``args.fine_tune``, the tagger is (further) trained. Finally,
    predictions for all three corpus splits are written in standoff format.
    """
    logger.info('Args = {}'.format(args))

    corpus = CorpusLoader().load_corpus(CORPUS_PATH[args.corpus])
    tokenizer = TokenizerFactory().tokenizer(args.corpus)
    logger.info('Loaded corpus: {}'.format(corpus))

    model_dir = train_utils.model_dir(corpus.name, args.run_id)
    os.makedirs(model_dir, exist_ok=True)

    logger.info('Get sentences...')
    train_sents, train_docs = flair_utils.standoff_to_flair_sents(
        corpus.train, tokenizer, verbose=True)
    dev_sents, dev_docs = flair_utils.standoff_to_flair_sents(
        corpus.dev, tokenizer, verbose=True)
    test_sents, test_docs = flair_utils.standoff_to_flair_sents(
        corpus.test, tokenizer, verbose=True)

    flair_corpus = flair_utils.FilteredCorpus(
        train=train_sents,
        dev=dev_sents,
        test=test_sents,
        ignore_sentence=_ignore_sentence)
    logger.info(flair_corpus)

    if args.model_file:
        logger.info('Load existing model from {}'.format(args.model_file))
        tagger = SequenceTagger.load(args.model_file)
    else:
        logger.info('Train model...')
        tagger = get_model(
            flair_corpus,
            corpus_name=args.corpus,
            embedding_lang=args.embedding_lang,
            pooled_contextual_embeddings=args.pooled_contextual_embeddings,
            contextual_forward_path=args.contextual_forward_path,
            contextual_backward_path=args.contextual_backward_path)

    if args.fine_tune or not args.model_file:
        trainer = ModelTrainer(tagger, flair_corpus)
        trainer.train(join(model_dir, 'flair'),
                      max_epochs=150,
                      monitor_train=False,
                      train_with_dev=args.train_with_dev)
        if not args.train_with_dev:
            # Model performance is judged by dev data, so we also pick the
            # best performing model according to the dev score to make our
            # final predictions.
            tagger = SequenceTagger.load(
                join(model_dir, 'flair', 'best-model.pt'))
        # Otherwise training was stopped when the train loss converged;
        # there is no "best model" and the final model makes the predictions.

    logger.info('Make predictions...')
    make_predictions(tagger, flair_corpus)

    train_utils.save_predictions(
        corpus_name=corpus.name,
        run_id=args.run_id,
        train=flair_utils.flair_sents_to_standoff(train_sents, train_docs),
        dev=flair_utils.flair_sents_to_standoff(dev_sents, dev_docs),
        test=flair_utils.flair_sents_to_standoff(test_sents, test_docs))
# NOTE(review): the two statements below are the tail of a training routine
# whose definition lies outside this chunk; `evaluator` and `model_dir` are
# presumably bound in that unseen part -- verify against the full file.
evaluator.token_level_blind().to_csv(
    join(model_dir, 'scores_token_blind.csv'))
logger.info('Done.')


def arg_parser():
    """Build and evaluate the CLI parser for the subset BiLSTM-CRF script.

    Returns:
        argparse.Namespace with ``corpus``, ``run_id``,
        ``train_sample_frac`` and ``random_seed``.
    """
    parser = argparse.ArgumentParser()
    parser.add_argument("corpus",
                        choices=CORPUS_PATH.keys(),
                        help="Corpus identifier.")
    parser.add_argument("run_id", help="Run identifier")
    parser.add_argument("--train_sample_frac",
                        help="Fraction of the training data to use.",
                        type=float,
                        default=0.1)
    parser.add_argument("--random_seed",
                        help="Seed for the training set sampler.",
                        type=int,
                        default=42)
    return parser.parse_args()


if __name__ == '__main__':
    ARGS = arg_parser()
    # Encode the subsampling configuration in the run id so that each
    # fraction/seed combination gets its own model directory.
    ARGS.run_id = 'bilstmcrf_{}_frac_{}_seed_{}'.format(
        ARGS.run_id, ARGS.train_sample_frac, ARGS.random_seed)
    MODEL_DIR = train_utils.model_dir(ARGS.corpus + '-subsets', ARGS.run_id)
    os.makedirs(MODEL_DIR, exist_ok=True)
    logger.add(join(MODEL_DIR, 'training.log'))
    # NOTE(review): unlike the other scripts in this project, this entry
    # point passes the model directory explicitly -- main(args, model_dir).
    main(ARGS, MODEL_DIR)