def test_train_resume_sequence_tagging_training(results_base_path, tasks_base_path):
    """Train a tagger on a multi-corpus, then resume training from its checkpoint."""
    fashion = flair.datasets.ColumnCorpus(
        data_folder=tasks_base_path / "fashion",
        column_format={0: "text", 2: "ner"},
    )
    germeval = flair.datasets.GERMEVAL(base_path=tasks_base_path)
    combined = MultiCorpus([fashion, germeval])

    tags = combined.make_tag_dictionary("ner")
    tagger: SequenceTagger = SequenceTagger(
        hidden_size=64,
        embeddings=WordEmbeddings("turian"),
        tag_dictionary=tags,
        tag_type="ner",
        use_crf=False,
    )

    # First run writes checkpoint.pt into the results directory.
    ModelTrainer(tagger, combined).train(
        results_base_path, max_epochs=2, shuffle=False, checkpoint=True
    )

    # Reload the checkpoint and continue training with identical settings.
    saved = SequenceTagger.load_checkpoint(results_base_path / "checkpoint.pt")
    ModelTrainer.load_from_checkpoint(saved, combined).train(
        results_base_path, max_epochs=2, shuffle=False, checkpoint=True
    )

    # clean up results directory
    shutil.rmtree(results_base_path)
def test_train_resume_text_classification_training(results_base_path, tasks_base_path):
    """Text classifier trains, checkpoints, and resumes from the checkpoint."""
    imdb = flair.datasets.ClassificationCorpus(tasks_base_path / "imdb")
    labels = imdb.make_label_dictionary()

    token_embeddings: TokenEmbeddings = FlairEmbeddings("news-forward-fast", use_cache=False)
    doc_embeddings: DocumentRNNEmbeddings = DocumentRNNEmbeddings(
        [token_embeddings], 128, 1, False
    )
    classifier = TextClassifier(doc_embeddings, labels, False)

    # Initial run produces checkpoint.pt in the results directory.
    ModelTrainer(classifier, imdb).train(
        results_base_path, max_epochs=2, shuffle=False, checkpoint=True
    )

    # Resume from the checkpoint and train for another two epochs.
    saved = TextClassifier.load_checkpoint(results_base_path / "checkpoint.pt")
    ModelTrainer.load_from_checkpoint(saved, imdb).train(
        results_base_path, max_epochs=2, shuffle=False, checkpoint=True
    )

    # clean up results directory
    shutil.rmtree(results_base_path)
def test_train_resume_text_classification_training(results_base_path, tasks_base_path):
    """Legacy-API variant: resume text-classifier training via a checkpoint path."""
    imdb = NLPTaskDataFetcher.load_corpus('imdb', base_path=tasks_base_path)
    labels = imdb.make_label_dictionary()

    token_embeddings: TokenEmbeddings = FlairEmbeddings('news-forward-fast', use_cache=False)
    doc_embeddings: DocumentLSTMEmbeddings = DocumentLSTMEmbeddings(
        [token_embeddings], 128, 1, False
    )
    classifier = TextClassifier(doc_embeddings, labels, False)

    ModelTrainer(classifier, imdb).train(
        results_base_path, max_epochs=2, test_mode=True, checkpoint=True
    )

    # The old API resumes from a checkpoint file path plus a model-type name.
    resumed = ModelTrainer.load_from_checkpoint(
        results_base_path / 'checkpoint.pt', 'TextClassifier', imdb
    )
    resumed.train(results_base_path, max_epochs=2, test_mode=True, checkpoint=True)

    # clean up results directory
    shutil.rmtree(results_base_path)
def test_train_resume_sequence_tagging_training(results_base_path, tasks_base_path):
    """Legacy-API variant: resume sequence-tagger training via a checkpoint path."""
    corpora = NLPTaskDataFetcher.load_corpora(
        [NLPTask.FASHION, NLPTask.GERMEVAL], base_path=tasks_base_path
    )
    tags = corpora.make_tag_dictionary("ner")

    tagger: SequenceTagger = SequenceTagger(
        hidden_size=64,
        embeddings=WordEmbeddings("turian"),
        tag_dictionary=tags,
        tag_type="ner",
        use_crf=False,
    )

    ModelTrainer(tagger, corpora).train(
        results_base_path, max_epochs=2, test_mode=True, checkpoint=True
    )

    # Old API: resume from checkpoint file path plus model-type name.
    resumed = ModelTrainer.load_from_checkpoint(
        results_base_path / "checkpoint.pt", "SequenceTagger", corpora
    )
    resumed.train(results_base_path, max_epochs=2, test_mode=True, checkpoint=True)

    # clean up results directory
    shutil.rmtree(results_base_path)
def test_train_resume_sequence_tagging_training(results_base_path, tasks_base_path):
    """Multi-corpus tagger training that checkpoints, then resumes from it."""
    parts = [
        flair.datasets.ColumnCorpus(
            data_folder=(tasks_base_path / 'fashion'),
            column_format={0: 'text', 2: 'ner'},
        ),
        flair.datasets.GERMEVAL(base_path=tasks_base_path),
    ]
    corpus = MultiCorpus(parts)

    tags = corpus.make_tag_dictionary('ner')
    tagger = SequenceTagger(
        hidden_size=64,
        embeddings=WordEmbeddings('turian'),
        tag_dictionary=tags,
        tag_type='ner',
        use_crf=False,
    )

    # First run writes checkpoint.pt under the results directory.
    ModelTrainer(tagger, corpus).train(
        results_base_path, max_epochs=2, shuffle=False, checkpoint=True
    )

    # Resume from the saved checkpoint and train for two more epochs.
    saved = SequenceTagger.load_checkpoint(results_base_path / 'checkpoint.pt')
    ModelTrainer.load_from_checkpoint(saved, corpus).train(
        results_base_path, max_epochs=2, shuffle=False, checkpoint=True
    )

    shutil.rmtree(results_base_path)
def train(params, tagger, corpus):
    """Train ``tagger`` on ``corpus``, or resume training from a checkpoint.

    Args:
        params: dict with keys "model_output_dirpath", "learning_rate",
            "mini_batch_size" and "max_epochs".
        tagger: model to train from scratch; pass None to resume from
            <model_output_dirpath>/checkpoint.pt instead.
        corpus: the training corpus handed to the trainer.
    """
    if tagger is not None:
        trainer = create_trainer(tagger, corpus)
    else:
        print("Resuming training")
        # Resume from the checkpoint written by a previous run.
        # pathlib join replaces Path(os.path.join(...)).
        trainer = ModelTrainer.load_from_checkpoint(
            Path(params["model_output_dirpath"]) / "checkpoint.pt",
            'SequenceTagger', corpus)
    trainer.train(params["model_output_dirpath"],
                  learning_rate=params["learning_rate"],
                  mini_batch_size=params["mini_batch_size"],
                  max_epochs=params["max_epochs"],
                  patience=3,
                  anneal_factor=0.5,
                  anneal_against_train_loss=True,
                  anneal_with_restarts=True,
                  checkpoint=True,
                  embeddings_in_memory=True)
def test_train_resume_text_classification_training(results_base_path, tasks_base_path):
    """Classifier checkpoint round-trip: train, reload the checkpoint, train again."""
    imdb = flair.datasets.ClassificationCorpus((tasks_base_path / 'imdb'))
    labels = imdb.make_label_dictionary()

    classifier = TextClassifier(
        DocumentRNNEmbeddings([FlairEmbeddings('news-forward-fast')], 128, 1, False),
        labels,
        False,
    )

    ModelTrainer(classifier, imdb).train(
        results_base_path, max_epochs=2, shuffle=False, checkpoint=True
    )

    # Reload the checkpoint written by the first run and keep training.
    saved = TextClassifier.load_checkpoint(results_base_path / 'checkpoint.pt')
    ModelTrainer.load_from_checkpoint(saved, imdb).train(
        results_base_path, max_epochs=2, shuffle=False, checkpoint=True
    )

    shutil.rmtree(results_base_path)
def test_train_resume_sequence_tagging_training(results_base_path, tasks_base_path):
    """Legacy-API resume test using glove embeddings over two corpora."""
    corpora = NLPTaskDataFetcher.load_corpora(
        [NLPTask.FASHION, NLPTask.GERMEVAL], base_path=tasks_base_path
    )
    tags = corpora.make_tag_dictionary(u'ner')

    tagger = SequenceTagger(
        hidden_size=64,
        embeddings=WordEmbeddings(u'glove'),
        tag_dictionary=tags,
        tag_type=u'ner',
        use_crf=False,
    )

    ModelTrainer(tagger, corpora).train(
        results_base_path, max_epochs=2, test_mode=True, checkpoint=True
    )

    # Old API: resume from checkpoint file path plus model-type name.
    resumed = ModelTrainer.load_from_checkpoint(
        (results_base_path / u'checkpoint.pt'), u'SequenceTagger', corpora
    )
    resumed.train(results_base_path, max_epochs=2, test_mode=True, checkpoint=True)

    shutil.rmtree(results_base_path)
test_file='test.txt', dev_file='dev.txt', **options['corpus']) print(corpus) # what tag to predict tag_type = 'ner' # make the tag dictionary from the corpus tag_dictionary = corpus.make_tag_dictionary(tag_type=tag_type) # embeddings if len(args.embeddings_list) == 1: embeddings = FlairEmbeddings(args.embeddings_list[0]) else: embeddings = StackedEmbeddings( [FlairEmbeddings(lm) for lm in args.embeddings_list]) # initialize tagger tagger = SequenceTagger(embeddings=embeddings, tag_dictionary=tag_dictionary, tag_type=tag_type, **options['sequences_tagger']) # initialize trainer if args.continue_training: checkpoint = tagger.load_checkpoint(args.train_path + 'checkpoint.pt') trainer = ModelTrainer.load_from_checkpoint(checkpoint, corpus) else: trainer = ModelTrainer(tagger, corpus, use_tensorboard=args.tensorboard) # training trainer.train(args.train_path, **options['training'])
def trainer(file_path: Path, filenames: Tuple[str, str, str], checkpoint: str,
            stack: str, n_epochs: int) -> None:
    """Train sentiment model using Flair NLP library:
    https://github.com/zalandoresearch/flair/blob/master/resources/docs/TUTORIAL_7_TRAINING_A_MODEL.md

    To help provide added context, we can stack Glove, Bert or ELMo embeddings
    along with Flair embeddings.

    Args:
        file_path: directory holding the dataset files; also the training output dir.
        filenames: (train, dev, test) file names inside ``file_path``.
        checkpoint: path to a checkpoint file to resume from; falsy to train fresh.
        stack: extra embedding to stack ("glove", "elmo", "bert", or anything else for none).
        n_epochs: number of training epochs.
    """
    # pip install flair allennlp
    from flair.datasets import ClassificationCorpus
    from flair.embeddings import FlairEmbeddings, DocumentRNNEmbeddings
    from flair.models import TextClassifier
    from flair.trainers import ModelTrainer
    from flair.visual.training_curves import Plotter

    # Optional embedding stacked alongside the Flair string embeddings.
    if stack == "glove":
        from flair.embeddings import WordEmbeddings
        stacked_embedding = WordEmbeddings('glove')
    elif stack == "elmo":
        from flair.embeddings import ELMoEmbeddings
        stacked_embedding = ELMoEmbeddings('original')
    elif stack == "bert":
        from flair.embeddings import BertEmbeddings
        stacked_embedding = BertEmbeddings('bert-base-cased')
    else:
        stacked_embedding = None

    # Define and Load corpus from the provided dataset
    train, dev, test = filenames
    corpus = ClassificationCorpus(
        file_path,
        train_file=train,
        dev_file=dev,
        test_file=test,
    )

    # Create label dictionary from provided labels in data
    label_dict = corpus.make_label_dictionary()

    # Stack Flair string-embeddings with optional embeddings
    word_embeddings = list(
        filter(None, [
            stacked_embedding,
            FlairEmbeddings('news-forward'),
            FlairEmbeddings('news-backward'),
        ]))

    # Initialize document embedding by passing list of word embeddings
    document_embeddings = DocumentRNNEmbeddings(
        word_embeddings,
        hidden_size=512,
        reproject_words=True,
        reproject_words_dimension=256,
    )

    # Define classifier
    classifier = TextClassifier(document_embeddings,
                                label_dictionary=label_dict,
                                multi_label=False)

    if not checkpoint:
        trainer = ModelTrainer(classifier, corpus)
    else:
        # If checkpoint file is defined, resume training
        checkpoint = classifier.load_checkpoint(Path(checkpoint))
        trainer = ModelTrainer.load_from_checkpoint(checkpoint, corpus)

    # Begin training (enable checkpointing to continue training at a later time, if desired)
    trainer.train(
        base_path=file_path,
        max_epochs=n_epochs,
        checkpoint=True)

    # Plot curves and store weights and losses.
    # Fix: file_path is a Path, so "file_path + '/loss.tsv'" raised TypeError;
    # join with the "/" operator instead.
    plotter = Plotter()
    plotter.plot_training_curves(file_path / 'loss.tsv')
    plotter.plot_weights(file_path / 'weights.txt')
from pathlib import Path
from flair.trainers import ModelTrainer
from flair.models import SequenceTagger
from flair.data import TaggedCorpus
from flair.data_fetcher import NLPTaskDataFetcher, NLPTask
from flair.embeddings import TokenEmbeddings, WordEmbeddings, StackedEmbeddings
from typing import List
import flair, torch

# Force all flair computation onto the CPU.
flair.device = torch.device('cpu')

# Column layout of the CoNLL-style data files: token text in column 0, tag in column 1.
columns = {0: 'text', 1: 'ner'}
data_folder = '../'

# Load train/test/dev splits from the parent directory.
# NOTE(review): file names suggest a Turkish "de-da-te-ta" corpus — confirm.
corpus: TaggedCorpus = NLPTaskDataFetcher.load_column_corpus(
    data_folder, columns,
    train_file="de-da-te-ta.10E-4percent.conll.84max.train.txt",
    test_file="de-da-te-ta.10E-4percent.conll.84max.test.txt",
    dev_file="de-da-te-ta.10E-4percent.conll.84max.dev.txt")

# Resume a previously checkpointed SequenceTagger training run
# from ./models/example-ner-tr-embedding/checkpoint.pt.
trainer = ModelTrainer.load_from_checkpoint(
    Path('./models/example-ner-tr-embedding/checkpoint.pt'),
    'SequenceTagger', corpus)

# Continue training into a new output directory, checkpointing as we go.
trainer.train('./models/example-ner-tr-embedding-continued',
              learning_rate=0.15,
              mini_batch_size=32,
              max_epochs=150,
              checkpoint=True)