def test_train_load_use_tagger(results_base_path, tasks_base_path):
    """Train a small CRF-free NER tagger on the fashion corpus, reload it and predict."""
    fashion = flair.datasets.ColumnCorpus(
        data_folder=tasks_base_path / "fashion", column_format={0: "text", 2: "ner"}
    )
    ner_dict = fashion.make_tag_dictionary("ner")
    turian = WordEmbeddings("turian")
    tagger: SequenceTagger = SequenceTagger(
        hidden_size=64,
        embeddings=turian,
        tag_dictionary=ner_dict,
        tag_type="ner",
        use_crf=False,
    )
    trainer: ModelTrainer = ModelTrainer(tagger, fashion)
    trainer.train(
        results_base_path,
        EvaluationMetric.MICRO_F1_SCORE,
        learning_rate=0.1,
        mini_batch_size=2,
        max_epochs=2,
        shuffle=False,
    )

    # round-trip through disk, then predict on normal and empty inputs
    reloaded: SequenceTagger = SequenceTagger.load(results_base_path / "final-model.pt")
    berlin = Sentence("I love Berlin")
    blank = Sentence(" ")
    reloaded.predict(berlin)
    reloaded.predict([berlin, blank])
    reloaded.predict([blank])

    # clean up results directory
    shutil.rmtree(results_base_path)
def train(data_folder, model_output_folder):
    """Train a CoNLL-03 NER tagger with stacked GloVe + Flair embeddings, then plot curves.

    :param data_folder: base path holding the CoNLL-03 corpus
    :param model_output_folder: directory where the model, loss.tsv and weights.txt land
    """
    corpus: TaggedCorpus = NLPTaskDataFetcher.load_corpus(
        NLPTask.CONLL_03, base_path=data_folder)

    # target label type and its dictionary
    tag_type = 'ner'
    tag_dictionary = corpus.make_tag_dictionary(tag_type=tag_type)
    print(tag_dictionary.idx2item)

    # static GloVe vectors combined with forward/backward contextual Flair LMs
    stacked: StackedEmbeddings = StackedEmbeddings(embeddings=[
        WordEmbeddings('glove'),
        FlairEmbeddings('news-forward'),
        FlairEmbeddings('news-backward'),
    ])

    from flair.models import SequenceTagger
    tagger: SequenceTagger = SequenceTagger(hidden_size=256,
                                            embeddings=stacked,
                                            tag_dictionary=tag_dictionary,
                                            tag_type=tag_type)

    from flair.trainers import ModelTrainer
    trainer: ModelTrainer = ModelTrainer(tagger, corpus)
    trainer.train(model_output_folder, mini_batch_size=256, max_epochs=150)

    # visualize the run (optional)
    from flair.visual.training_curves import Plotter
    plotter = Plotter()
    plotter.plot_training_curves(model_output_folder + '/loss.tsv')
    plotter.plot_weights(model_output_folder + '/weights.txt')
def get_corpus_and_tagger():
    """Build a column-format NER corpus from data/ and an untrained CRF tagger over it.

    :return: (corpus, tagger) pair ready for a ModelTrainer
    """
    column_map = {0: 'text', 1: 'ner'}
    data_folder = 'data/'
    # init a corpus using column format, data folder and explicit split file names
    # NOTE(review): test_file points at the *train* file — confirm this is intentional
    corpus: Corpus = ColumnCorpus(data_folder, column_map,
                                  train_file='train_IOB_Format_file.txt',
                                  test_file='train_IOB_Format_file.txt',
                                  dev_file="dev_IOB_format_file.txt")
    tag_type = 'ner'
    ner_dict = corpus.make_tag_dictionary(tag_type=tag_type)

    # GloVe plus forward/backward Flair embeddings, stacked
    stacked: StackedEmbeddings = StackedEmbeddings(embeddings=[
        WordEmbeddings('glove'),
        FlairEmbeddings('news-forward'),
        FlairEmbeddings('news-backward'),
    ])

    from flair.models import SequenceTagger
    tagger: SequenceTagger = SequenceTagger(hidden_size=256,
                                            embeddings=stacked,
                                            tag_dictionary=ner_dict,
                                            tag_type=tag_type,
                                            use_crf=True)
    return corpus, tagger
def test_train_optimizer(results_base_path, tasks_base_path):
    """Train a small NER tagger with the Adam optimizer, reload it and predict."""
    fashion = NLPTaskDataFetcher.load_corpus(NLPTask.FASHION, base_path=tasks_base_path)
    ner_dict = fashion.make_tag_dictionary("ner")
    turian = WordEmbeddings("turian")
    tagger: SequenceTagger = SequenceTagger(
        hidden_size=64,
        embeddings=turian,
        tag_dictionary=ner_dict,
        tag_type="ner",
        use_crf=False,
    )
    # optimizer is handed to the trainer as a class, not an instance
    optimizer: Optimizer = Adam
    trainer: ModelTrainer = ModelTrainer(tagger, fashion, optimizer=optimizer)
    trainer.train(
        results_base_path,
        EvaluationMetric.MICRO_F1_SCORE,
        learning_rate=0.1,
        mini_batch_size=2,
        max_epochs=2,
        test_mode=True,
    )

    # round-trip through disk, then predict on normal and empty inputs
    reloaded: SequenceTagger = SequenceTagger.load(results_base_path / "final-model.pt")
    berlin = Sentence("I love Berlin")
    blank = Sentence(" ")
    reloaded.predict(berlin)
    reloaded.predict([berlin, blank])
    reloaded.predict([blank])

    # clean up results directory
    shutil.rmtree(results_base_path)
def test_train_load_use_tagger_adam(results_base_path, tasks_base_path):
    """Train with optimizer=Adam passed to train(), then reload the model and predict."""
    fashion = flair.datasets.ColumnCorpus(
        data_folder=tasks_base_path / "fashion",
        column_format={0: "text", 3: "ner"},
    )
    ner_dict = fashion.make_label_dictionary("ner")
    tagger: SequenceTagger = SequenceTagger(
        hidden_size=64,
        embeddings=turian_embeddings,
        tag_dictionary=ner_dict,
        tag_type="ner",
        use_crf=False,
    )
    trainer: ModelTrainer = ModelTrainer(tagger, fashion)
    trainer.train(
        results_base_path,
        learning_rate=0.1,
        mini_batch_size=2,
        max_epochs=2,
        shuffle=False,
        optimizer=Adam,
    )
    # drop the training objects before reloading from disk
    del trainer, tagger, ner_dict, fashion

    reloaded: SequenceTagger = SequenceTagger.load(results_base_path / "final-model.pt")
    berlin = Sentence("I love Berlin")
    blank = Sentence(" ")
    reloaded.predict(berlin)
    reloaded.predict([berlin, blank])
    reloaded.predict([blank])
    del reloaded
def train_cat_tagger():
    """Train a Catalan POS tagger on UD_CATALAN and write it to the workspace dir."""
    corpus = flair.datasets.UD_CATALAN()
    tag_type = 'pos'
    pos_dict = corpus.make_tag_dictionary(tag_type=tag_type)
    # a single static Catalan word embedding, wrapped in a stack for uniformity
    stacked: StackedEmbeddings = StackedEmbeddings(embeddings=[WordEmbeddings('ca')])
    tagger: SequenceTagger = SequenceTagger(hidden_size=256,
                                            embeddings=stacked,
                                            tag_dictionary=pos_dict,
                                            tag_type=tag_type,
                                            use_crf=True)
    ModelTrainer(tagger, corpus).train('/home/danielly/workspace',
                                       learning_rate=0.1,
                                       mini_batch_size=32,
                                       max_epochs=150)
def get_model(corpus: flair.data.Corpus, corpus_name: str,
              pooled_contextual_embeddings: bool,
              contextual_forward_path: str = None,
              contextual_backward_path: str = None):
    """Assemble an untrained NER SequenceTagger for *corpus*.

    :param corpus: corpus the tag dictionary is built from
    :param corpus_name: key used by get_embeddings to pick the embedding set
    :param pooled_contextual_embeddings: whether to request pooled contextual embeddings
    :param contextual_forward_path: optional path to a custom forward LM
    :param contextual_backward_path: optional path to a custom backward LM
    :return: a SequenceTagger wired to the stacked embeddings
    """
    tag_type = 'ner'
    ner_dict = corpus.make_tag_dictionary(tag_type=tag_type)
    parts: List[TokenEmbeddings] = get_embeddings(
        corpus_name=corpus_name,
        pooled=pooled_contextual_embeddings,
        contextual_forward_path=contextual_forward_path,
        contextual_backward_path=contextual_backward_path)
    stacked: StackedEmbeddings = StackedEmbeddings(embeddings=parts)
    return SequenceTagger(hidden_size=256,
                          embeddings=stacked,
                          tag_dictionary=ner_dict,
                          tag_type=tag_type)
def main(
    data_folder: str,
    model_folder: str,
    dev_size: float,
    nb_epochs: int,
    nb_segment: Optional[int],
    segment: Optional[int],
) -> None:
    """Train a French NER tagger on a spaCy-tokenized corpus.

    :param data_folder: folder holding the raw training data
    :param model_folder: output folder for the trained model
    :param dev_size: fraction of the data used as the dev split
    :param nb_epochs: number of training epochs
    :param nb_segment: total number of corpus segments (optional)
    :param segment: index of the segment to train on (optional)
    """
    nlp = spacy.blank(name="fr")
    nlp.tokenizer = get_tokenizer(nlp)
    corpus: Corpus = prepare_flair_train_dev_corpus(
        spacy_model=nlp,
        data_folder=data_folder,
        dev_size=dev_size,
        nb_segment=nb_segment,
        segment=segment,
    )
    ner_dict = corpus.make_tag_dictionary(tag_type="ner")
    print(ner_dict.idx2item)

    # French static vectors plus forward/backward contextual Flair LMs
    stacked: StackedEmbeddings = StackedEmbeddings(embeddings=[
        WordEmbeddings("fr"),
        FlairEmbeddings("fr-forward"),
        FlairEmbeddings("fr-backward"),
    ])
    tagger: SequenceTagger = SequenceTagger(
        hidden_size=256,
        embeddings=stacked,
        use_crf=True,
        tag_dictionary=ner_dict,
        tag_type="ner",
    )
    trainer: ModelTrainer = ModelTrainer(model=tagger, corpus=corpus, use_tensorboard=False)
    # TODO optimize LR https://github.com/flairNLP/flair/blob/master/resources/docs/TUTORIAL_8_MODEL_OPTIMIZATION.md
    trainer.train(
        model_folder,
        max_epochs=nb_epochs,
        learning_rate=0.1,
        mini_batch_size=32,
        embeddings_storage_mode="cpu",
        checkpoint=False,
    )
def run(args):
    """Train a WNUT-17 NER tagger (train+dev merged) and upload the results.

    :param args: namespace providing job_dir and epochs
    """
    corpus: Corpus = WNUT_17()
    tag_type = 'ner'
    ner_dict = corpus.make_tag_dictionary(tag_type=tag_type)

    # four-way stack: two static word embeddings plus forward/backward Flair LMs
    stacked: StackedEmbeddings = StackedEmbeddings(embeddings=[
        WordEmbeddings('crawl'),
        WordEmbeddings('twitter'),
        FlairEmbeddings('news-forward'),
        FlairEmbeddings('news-backward'),
    ])

    from flair.models import SequenceTagger
    tagger: SequenceTagger = SequenceTagger(hidden_size=256,
                                            embeddings=stacked,
                                            tag_dictionary=ner_dict,
                                            tag_type=tag_type)

    from flair.trainers import ModelTrainer
    trainer: ModelTrainer = ModelTrainer(tagger, corpus, use_tensorboard=True)
    trainer.train(args.job_dir, train_with_dev=True, max_epochs=args.epochs)
    upload_results(args)
def trainModel(serial_no):
    """Train a GloVe BiLSTM-CRF NER model on the dummy-data split *serial_no*.

    :param serial_no: index of the dummy-data directory and of the output model folder
    """
    columns = {0: 'text', 1: 'ner'}
    data_folder = 'dummy-data/dummy-data-' + str(serial_no) + '/'

    from flair.datasets import ColumnCorpus
    corpus: Corpus = ColumnCorpus(data_folder, columns,
                                  train_file='train.txt',
                                  test_file='test.txt',
                                  dev_file='dev.txt')

    tag_type = 'ner'
    ner_dict = corpus.make_tag_dictionary(tag_type=tag_type)

    # single static embedding wrapped in a stack
    from flair.embeddings import WordEmbeddings, StackedEmbeddings
    from typing import List
    stacked: StackedEmbeddings = StackedEmbeddings(embeddings=[WordEmbeddings('glove')])

    # bi-LSTM with a CRF decoding layer
    from flair.models import SequenceTagger
    tagger: SequenceTagger = SequenceTagger(hidden_size=256,
                                            embeddings=stacked,
                                            tag_dictionary=ner_dict,
                                            tag_type=tag_type,
                                            use_crf=True)
    print(tagger)

    from flair.trainers import ModelTrainer
    trainer: ModelTrainer = ModelTrainer(tagger, corpus)
    trainer.train('dummy-model/dummy-model-' + str(serial_no),
                  learning_rate=0.1,
                  mini_batch_size=32,
                  max_epochs=150)
def test_train_load_use_tagger_multicorpus(results_base_path, tasks_base_path):
    """Train on FASHION+GERMEVAL jointly, then reload the model and predict."""
    multi = NLPTaskDataFetcher.load_corpora(
        [NLPTask.FASHION, NLPTask.GERMEVAL], base_path=tasks_base_path
    )
    ner_dict = multi.make_tag_dictionary("ner")
    turian = WordEmbeddings("turian")
    tagger: SequenceTagger = SequenceTagger(
        hidden_size=64,
        embeddings=turian,
        tag_dictionary=ner_dict,
        tag_type="ner",
        use_crf=False,
    )
    trainer: ModelTrainer = ModelTrainer(tagger, multi)
    trainer.train(
        results_base_path,
        learning_rate=0.1,
        mini_batch_size=2,
        max_epochs=2,
        test_mode=True,
    )

    # round-trip through disk, then predict on normal and empty inputs
    reloaded: SequenceTagger = SequenceTagger.load_from_file(
        results_base_path / "final-model.pt"
    )
    berlin = Sentence("I love Berlin")
    blank = Sentence(" ")
    reloaded.predict(berlin)
    reloaded.predict([berlin, blank])
    reloaded.predict([blank])

    # clean up results directory
    shutil.rmtree(results_base_path)
def train(self, trainfile, devfile, testfile, resfolder, embtype="bert",
          chunk_len=100, batch_len=8):
    """Train a new 'cat' sequence-labelling model with the settings used in
    project Redewiedergabe.

    *** Not accessible from rwtagger_script and not documented in detail.
    Use at your own risk. ;-) ***

    :param trainfile: path to the training file
    :param devfile: path to the development file
    :param testfile: path to the test file
    :param resfolder: output folder for model, logs and plots (created if missing)
    :param embtype: embedding-type key resolved via self._get_embeddings
    :param chunk_len: chunk length used when building the corpus
    :param batch_len: mini-batch size for training
    """
    emb_name, embeddings = self._get_embeddings(embtype)
    corpus: Corpus = self.create_corpus(trainfile, devfile, testfile, chunk_len)
    cat_dict = corpus.make_tag_dictionary(tag_type="cat")
    if not os.path.exists(resfolder):
        os.makedirs(resfolder)

    tagger: SequenceTagger = SequenceTagger(hidden_size=256,
                                            embeddings=embeddings,
                                            tag_dictionary=cat_dict,
                                            tag_type="cat",
                                            use_crf=True,
                                            rnn_layers=2)
    trainer: ModelTrainer = ModelTrainer(tagger, corpus)
    trainer.train(resfolder,
                  learning_rate=0.1,
                  mini_batch_size=batch_len,
                  max_epochs=150,
                  checkpoint=True)

    # write training curves and weight traces next to the model
    plotter = Plotter()
    plotter.plot_training_curves(os.path.join(resfolder, 'loss.tsv'))
    plotter.plot_weights(os.path.join(resfolder, 'weights.txt'))
def test_train_resume_sequence_tagging_training(results_base_path, tasks_base_path):
    """Train with checkpointing on a MultiCorpus, then resume from the checkpoint."""
    fashion = flair.datasets.ColumnCorpus(
        data_folder=tasks_base_path / "fashion",
        column_format={0: "text", 2: "ner"},
    )
    germeval = flair.datasets.GERMEVAL(base_path=tasks_base_path)
    multi = MultiCorpus([fashion, germeval])
    ner_dict = multi.make_tag_dictionary("ner")
    turian = WordEmbeddings("turian")
    model: SequenceTagger = SequenceTagger(
        hidden_size=64,
        embeddings=turian,
        tag_dictionary=ner_dict,
        tag_type="ner",
        use_crf=False,
    )

    # first leg: writes checkpoint.pt into the results directory
    trainer = ModelTrainer(model, multi)
    trainer.train(results_base_path, max_epochs=2, shuffle=False, checkpoint=True)

    # second leg: rebuild the trainer from the checkpoint and keep training
    checkpoint = SequenceTagger.load_checkpoint(results_base_path / "checkpoint.pt")
    trainer = ModelTrainer.load_from_checkpoint(checkpoint, multi)
    trainer.train(results_base_path, max_epochs=2, shuffle=False, checkpoint=True)

    # clean up results directory
    shutil.rmtree(results_base_path)
def test_train_load_use_tagger_large(results_base_path, tasks_base_path):
    """Train a POS tagger on 5% of UD English, reload it and predict."""
    ud_english = flair.datasets.UD_ENGLISH().downsample(0.05)
    pos_dict = ud_english.make_tag_dictionary('pos')
    turian = WordEmbeddings('turian')
    pos_tagger = SequenceTagger(hidden_size=64,
                                embeddings=turian,
                                tag_dictionary=pos_dict,
                                tag_type='pos',
                                use_crf=False)
    trainer = ModelTrainer(pos_tagger, ud_english)
    trainer.train(results_base_path,
                  learning_rate=0.1,
                  mini_batch_size=32,
                  max_epochs=2,
                  shuffle=False)

    # round-trip through disk, then predict on normal and empty inputs
    reloaded = SequenceTagger.load(results_base_path / 'final-model.pt')
    berlin = Sentence('I love Berlin')
    blank = Sentence(' ')
    reloaded.predict(berlin)
    reloaded.predict([berlin, blank])
    reloaded.predict([blank])
    shutil.rmtree(results_base_path)
def test_train_charlm_nochache_load_use_tagger(results_base_path, tasks_base_path):
    """Train with uncached Flair character-LM embeddings, reload and predict."""
    fashion = NLPTaskDataFetcher.load_corpus(NLPTask.FASHION, base_path=tasks_base_path)
    ner_dict = fashion.make_tag_dictionary("ner")
    # use_cache=False forces the embeddings to be recomputed rather than read from cache
    charlm = FlairEmbeddings("news-forward-fast", use_cache=False)
    tagger: SequenceTagger = SequenceTagger(
        hidden_size=64,
        embeddings=charlm,
        tag_dictionary=ner_dict,
        tag_type="ner",
        use_crf=False,
    )
    trainer: ModelTrainer = ModelTrainer(tagger, fashion)
    trainer.train(
        results_base_path,
        learning_rate=0.1,
        mini_batch_size=2,
        max_epochs=2,
        test_mode=True,
    )

    # round-trip through disk, then predict on normal and empty inputs
    reloaded: SequenceTagger = SequenceTagger.load(results_base_path / "final-model.pt")
    berlin = Sentence("I love Berlin")
    blank = Sentence(" ")
    reloaded.predict(berlin)
    reloaded.predict([berlin, blank])
    reloaded.predict([blank])

    # clean up results directory
    shutil.rmtree(results_base_path)
def train(self, training_dir=None):
    """Train a Hebrew NER tagger (AlephBERT embeddings + CRF) and keep it on self.model.

    :param training_dir: folder containing a "data" subfolder with
        sent_train.txt / sent_test.txt / sent_dev.txt; defaults to <script_dir>/flair/
    """
    from flair.trainers import ModelTrainer

    if training_dir is None:
        training_dir = script_dir + "flair" + os.sep

    columns = {0: "text", 1: "ner"}
    data_folder = training_dir + "data"

    # training data should be unescaped, i.e. raw "&" tokens rather than HTML entities
    corpus: Corpus = ColumnCorpus(
        data_folder,
        columns,
        train_file="sent_train.txt",
        test_file="sent_test.txt",
        dev_file="sent_dev.txt",
    )
    print(corpus)

    tag_type = "ner"
    ner_dict = corpus.make_tag_dictionary(tag_type=tag_type)
    print(ner_dict)

    alephbert: TransformerWordEmbeddings = TransformerWordEmbeddings('onlplab/alephbert-base')
    tagger: SequenceTagger = SequenceTagger(
        hidden_size=128,
        embeddings=alephbert,
        tag_dictionary=ner_dict,
        tag_type=tag_type,
        use_crf=True,
    )
    trainer: ModelTrainer = ModelTrainer(tagger, corpus)
    trainer.train(training_dir, learning_rate=0.1, mini_batch_size=32, max_epochs=50)
    self.model = tagger
def test_find_learning_rate(results_base_path, tasks_base_path):
    """Run the learning-rate finder for a few iterations with an SGD optimizer."""
    fashion = NLPTaskDataFetcher.load_corpus(NLPTask.FASHION, base_path=tasks_base_path)
    ner_dict = fashion.make_tag_dictionary('ner')
    turian = WordEmbeddings('turian')
    tagger: SequenceTagger = SequenceTagger(hidden_size=64,
                                            embeddings=turian,
                                            tag_dictionary=ner_dict,
                                            tag_type='ner',
                                            use_crf=False)
    # optimizer is passed as a class at trainer construction time
    optimizer: Optimizer = SGD
    trainer: ModelTrainer = ModelTrainer(tagger, fashion, optimizer=optimizer)
    trainer.find_learning_rate(results_base_path, iterations=5)

    # clean up results directory
    shutil.rmtree(results_base_path)
def __init__(self, args, model_name, load_model=False):
    """Set up a UPOS tagger over a Universal Dependencies corpus.

    :param args: command-line arguments forwarded to the base class
    :param model_name: model identifier forwarded to the base class
    :param load_model: when True, skip building a new tagger and only refresh
        the tag dictionary on the existing model
    """
    super().__init__(args, model_name, load_model)
    # languages the embeddings do not cover
    self.embeds_unsupported_langs = ("am", "vi")
    (data_folder, train_file, test_file, dev_file) = self.format_data("train")
    # split_multiwords=False keeps multiword tokens as single corpus units
    self.corpus = UniversalDependenciesCorpus(data_folder, train_file, test_file, dev_file, split_multiwords=False)
    dictionary = self.corpus.make_tag_dictionary("upos")
    if not load_model:
        embeddings = self.get_embeddings()
        self.model = SequenceTagger(hidden_size=256, embeddings=embeddings, tag_dictionary=dictionary, tag_type="upos", rnn_layers=2, use_crf=True)
    else:
        # NOTE(review): assumes super().__init__ populated self.model when
        # load_model is True — confirm, otherwise this raises AttributeError
        self.model.tag_dictionary = dictionary
def test_train_load_use_tagger_large(results_base_path, tasks_base_path):
    """Train a POS tagger on 5% of UD English (fetcher API), reload and predict."""
    ud_english = NLPTaskDataFetcher.load_corpus(NLPTask.UD_ENGLISH).downsample(0.05)
    pos_dict = ud_english.make_tag_dictionary("pos")
    turian = WordEmbeddings("turian")
    tagger: SequenceTagger = SequenceTagger(
        hidden_size=64,
        embeddings=turian,
        tag_dictionary=pos_dict,
        tag_type="pos",
        use_crf=False,
    )
    trainer: ModelTrainer = ModelTrainer(tagger, ud_english)
    trainer.train(
        results_base_path,
        EvaluationMetric.MICRO_F1_SCORE,
        learning_rate=0.1,
        mini_batch_size=32,
        max_epochs=2,
        test_mode=True,
    )

    # round-trip through disk, then predict on normal and empty inputs
    reloaded: SequenceTagger = SequenceTagger.load_from_file(
        results_base_path / "final-model.pt"
    )
    berlin = Sentence("I love Berlin")
    blank = Sentence(" ")
    reloaded.predict(berlin)
    reloaded.predict([berlin, blank])
    reloaded.predict([blank])

    # clean up results directory
    shutil.rmtree(results_base_path)
def test_train_optimizer_arguments(results_base_path, tasks_base_path):
    """Train with AdamW plus an extra optimizer kwarg (weight_decay), then reload."""
    fashion = NLPTaskDataFetcher.load_corpus(NLPTask.FASHION, base_path=tasks_base_path)
    ner_dict = fashion.make_tag_dictionary('ner')
    glove = WordEmbeddings('glove')
    tagger: SequenceTagger = SequenceTagger(hidden_size=64,
                                            embeddings=glove,
                                            tag_dictionary=ner_dict,
                                            tag_type='ner',
                                            use_crf=False)
    optimizer: Optimizer = AdamW
    trainer: ModelTrainer = ModelTrainer(tagger, fashion, optimizer=optimizer)
    # weight_decay is forwarded through train() to the optimizer constructor
    trainer.train(results_base_path,
                  EvaluationMetric.MICRO_F1_SCORE,
                  learning_rate=0.1,
                  mini_batch_size=2,
                  max_epochs=2,
                  test_mode=True,
                  weight_decay=1e-3)

    # round-trip through disk, then predict on normal and empty inputs
    reloaded: SequenceTagger = SequenceTagger.load_from_file(
        results_base_path / 'final-model.pt')
    berlin = Sentence('I love Berlin')
    blank = Sentence(' ')
    reloaded.predict(berlin)
    reloaded.predict([berlin, blank])
    reloaded.predict([blank])

    # clean up results directory
    shutil.rmtree(results_base_path)
def test_find_learning_rate(results_base_path, tasks_base_path):
    """Run the LR finder on the fashion corpus, handing SGD to find_learning_rate."""
    fashion = flair.datasets.ColumnCorpus(
        data_folder=tasks_base_path / "fashion",
        column_format={0: "text", 3: "ner"},
    )
    ner_dict = fashion.make_label_dictionary("ner")
    tagger: SequenceTagger = SequenceTagger(
        hidden_size=64,
        embeddings=turian_embeddings,
        tag_dictionary=ner_dict,
        tag_type="ner",
        use_crf=False,
    )
    trainer: ModelTrainer = ModelTrainer(tagger, fashion)
    trainer.find_learning_rate(results_base_path, optimizer=SGD, iterations=5)
    del trainer, tagger, ner_dict, fashion
def test_train_resume_sequence_tagging_training(results_base_path, tasks_base_path):
    """Train with checkpointing on two corpora, then resume via the checkpoint file."""
    multi = NLPTaskDataFetcher.load_corpora(
        [NLPTask.FASHION, NLPTask.GERMEVAL], base_path=tasks_base_path)
    ner_dict = multi.make_tag_dictionary('ner')
    glove = WordEmbeddings('glove')
    model = SequenceTagger(hidden_size=64,
                           embeddings=glove,
                           tag_dictionary=ner_dict,
                           tag_type='ner',
                           use_crf=False)

    # first leg: writes checkpoint.pt into the results directory
    trainer = ModelTrainer(model, multi)
    trainer.train(results_base_path, max_epochs=2, test_mode=True, checkpoint=True)

    # second leg: rebuild the trainer from the checkpoint and keep training
    trainer = ModelTrainer.load_from_checkpoint(
        results_base_path / 'checkpoint.pt', 'SequenceTagger', multi)
    trainer.train(results_base_path, max_epochs=2, test_mode=True, checkpoint=True)
    shutil.rmtree(results_base_path)
def test_train_charlm_nochache_load_use_tagger(results_base_path, tasks_base_path):
    """Train with an uncached CharLM embedding via the old trainer API, then reload."""
    fashion = NLPTaskDataFetcher.fetch_data(NLPTask.FASHION, base_path=tasks_base_path)
    ner_dict = fashion.make_tag_dictionary('ner')
    # use_cache=False forces the embeddings to be recomputed rather than read from cache
    charlm = CharLMEmbeddings('news-forward-fast', use_cache=False)
    tagger: SequenceTagger = SequenceTagger(hidden_size=256,
                                            embeddings=charlm,
                                            tag_dictionary=ner_dict,
                                            tag_type='ner',
                                            use_crf=False)
    trainer: SequenceTaggerTrainer = SequenceTaggerTrainer(tagger, fashion, test_mode=True)
    trainer.train(str(results_base_path),
                  learning_rate=0.1,
                  mini_batch_size=2,
                  max_epochs=2)

    # round-trip through disk, then predict on normal and empty inputs
    reloaded: SequenceTagger = SequenceTagger.load_from_file(
        results_base_path / 'final-model.pt')
    berlin = Sentence('I love Berlin')
    blank = Sentence(' ')
    reloaded.predict(berlin)
    reloaded.predict([berlin, blank])
    reloaded.predict([blank])

    # clean up results directory
    shutil.rmtree(results_base_path)
def test_train_load_use_tagger_large(results_base_path, tasks_base_path):
    """Train a POS tagger on downsampled UD English with module-level turian embeddings."""
    ud_english = flair.datasets.UD_ENGLISH().downsample(0.05)
    pos_dict = ud_english.make_tag_dictionary("pos")
    tagger: SequenceTagger = SequenceTagger(
        hidden_size=64,
        embeddings=turian_embeddings,
        tag_dictionary=pos_dict,
        tag_type="pos",
        use_crf=False,
    )
    trainer: ModelTrainer = ModelTrainer(tagger, ud_english)
    trainer.train(
        results_base_path,
        learning_rate=0.1,
        mini_batch_size=32,
        max_epochs=2,
        shuffle=False,
    )
    # drop the training objects before reloading from disk
    del trainer, tagger, pos_dict, ud_english

    reloaded: SequenceTagger = SequenceTagger.load(results_base_path / "final-model.pt")
    berlin = Sentence("I love Berlin")
    blank = Sentence(" ")
    reloaded.predict(berlin)
    reloaded.predict([berlin, blank])
    reloaded.predict([blank])

    # clean up results directory
    shutil.rmtree(results_base_path)
    del reloaded
def test_train_resume_tagger(results_base_path, tasks_base_path):
    """Train 2 epochs with checkpointing, then resume the checkpoint until epoch 4."""
    fashion = flair.datasets.ColumnCorpus(
        data_folder=tasks_base_path / "fashion",
        column_format={0: "text", 3: "ner"},
    )
    germeval = flair.datasets.NER_GERMAN_GERMEVAL(base_path=tasks_base_path).downsample(0.1)
    multi = MultiCorpus([fashion, germeval])
    ner_dict = multi.make_label_dictionary("ner")
    model: SequenceTagger = SequenceTagger(
        hidden_size=64,
        embeddings=turian_embeddings,
        tag_dictionary=ner_dict,
        tag_type="ner",
        use_crf=False,
    )

    # first leg: two epochs, checkpoint written to disk
    trainer = ModelTrainer(model, multi)
    trainer.train(results_base_path, max_epochs=2, shuffle=False, checkpoint=True)
    del model

    # second leg: reload the checkpoint model and continue until epoch 4
    checkpoint_model = SequenceTagger.load(results_base_path / "checkpoint.pt")
    trainer.resume(model=checkpoint_model, max_epochs=4)

    del trainer
def test_train_charlm_changed_chache_load_use_tagger(results_base_path, tasks_base_path):
    """Train with a custom (temporary) embedding cache directory, reload and predict."""
    fashion = NLPTaskDataFetcher.load_corpus(NLPTask.FASHION, base_path=tasks_base_path)
    ner_dict = fashion.make_tag_dictionary('ner')

    # temporary cache directory, removed again once training is done
    cache_dir = results_base_path / 'cache'
    os.makedirs(cache_dir, exist_ok=True)
    charlm = FlairEmbeddings('news-forward-fast', cache_directory=cache_dir)

    tagger: SequenceTagger = SequenceTagger(hidden_size=256,
                                            embeddings=charlm,
                                            tag_dictionary=ner_dict,
                                            tag_type='ner',
                                            use_crf=False)
    trainer: ModelTrainer = ModelTrainer(tagger, fashion)
    trainer.train(results_base_path,
                  EvaluationMetric.MACRO_ACCURACY,
                  learning_rate=0.1,
                  mini_batch_size=2,
                  max_epochs=2,
                  test_mode=True)

    # remove the cache directory
    shutil.rmtree(cache_dir)

    # round-trip through disk, then predict on normal and empty inputs
    reloaded: SequenceTagger = SequenceTagger.load_from_file(
        results_base_path / 'final-model.pt')
    berlin = Sentence('I love Berlin')
    blank = Sentence(' ')
    reloaded.predict(berlin)
    reloaded.predict([berlin, blank])
    reloaded.predict([blank])

    # clean up results directory
    shutil.rmtree(results_base_path)
def train_rus_tagger():
    """Train a Russian UPOS tagger on UD_RUSSIAN using local fastText vectors."""
    corpus = flair.datasets.UD_RUSSIAN()
    tag_type = 'upos'
    upos_dict = corpus.make_tag_dictionary(tag_type=tag_type)
    # locally trained fastText vectors, wrapped in a stack for uniformity
    stacked: StackedEmbeddings = StackedEmbeddings(embeddings=[
        FastTextEmbeddings('/home/danielly/workspace/trained_pos_models/ru/ru.bin')
    ])
    tagger: SequenceTagger = SequenceTagger(hidden_size=256,
                                            embeddings=stacked,
                                            tag_dictionary=upos_dict,
                                            tag_type=tag_type,
                                            use_crf=True)
    ModelTrainer(tagger, corpus).train('/home/danielly/workspace',
                                       learning_rate=0.1,
                                       mini_batch_size=32,
                                       max_epochs=150)
def test_train_load_use_tagger_multicorpus(results_base_path, tasks_base_path):
    """Train on FASHION+GERMEVAL with GloVe embeddings, then reload and predict."""
    multi = NLPTaskDataFetcher.load_corpora(
        [NLPTask.FASHION, NLPTask.GERMEVAL], base_path=tasks_base_path)
    ner_dict = multi.make_tag_dictionary('ner')
    glove = WordEmbeddings('glove')
    tagger = SequenceTagger(hidden_size=64,
                            embeddings=glove,
                            tag_dictionary=ner_dict,
                            tag_type='ner',
                            use_crf=False)
    trainer = ModelTrainer(tagger, multi)
    trainer.train(results_base_path,
                  learning_rate=0.1,
                  mini_batch_size=2,
                  max_epochs=2,
                  test_mode=True)

    # round-trip through disk, then predict on normal and empty inputs
    reloaded = SequenceTagger.load_from_file(results_base_path / 'final-model.pt')
    berlin = Sentence('I love Berlin')
    blank = Sentence(' ')
    reloaded.predict(berlin)
    reloaded.predict([berlin, blank])
    reloaded.predict([blank])
    shutil.rmtree(results_base_path)
# Continuation of a training script: `corpus` and `tag_type` are defined earlier
# (outside this fragment). Builds a BytePair-embedding CRF tagger and trains it.
tag_dictionary = corpus.make_tag_dictionary(tag_type=tag_type)
print(tag_dictionary)

# 4. initialize embeddings
embedding_types: List[TokenEmbeddings] = [
    #WordEmbeddings('glove'),
    BytePairEmbeddings('en')
]

embeddings: StackedEmbeddings = StackedEmbeddings(embeddings=embedding_types)

# 5. initialize sequence tagger
from flair.models import SequenceTagger

tagger: SequenceTagger = SequenceTagger(hidden_size=256,
                                        embeddings=embeddings,
                                        tag_dictionary=tag_dictionary,
                                        tag_type=tag_type,
                                        use_crf=True)

# 6. initialize trainer
from flair.trainers import ModelTrainer

trainer: ModelTrainer = ModelTrainer(tagger, corpus)

# 7. start training
trainer.train('resources/taggers/bpe',
              learning_rate=0.1,
              mini_batch_size=32,
              max_epochs=100)
def train_sequence_labeling_model(data_folder, proposed_tags_vocabulary_size, skf_split_no):
    """
    Trains the sequence labeling model (by default model uses one RNN layer).
    Model is trained to predict part of speech tag and takes into account information about:
    - text (plain text made of tokens that together form a sentence),
    - occurrence of separator before token,
    - proposed tags for given token.
    It is trained with use of Stacked Embeddings used to combine different embeddings together.
    Words are embedded using a concatenation of two vector embeddings:
    - Flair Embeddings - contextual string embeddings that capture latent syntactic-semantic
      information that goes beyond standard word embeddings. Key differences are: (1) they are
      trained without any explicit notion of words and thus fundamentally model words as
      sequences of characters. And (2) they are contextualized by their surrounding text,
      meaning that the same word will have different embeddings depending on its contextual
      use. There are forward (that goes through the given on input plain text form left to
      right) and backward model (that goes through the given on input plain text form right
      to left) used for part of speech (pos) tag training.
    - One Hot Embeddings - embeddings that encode each word in a vocabulary as a one-hot
      vector, followed by an embedding layer. These embeddings thus do not encode any prior
      knowledge as do most other embeddings. They also differ in that they require to see a
      Corpus during instantiation, so they can build up a vocabulary consisting of the most
      common words seen in the corpus, plus an UNK token for all rare words. There are two
      One Hot Embeddings used in training:
      - first to embed information about occurrence of separator before token,
      - second to embed information about concatenated with a ';' proposed tags.
    Model and training logs are saved in resources/taggers/example-pos directory.
    This is the method where internal states of forward and backward Flair models are taken
    at the end of each token and, supplemented by information about occurrence of separator
    before token and proposed tags for given token used to train model for one of stratified
    10 fold cross validation splits.

    :param data_folder: folder where files with column corpus split are stored. Those columns
        are used to initialize ColumnCorpus object
    :param proposed_tags_vocabulary_size: number of proposed tags
    :param skf_split_no: number that indicates one of stratified 10 fold cross validation
        splits (from range 1 to 10) used to train the model
    """
    # define columns
    columns = {0: 'text', 1: 'pos', 2: 'is_separator', 3: 'proposed_tags'}
    # init a corpus using column format, data folder and the names of the train and test files
    # 1. get the corpus
    corpus: Corpus = ColumnCorpus(data_folder, columns,
                                  train_file='train_' + str(skf_split_no),
                                  test_file='test_' + str(skf_split_no),
                                  dev_file=None)
    log.info(corpus)
    # 2. what tag do we want to predict
    tag_type = 'pos'
    # 3. make the tag dictionary from the corpus
    tag_dictionary = corpus.make_tag_dictionary(tag_type=tag_type)
    log.info(tag_dictionary)
    # 4. initialize embeddings
    embedding_types: List[TokenEmbeddings] = [
        FlairEmbeddings('pl-forward', chars_per_chunk=64),
        FlairEmbeddings('pl-backward', chars_per_chunk=64),
        OneHotEmbeddings(corpus=corpus, field='is_separator',
                         embedding_length=3, min_freq=3),
        # embedding length grows with the 4th root of the proposed-tag vocabulary size
        OneHotEmbeddings(corpus=corpus, field='proposed_tags',
                         embedding_length=math.ceil(
                             (proposed_tags_vocabulary_size + 1)**0.25),
                         min_freq=3)
    ]
    embeddings: StackedEmbeddings = StackedEmbeddings(embeddings=embedding_types)
    # 5. initialize sequence tagger
    tagger: SequenceTagger = SequenceTagger(hidden_size=256,
                                            embeddings=embeddings,
                                            tag_dictionary=tag_dictionary,
                                            tag_type=tag_type,
                                            use_crf=False,
                                            rnn_layers=1)
    # 6. initialize trainer
    trainer: ModelTrainer = ModelTrainer(tagger, corpus)
    # 7. start training (runs until manually stopped: max_epochs=sys.maxsize)
    trainer.train(
        use_scratch_dir_if_available('resources/taggers/example-pos/it-' + str(skf_split_no)),
        learning_rate=0.1,
        mini_batch_size=32,
        embeddings_storage_mode='gpu',
        max_epochs=sys.maxsize,
        monitor_test=True)
    # 8. plot weight traces (optional)
    plotter = Plotter()
    plotter.plot_weights(
        use_scratch_dir_if_available('resources/taggers/example-pos/it-' + str(skf_split_no)
                                     + '/weights.txt'))