def test_train_resume_text_classification_training(results_base_path, tasks_base_path):
    corpus = NLPTaskDataFetcher.load_corpus('imdb', base_path=tasks_base_path)
    label_dict = corpus.make_label_dictionary()

    embeddings: TokenEmbeddings = FlairEmbeddings('news-forward-fast', use_cache=False)
    document_embeddings: DocumentLSTMEmbeddings = DocumentLSTMEmbeddings(
        [embeddings], 128, 1, False)

    model = TextClassifier(document_embeddings, label_dict, False)

    trainer = ModelTrainer(model, corpus)
    trainer.train(results_base_path, max_epochs=2, test_mode=True, checkpoint=True)

    trainer = ModelTrainer.load_from_checkpoint(
        results_base_path / 'checkpoint.pt', 'TextClassifier', corpus)
    trainer.train(results_base_path, max_epochs=2, test_mode=True, checkpoint=True)

    # clean up results directory
    shutil.rmtree(results_base_path)

def test_train_load_use_tagger_disjunct_tags(results_base_path, tasks_base_path):
    corpus = flair.datasets.ColumnCorpus(
        data_folder=tasks_base_path / "fashion_disjunct",
        column_format={0: "text", 3: "ner"},
    )
    tag_dictionary = corpus.make_label_dictionary("ner")

    tagger: SequenceTagger = SequenceTagger(
        hidden_size=64,
        embeddings=turian_embeddings,
        tag_dictionary=tag_dictionary,
        tag_type="ner",
        use_crf=False,
        allow_unk_predictions=True,
    )

    # initialize trainer
    trainer: ModelTrainer = ModelTrainer(tagger, corpus)
    trainer.train(
        results_base_path,
        learning_rate=0.1,
        mini_batch_size=2,
        max_epochs=2,
        shuffle=False,
    )

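# Note: `turian_embeddings` is used above (and in the UD_ENGLISH tagger test further
# below) but never defined in these snippets; it is presumably a shared module-level
# embedding or pytest fixture. A minimal sketch, assuming the small "turian" word
# embeddings used elsewhere in this file:
turian_embeddings = WordEmbeddings("turian")
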
def test_find_learning_rate(results_base_path, tasks_base_path):
    corpus = flair.datasets.ColumnCorpus(
        data_folder=tasks_base_path / "fashion", column_format={0: "text", 2: "ner"}
    )
    tag_dictionary = corpus.make_tag_dictionary("ner")

    embeddings = WordEmbeddings("turian")

    tagger: SequenceTagger = SequenceTagger(
        hidden_size=64,
        embeddings=embeddings,
        tag_dictionary=tag_dictionary,
        tag_type="ner",
        use_crf=False,
    )

    optimizer: Optimizer = SGD

    # initialize trainer
    trainer: ModelTrainer = ModelTrainer(tagger, corpus, optimizer=optimizer)
    trainer.find_learning_rate(results_base_path, iterations=5)

    # clean up results directory
    shutil.rmtree(results_base_path)

def test_train_charlm_load_use_tagger(results_base_path, tasks_base_path):
    corpus = flair.datasets.ColumnCorpus(
        data_folder=tasks_base_path / 'fashion', column_format={0: 'text', 2: 'ner'}
    )
    tag_dictionary = corpus.make_tag_dictionary('ner')

    embeddings = FlairEmbeddings('news-forward-fast')

    tagger = SequenceTagger(
        hidden_size=64,
        embeddings=embeddings,
        tag_dictionary=tag_dictionary,
        tag_type='ner',
        use_crf=False,
    )

    trainer = ModelTrainer(tagger, corpus)
    trainer.train(
        results_base_path,
        learning_rate=0.1,
        mini_batch_size=2,
        max_epochs=2,
        shuffle=False,
    )

    loaded_model = SequenceTagger.load(results_base_path / 'final-model.pt')

    sentence = Sentence('I love Berlin')
    sentence_empty = Sentence(' ')

    loaded_model.predict(sentence)
    loaded_model.predict([sentence, sentence_empty])
    loaded_model.predict([sentence_empty])

    # clean up results directory
    shutil.rmtree(results_base_path)

def test_train_load_use_classifier_with_prob(results_base_path, tasks_base_path):
    corpus = flair.datasets.ClassificationCorpus(tasks_base_path / "imdb")
    label_dict = corpus.make_label_dictionary()

    word_embedding: WordEmbeddings = WordEmbeddings("turian")
    document_embeddings: DocumentRNNEmbeddings = DocumentRNNEmbeddings(
        [word_embedding], 128, 1, False, 64, False, False
    )

    model = TextClassifier(document_embeddings, label_dict, False)

    trainer = ModelTrainer(model, corpus)
    trainer.train(results_base_path, max_epochs=2, shuffle=False)

    sentence = Sentence("Berlin is a really nice city.")

    for s in model.predict(sentence, multi_class_prob=True):
        for l in s.labels:
            assert l.value is not None
            assert 0.0 <= l.score <= 1.0
            assert type(l.score) is float

    loaded_model = TextClassifier.load(results_base_path / "final-model.pt")

    sentence = Sentence("I love Berlin")
    sentence_empty = Sentence(" ")

    loaded_model.predict(sentence, multi_class_prob=True)
    loaded_model.predict([sentence, sentence_empty], multi_class_prob=True)
    loaded_model.predict([sentence_empty], multi_class_prob=True)

    # clean up results directory
    shutil.rmtree(results_base_path)

def train():
    columns = {0: 'text', 1: 'pos'}

    # init a corpus using column format, data folder and the names of the train, dev and test files
    corpus: Corpus = ColumnCorpus('', columns,
                                  train_file=args.train,
                                  test_file=args.test,
                                  dev_file=args.dev)
    tag_dictionary = corpus.make_tag_dictionary(tag_type='pos')

    # initialize embeddings
    embedding_types: List[TokenEmbeddings] = [
        CharacterEmbeddings(),
        FlairEmbeddings('news-forward'),
        FlairEmbeddings('news-backward'),
    ]
    embeddings: StackedEmbeddings = StackedEmbeddings(embeddings=embedding_types)

    tagger: SequenceTagger = SequenceTagger(hidden_size=256,
                                            embeddings=embeddings,
                                            tag_dictionary=tag_dictionary,
                                            tag_type='pos',
                                            use_crf=True)

    trainer: ModelTrainer = ModelTrainer(tagger, corpus)
    trainer.train(args.model, learning_rate=0.1, mini_batch_size=32, max_epochs=150)

def test_train_optimizer_arguments(results_base_path, tasks_base_path):
    corpus = flair.datasets.ColumnCorpus(
        data_folder=tasks_base_path / 'fashion', column_format={0: 'text', 2: 'ner'}
    )
    tag_dictionary = corpus.make_tag_dictionary('ner')

    embeddings = WordEmbeddings('turian')

    tagger = SequenceTagger(
        hidden_size=64,
        embeddings=embeddings,
        tag_dictionary=tag_dictionary,
        tag_type='ner',
        use_crf=False,
    )

    optimizer = AdamW

    trainer = ModelTrainer(tagger, corpus, optimizer=optimizer)
    trainer.train(
        results_base_path,
        learning_rate=0.1,
        mini_batch_size=2,
        max_epochs=2,
        shuffle=False,
        weight_decay=0.001,
    )

    loaded_model = SequenceTagger.load(results_base_path / 'final-model.pt')

    sentence = Sentence('I love Berlin')
    sentence_empty = Sentence(' ')

    loaded_model.predict(sentence)
    loaded_model.predict([sentence, sentence_empty])
    loaded_model.predict([sentence_empty])

    # clean up results directory
    shutil.rmtree(results_base_path)

def test_train_resume_text_classification_training(results_base_path, tasks_base_path):
    corpus = flair.datasets.ClassificationCorpus(tasks_base_path / "imdb")
    label_dict = corpus.make_label_dictionary()

    # document_embeddings: DocumentRNNEmbeddings = DocumentRNNEmbeddings(
    #     [flair_embeddings], 128, 1, False
    # )

    model = TextClassifier(document_embeddings, label_dict, False)

    trainer = ModelTrainer(model, corpus)
    trainer.train(results_base_path, max_epochs=2, shuffle=False, checkpoint=True)

    del trainer, model

    trainer = ModelTrainer.load_checkpoint(results_base_path / "checkpoint.pt", corpus)
    trainer.train(results_base_path, max_epochs=2, shuffle=False, checkpoint=True)

    # clean up results directory
    shutil.rmtree(results_base_path)
    del trainer

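# Note: `document_embeddings` (and the commented-out `flair_embeddings`) is referenced
# in the test above and in several classifier tests below without being defined here;
# it is presumably a shared module-level object or pytest fixture. A minimal sketch,
# assuming the same turian-based DocumentRNNEmbeddings setup used by the other
# classifier tests in this file:
word_embedding = WordEmbeddings("turian")
document_embeddings = DocumentRNNEmbeddings([word_embedding], 128, 1, False)
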
def test_train_optimizer_arguments(results_base_path, tasks_base_path):
    corpus = NLPTaskDataFetcher.load_corpus(NLPTask.FASHION, base_path=tasks_base_path)
    tag_dictionary = corpus.make_tag_dictionary('ner')

    embeddings = WordEmbeddings('glove')

    tagger: SequenceTagger = SequenceTagger(hidden_size=256,
                                            embeddings=embeddings,
                                            tag_dictionary=tag_dictionary,
                                            tag_type='ner',
                                            use_crf=False)

    optimizer: Optimizer = AdamW

    # initialize trainer
    trainer: ModelTrainer = ModelTrainer(tagger, corpus, optimizer=optimizer)
    trainer.train(results_base_path,
                  EvaluationMetric.MICRO_F1_SCORE,
                  learning_rate=0.1,
                  mini_batch_size=2,
                  max_epochs=2,
                  test_mode=True,
                  weight_decay=1e-3)

    loaded_model: SequenceTagger = SequenceTagger.load_from_file(
        results_base_path / 'final-model.pt')

    sentence = Sentence('I love Berlin')
    sentence_empty = Sentence(' ')

    loaded_model.predict(sentence)
    loaded_model.predict([sentence, sentence_empty])
    loaded_model.predict([sentence_empty])

    # clean up results directory
    shutil.rmtree(results_base_path)

def train(self):
    path = "./src/tmp/"
    self.training_data = self.convert_format(self.training_data)

    corpus: Corpus = ColumnCorpus(".", {0: 'text', 1: 'ner'},
                                  train_file=self.training_data)
    tag_dictionary = corpus.make_tag_dictionary(tag_type='ner')

    embedding_types: List[TokenEmbeddings] = [
        WordEmbeddings('fr'),
        FlairEmbeddings('fr-forward'),
        FlairEmbeddings('fr-backward'),
    ]
    embeddings: StackedEmbeddings = StackedEmbeddings(embeddings=embedding_types)

    tagger: SequenceTagger = SequenceTagger(hidden_size=256,
                                            embeddings=embeddings,
                                            tag_dictionary=tag_dictionary,
                                            tag_type='ner',
                                            use_crf=True)

    self.trainer = ModelTrainer(tagger, corpus)
    save_path = path + self.model_name
    self.trainer.train(save_path,
                       learning_rate=self.learning_rate,
                       mini_batch_size=self.batch_size,
                       max_epochs=self.nb_iter,
                       embeddings_storage_mode=self.mode)
    self.is_ready = 1

def test_train_classifier_with_sampler(results_base_path, tasks_base_path):
    corpus = flair.datasets.ClassificationCorpus(tasks_base_path / "imdb")
    label_dict = corpus.make_label_dictionary()

    model: TextClassifier = TextClassifier(document_embeddings, label_dict, False)

    trainer = ModelTrainer(model, corpus)
    trainer.train(
        results_base_path,
        max_epochs=2,
        shuffle=False,
        sampler=ImbalancedClassificationDatasetSampler,
    )

    sentence = Sentence("Berlin is a really nice city.")

    for s in model.predict(sentence):
        for l in s.labels:
            assert l.value is not None
            assert 0.0 <= l.score <= 1.0
            assert type(l.score) is float

    del trainer, model, corpus
    loaded_model = TextClassifier.load(results_base_path / "final-model.pt")

    # clean up results directory
    shutil.rmtree(results_base_path)
    del loaded_model

def test_train_resume_sequence_tagging_training(results_base_path, tasks_base_path):
    corpus = NLPTaskDataFetcher.load_corpora(
        [NLPTask.FASHION, NLPTask.GERMEVAL], base_path=tasks_base_path)
    tag_dictionary = corpus.make_tag_dictionary("ner")

    embeddings = WordEmbeddings("turian")

    model: SequenceTagger = SequenceTagger(
        hidden_size=64,
        embeddings=embeddings,
        tag_dictionary=tag_dictionary,
        tag_type="ner",
        use_crf=False,
    )

    trainer = ModelTrainer(model, corpus)
    trainer.train(results_base_path, max_epochs=2, test_mode=True, checkpoint=True)

    checkpoint = SequenceTagger.load_checkpoint(results_base_path / "checkpoint.pt")
    trainer = ModelTrainer.load_from_checkpoint(checkpoint, corpus)
    trainer.train(results_base_path, max_epochs=2, test_mode=True, checkpoint=True)

    # clean up results directory
    shutil.rmtree(results_base_path)

def test_train_load_use_classifier_multi_label(results_base_path, tasks_base_path):
    # corpus = NLPTaskDataFetcher.load_corpus('multi_class', base_path=tasks_base_path)
    corpus = NLPTaskDataFetcher.load_classification_corpus(
        data_folder=tasks_base_path / "multi_class")
    label_dict = corpus.make_label_dictionary()

    word_embedding: WordEmbeddings = WordEmbeddings("turian")
    document_embeddings = DocumentRNNEmbeddings(
        embeddings=[word_embedding],
        hidden_size=32,
        reproject_words=False,
        bidirectional=False,
    )

    model = TextClassifier(document_embeddings, label_dict, multi_label=True)

    trainer = ModelTrainer(model, corpus)
    trainer.train(
        results_base_path,
        EvaluationMetric.MICRO_F1_SCORE,
        mini_batch_size=1,
        max_epochs=100,
        test_mode=True,
        checkpoint=False,
    )

    sentence = Sentence("apple tv")

    for s in model.predict(sentence):
        for l in s.labels:
            print(l)
            assert l.value is not None
            assert 0.0 <= l.score <= 1.0
            assert type(l.score) is float

    sentence = Sentence("apple tv")

    for s in model.predict(sentence):
        assert "apple" in sentence.get_label_names()
        assert "tv" in sentence.get_label_names()

        for l in s.labels:
            print(l)
            assert l.value is not None
            assert 0.0 <= l.score <= 1.0
            assert type(l.score) is float

    loaded_model = TextClassifier.load(results_base_path / "final-model.pt")

    sentence = Sentence("I love Berlin")
    sentence_empty = Sentence(" ")

    loaded_model.predict(sentence)
    loaded_model.predict([sentence, sentence_empty])
    loaded_model.predict([sentence_empty])

    # clean up results directory
    shutil.rmtree(results_base_path)

def test_train_load_use_classifier(results_base_path, tasks_base_path):
    corpus = NLPTaskDataFetcher.load_corpus("imdb", base_path=tasks_base_path)
    label_dict = corpus.make_label_dictionary()

    word_embedding: WordEmbeddings = WordEmbeddings("turian")
    document_embeddings: DocumentRNNEmbeddings = DocumentRNNEmbeddings(
        [word_embedding], 128, 1, False, 64, False, False)

    model = TextClassifier(document_embeddings, label_dict, False)

    trainer = ModelTrainer(model, corpus)
    trainer.train(results_base_path, EvaluationMetric.MICRO_F1_SCORE,
                  max_epochs=2, test_mode=True)

    sentence = Sentence("Berlin is a really nice city.")

    for s in model.predict(sentence):
        for l in s.labels:
            assert l.value is not None
            assert 0.0 <= l.score <= 1.0
            assert type(l.score) is float

    loaded_model = TextClassifier.load(results_base_path / "final-model.pt")

    sentence = Sentence("I love Berlin")
    sentence_empty = Sentence(" ")

    loaded_model.predict(sentence)
    loaded_model.predict([sentence, sentence_empty])
    loaded_model.predict([sentence_empty])

    # clean up results directory
    shutil.rmtree(results_base_path)

def test_train_resume_classifier(results_base_path, tasks_base_path):
    corpus = flair.datasets.ClassificationCorpus(tasks_base_path / "imdb",
                                                 label_type="topic")
    label_dict = corpus.make_label_dictionary(label_type="topic")

    model = TextClassifier(document_embeddings=document_embeddings,
                           label_dictionary=label_dict,
                           multi_label=False,
                           label_type="topic")

    # train model for 2 epochs
    trainer = ModelTrainer(model, corpus)
    trainer.train(results_base_path, max_epochs=2, shuffle=False, checkpoint=True)

    del model

    # load the checkpoint model and train until epoch 4
    checkpoint_model = TextClassifier.load(results_base_path / "checkpoint.pt")
    trainer.resume(model=checkpoint_model, max_epochs=4)

    # clean up results directory
    shutil.rmtree(results_base_path)
    del trainer

def test_train_charlm_nocache_load_use_classifier(results_base_path, tasks_base_path):
    corpus = NLPTaskDataFetcher.load_corpus(NLPTask.IMDB, base_path=tasks_base_path)
    label_dict = corpus.make_label_dictionary()

    glove_embedding: TokenEmbeddings = FlairEmbeddings('news-forward-fast', use_cache=False)
    document_embeddings: DocumentLSTMEmbeddings = DocumentLSTMEmbeddings(
        [glove_embedding], 128, 1, False, 64, False, False)

    model = TextClassifier(document_embeddings, label_dict, False)

    trainer = ModelTrainer(model, corpus)
    trainer.train(results_base_path, max_epochs=2, test_mode=True)

    sentence = Sentence("Berlin is a really nice city.")

    for s in model.predict(sentence):
        for l in s.labels:
            assert l.value is not None
            assert 0.0 <= l.score <= 1.0
            assert type(l.score) is float

    loaded_model = TextClassifier.load_from_file(results_base_path / 'final-model.pt')

    sentence = Sentence('I love Berlin')
    sentence_empty = Sentence(' ')

    loaded_model.predict(sentence)
    loaded_model.predict([sentence, sentence_empty])
    loaded_model.predict([sentence_empty])

    # clean up results directory
    shutil.rmtree(results_base_path)

def build_and_train_conll03en_flair_sequence_tagger(corpus, tag_type, tag_dictionary):
    '''
    do not change! same configuration as described in
    file: "flair/resources/docs/EXPERIMENTS.md"
    section: "CoNLL-03 Named Entity Recognition (English)"
    '''
    embeddings: StackedEmbeddings = StackedEmbeddings(
        embeddings=[
            WordEmbeddings("glove"),
            PooledFlairEmbeddings("news-forward", pooling="min"),
            PooledFlairEmbeddings("news-backward", pooling="min"),
        ]
    )

    from flair.models import SequenceTagger

    tagger: SequenceTagger = SequenceTagger(
        hidden_size=256,
        embeddings=embeddings,
        tag_dictionary=tag_dictionary,
        tag_type=tag_type,
    )

    from flair.trainers import ModelTrainer

    corpus = Corpus(train=corpus.train, dev=corpus.dev, test=[])
    trainer: ModelTrainer = ModelTrainer(tagger, corpus)
    # trainer.train("resources/taggers/example-ner", train_with_dev=True, max_epochs=150)  # original
    trainer.train("flair_checkpoints", train_with_dev=False, max_epochs=40,
                  save_final_model=False)
    return tagger

def test_train_load_use_tagger_multicorpus(results_base_path, tasks_base_path):
    corpus = NLPTaskDataFetcher.load_corpora([NLPTask.FASHION, NLPTask.GERMEVAL],
                                             base_path=tasks_base_path)
    tag_dictionary = corpus.make_tag_dictionary('ner')

    embeddings = WordEmbeddings('glove')

    tagger: SequenceTagger = SequenceTagger(hidden_size=256,
                                            embeddings=embeddings,
                                            tag_dictionary=tag_dictionary,
                                            tag_type='ner',
                                            use_crf=False)

    # initialize trainer
    trainer: ModelTrainer = ModelTrainer(tagger, corpus)
    trainer.train(results_base_path, learning_rate=0.1, mini_batch_size=2,
                  max_epochs=2, test_mode=True)

    loaded_model: SequenceTagger = SequenceTagger.load_from_file(
        results_base_path / 'final-model.pt')

    sentence = Sentence('I love Berlin')
    sentence_empty = Sentence(' ')

    loaded_model.predict(sentence)
    loaded_model.predict([sentence, sentence_empty])
    loaded_model.predict([sentence_empty])

    # clean up results directory
    shutil.rmtree(results_base_path)

def train_por_tagger():
    corpus = flair.datasets.UD_PORTUGUESE()
    tag_type = 'upos'
    tag_dictionary = corpus.make_tag_dictionary(tag_type=tag_type)
    print(tag_dictionary)

    embedding_types = [WordEmbeddings('pt')]
    # embedding_types = [FastTextEmbeddings('/home/danielly/workspace/trained_pos_models/pt/pt.bin')]
    embeddings: StackedEmbeddings = StackedEmbeddings(embeddings=embedding_types)

    tagger: SequenceTagger = SequenceTagger(hidden_size=256,
                                            embeddings=embeddings,
                                            tag_dictionary=tag_dictionary,
                                            tag_type=tag_type,
                                            use_crf=True)

    trainer: ModelTrainer = ModelTrainer(tagger, corpus)
    trainer.train('/home/danielly/workspace',
                  learning_rate=0.1,
                  mini_batch_size=32,
                  max_epochs=150)

def test_train_load_use_tagger_large(results_base_path, tasks_base_path):
    corpus = NLPTaskDataFetcher.load_corpus(NLPTask.UD_ENGLISH).downsample(0.05)
    tag_dictionary = corpus.make_tag_dictionary('pos')

    embeddings = WordEmbeddings('glove')

    tagger: SequenceTagger = SequenceTagger(hidden_size=256,
                                            embeddings=embeddings,
                                            tag_dictionary=tag_dictionary,
                                            tag_type='pos',
                                            use_crf=False)

    # initialize trainer
    trainer: ModelTrainer = ModelTrainer(tagger, corpus)
    trainer.train(results_base_path, EvaluationMetric.MICRO_F1_SCORE,
                  learning_rate=0.1, mini_batch_size=32, max_epochs=2, test_mode=True)

    loaded_model: SequenceTagger = SequenceTagger.load_from_file(
        results_base_path / 'final-model.pt')

    sentence = Sentence('I love Berlin')
    sentence_empty = Sentence(' ')

    loaded_model.predict(sentence)
    loaded_model.predict([sentence, sentence_empty])
    loaded_model.predict([sentence_empty])

    # clean up results directory
    shutil.rmtree(results_base_path)

def test_train_resume_sequence_tagging_training(results_base_path, tasks_base_path):
    corpus_1 = flair.datasets.ColumnCorpus(
        data_folder=tasks_base_path / 'fashion', column_format={0: 'text', 2: 'ner'}
    )
    corpus_2 = flair.datasets.GERMEVAL(base_path=tasks_base_path)
    corpus = MultiCorpus([corpus_1, corpus_2])
    tag_dictionary = corpus.make_tag_dictionary('ner')

    embeddings = WordEmbeddings('turian')

    model = SequenceTagger(
        hidden_size=64,
        embeddings=embeddings,
        tag_dictionary=tag_dictionary,
        tag_type='ner',
        use_crf=False,
    )

    trainer = ModelTrainer(model, corpus)
    trainer.train(results_base_path, max_epochs=2, shuffle=False, checkpoint=True)

    checkpoint = SequenceTagger.load_checkpoint(results_base_path / 'checkpoint.pt')
    trainer = ModelTrainer.load_from_checkpoint(checkpoint, corpus)
    trainer.train(results_base_path, max_epochs=2, shuffle=False, checkpoint=True)

    # clean up results directory
    shutil.rmtree(results_base_path)

def test_train_charlm_load_use_tagger(results_base_path, tasks_base_path):
    corpus = NLPTaskDataFetcher.load_corpus(NLPTask.FASHION, base_path=tasks_base_path)
    tag_dictionary = corpus.make_tag_dictionary('ner')

    embeddings = FlairEmbeddings('news-forward-fast')

    tagger: SequenceTagger = SequenceTagger(hidden_size=256,
                                            embeddings=embeddings,
                                            tag_dictionary=tag_dictionary,
                                            tag_type='ner',
                                            use_crf=False)

    # initialize trainer
    trainer: ModelTrainer = ModelTrainer(tagger, corpus)
    trainer.train(results_base_path, EvaluationMetric.MICRO_F1_SCORE,
                  learning_rate=0.1, mini_batch_size=2, max_epochs=2, test_mode=True)

    loaded_model: SequenceTagger = SequenceTagger.load_from_file(
        results_base_path / 'final-model.pt')

    sentence = Sentence('I love Berlin')
    sentence_empty = Sentence(' ')

    loaded_model.predict(sentence)
    loaded_model.predict([sentence, sentence_empty])
    loaded_model.predict([sentence_empty])

    # clean up results directory
    shutil.rmtree(results_base_path)

def main(config, name, args):
    from flair.trainers import ModelTrainer
    from flair.visual.training_curves import Plotter
    from math import ceil
    from torch.optim import Adam
    from torch import manual_seed
    from pickle import load
    from discodop.lexgrammar import SupertagGrammar

    cp = corpusparam(**config["Corpus"], **config["Grammar"])
    corpus = SupertagParseCorpus(cp.filename)
    grammar = load(open(f"{cp.filename}.grammar", "rb"))

    tc = FindlrParameters(**config["Training"], **config["Eval-common"],
                          **config["Eval-Development"], language=cp.language)
    model = Supertagger.from_corpus(corpus, grammar, tc)
    model.set_eval_param(tc)

    if args.downsample:
        corpus = corpus.downsample(args.downsample)

    if args.iterations is None:
        epoch = ceil(len(corpus.train) / tc.batchsize)
        args.iterations = epoch * 5

    trainer = ModelTrainer(model, corpus)
    learning_rate_tsv = trainer.find_learning_rate(
        name,
        start_learning_rate=args.min_lr,
        end_learning_rate=args.max_lr,
        iterations=args.iterations)

    plotter = Plotter()
    plotter.plot_learning_rate(learning_rate_tsv)

def _train(self, corpus: Corpus, params: dict, base_path: Path, max_epochs: int,
           optimization_value: str):
    label_dict = corpus.make_label_dictionary()

    for sent in corpus.get_all_sentences():
        sent.clear_embeddings()

    model = self._set_up_model(params, label_dict)

    training_parameters = {
        key: params[key] for key, value in params.items() if key in TRAINING_PARAMETERS
    }
    model_trainer_parameters = {
        key: params[key]
        for key, value in params.items()
        if key in MODEL_TRAINER_PARAMETERS and key != 'model'
    }

    trainer: ModelTrainer = ModelTrainer(model, corpus, **model_trainer_parameters)

    path = base_path
    results = trainer.train(path,
                            max_epochs=max_epochs,
                            param_selection_mode=True,
                            **training_parameters)

    if optimization_value == "score":
        result = results['test_score']
    else:
        result = results['dev_loss_history'][-1]

    return {'result': result, 'params': params}

def test_train_resume_text_classification_training(results_base_path, tasks_base_path): corpus = flair.datasets.ClassificationCorpus(tasks_base_path / "imdb") label_dict = corpus.make_label_dictionary() embeddings: TokenEmbeddings = FlairEmbeddings("news-forward-fast", use_cache=False) document_embeddings: DocumentRNNEmbeddings = DocumentRNNEmbeddings( [embeddings], 128, 1, False) model = TextClassifier(document_embeddings, label_dict, False) trainer = ModelTrainer(model, corpus) trainer.train(results_base_path, max_epochs=2, shuffle=False, checkpoint=True) checkpoint = TextClassifier.load_checkpoint(results_base_path / "checkpoint.pt") trainer = ModelTrainer.load_from_checkpoint(checkpoint, corpus) trainer.train(results_base_path, max_epochs=2, shuffle=False, checkpoint=True) # clean up results directory shutil.rmtree(results_base_path)
def train(self, learning_rate: float = 0.1, mini_batch_size: int = 16,
          anneal_factor: float = 0.5, patience: int = 5, max_epochs: int = 10):
    """Build the classification corpus via `make_corpus` and train a TextClassifier,
    saving the model artifacts to `self.model_path`."""
    self.make_corpus()

    corpus = ClassificationCorpus(self.output_data_path,
                                  train_file='train.txt',
                                  dev_file='dev.txt',
                                  test_file='test.txt')
    label_dictionary = corpus.make_label_dictionary()

    embeddings = [WordEmbeddings('glove')]
    document_pool = DocumentPoolEmbeddings(embeddings)

    classifier = TextClassifier(document_pool, label_dictionary=label_dictionary)

    trainer = ModelTrainer(classifier, corpus)
    trainer.train(
        self.model_path,
        learning_rate=learning_rate,
        mini_batch_size=mini_batch_size,
        anneal_factor=anneal_factor,
        patience=patience,
        max_epochs=max_epochs,
    )

def test_train_load_use_tagger_large(results_base_path, tasks_base_path):
    corpus = flair.datasets.UD_ENGLISH().downsample(0.05)
    tag_dictionary = corpus.make_label_dictionary("pos")

    tagger: SequenceTagger = SequenceTagger(
        hidden_size=64,
        embeddings=turian_embeddings,
        tag_dictionary=tag_dictionary,
        tag_type="pos",
        use_crf=False,
    )

    # initialize trainer
    trainer: ModelTrainer = ModelTrainer(tagger, corpus)
    trainer.train(
        results_base_path,
        learning_rate=0.1,
        mini_batch_size=32,
        max_epochs=2,
        shuffle=False,
    )

    del trainer, tagger, tag_dictionary, corpus
    loaded_model: SequenceTagger = SequenceTagger.load(results_base_path / "final-model.pt")

    sentence = Sentence("I love Berlin")
    sentence_empty = Sentence(" ")

    loaded_model.predict(sentence)
    loaded_model.predict([sentence, sentence_empty])
    loaded_model.predict([sentence_empty])

    del loaded_model

def main(data_folder: str, model_folder: str, dev_size: float, nb_epochs: int) -> None:
    nlp = spacy.blank('fr')
    nlp.tokenizer = get_tokenizer(nlp)

    corpus: Corpus = prepare_flair_train_test_corpus(spacy_model=nlp,
                                                     data_folder=data_folder,
                                                     dev_size=dev_size)
    tag_dictionary = corpus.make_tag_dictionary(tag_type='ner')
    print(tag_dictionary.idx2item)

    embedding_types: List[TokenEmbeddings] = [
        WordEmbeddings('fr'),
        FlairEmbeddings('fr-forward'),
        FlairEmbeddings('fr-backward'),
    ]
    embeddings: StackedEmbeddings = StackedEmbeddings(embeddings=embedding_types)

    tagger: SequenceTagger = SequenceTagger(hidden_size=256,
                                            embeddings=embeddings,
                                            use_crf=True,
                                            tag_dictionary=tag_dictionary,
                                            tag_type='ner')

    trainer: ModelTrainer = ModelTrainer(tagger, corpus)
    trainer.train(model_folder,
                  max_epochs=nb_epochs,
                  mini_batch_size=32,
                  embeddings_storage_mode="cpu",
                  checkpoint=False)

def test_train_charlm_load_use_classifier(results_base_path, tasks_base_path):
    corpus = flair.datasets.ClassificationCorpus(tasks_base_path / "imdb")
    label_dict = corpus.make_label_dictionary()

    embedding: TokenEmbeddings = FlairEmbeddings("news-forward-fast")
    document_embeddings: DocumentRNNEmbeddings = DocumentRNNEmbeddings(
        [embedding], 128, 1, False, 64, False, False
    )

    model = TextClassifier(document_embeddings, label_dict, False)

    trainer = ModelTrainer(model, corpus)
    trainer.train(
        results_base_path, EvaluationMetric.MACRO_F1_SCORE, max_epochs=2, shuffle=False
    )

    sentence = Sentence("Berlin is a really nice city.")

    for s in model.predict(sentence):
        for l in s.labels:
            assert l.value is not None
            assert 0.0 <= l.score <= 1.0
            assert type(l.score) is float

    loaded_model = TextClassifier.load(results_base_path / "final-model.pt")

    sentence = Sentence("I love Berlin")
    sentence_empty = Sentence(" ")

    loaded_model.predict(sentence)
    loaded_model.predict([sentence, sentence_empty])
    loaded_model.predict([sentence_empty])

    # clean up results directory
    shutil.rmtree(results_base_path)

def test_train_tars(tasks_base_path):
    # test corpus
    corpus = ClassificationCorpus(tasks_base_path / "imdb_underscore")

    # create a TARS classifier
    tars = TARSClassifier(embeddings="sshleifer/tiny-distilroberta-base")

    # switch to a new task (TARS can do multiple tasks so you must define one)
    tars.add_and_switch_to_new_task(
        task_name="question 2_CLASS",
        label_dictionary=corpus.make_label_dictionary(label_type='class'),
        label_type='class',
    )

    # initialize the text classifier trainer
    trainer = ModelTrainer(tars, corpus)

    # start the training
    trainer.train(
        base_path='resources/taggers/trec',  # path to store the model artifacts
        learning_rate=0.02,  # use very small learning rate
        mini_batch_size=1,
        # mini_batch_chunk_size=4,  # optionally set this if the transformer is too much for your machine
        max_epochs=1,  # terminate after 1 epoch
    )

    sentence = Sentence("This is great!")
    tars.predict(sentence)
