def test_train_load_use_classifier_flair(results_base_path, tasks_base_path):
    corpus = flair.datasets.ClassificationCorpus(tasks_base_path / "imdb")
    label_dict = corpus.make_label_dictionary()

    flair_document_embeddings: DocumentRNNEmbeddings = DocumentRNNEmbeddings(
        [flair_embeddings], 128, 1, False, 64, False, False
    )
    model: TextClassifier = TextClassifier(
        flair_document_embeddings, label_dict, multi_label=False
    )

    trainer = ModelTrainer(model, corpus)
    trainer.train(results_base_path, max_epochs=2, shuffle=False)

    sentence = Sentence("Berlin is a really nice city.")
    for s in model.predict(sentence):
        for l in s.labels:
            assert l.value is not None
            assert 0.0 <= l.score <= 1.0
            assert type(l.score) is float

    del trainer, model, corpus, flair_document_embeddings

    loaded_model = TextClassifier.load(results_base_path / "final-model.pt")
    sentence = Sentence("I love Berlin")
    sentence_empty = Sentence(" ")
    loaded_model.predict(sentence)
    loaded_model.predict([sentence, sentence_empty])
    loaded_model.predict([sentence_empty])

    # clean up results directory
    shutil.rmtree(results_base_path)
    del loaded_model

def train_model(data_dir, max_epochs):
    st.write('Creating word corpus for training...')
    corpus = ClassificationCorpus(data_dir)
    label_dict = corpus.make_label_dictionary()
    st.write('Done')

    st.write('Load and create Embeddings for text data...')
    word_embeddings = [
        WordEmbeddings('glove'),
        # FlairEmbeddings('news-forward'),
        # FlairEmbeddings('news-backward')
    ]
    document_embeddings = DocumentRNNEmbeddings(word_embeddings,
                                                hidden_size=512,
                                                reproject_words=True,
                                                reproject_words_dimension=256)
    st.write('Done')

    st.write('Preparing')
    classifier = TextClassifier(document_embeddings, label_dictionary=label_dict)

    trainer = ModelTrainer(classifier, corpus)
    trainer.train('model-saves',
                  learning_rate=0.1,
                  mini_batch_size=32,
                  anneal_factor=0.5,
                  patience=8,
                  max_epochs=max_epochs,
                  checkpoint=True)
    st.write('Model Training Finished!')

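# Hedged usage sketch (added for illustration; not part of the original source).
# After train_model() completes, Flair writes final-model.pt (and best-model.pt
# when a dev split exists) into 'model-saves'. The example text is an assumption.
def predict_with_trained_model(text):
    from flair.data import Sentence
    from flair.models import TextClassifier

    classifier = TextClassifier.load('model-saves/final-model.pt')
    sentence = Sentence(text)
    classifier.predict(sentence)
    return sentence.labels
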
def test_train_resume_text_classification_training(results_base_path, tasks_base_path):
    corpus = flair.datasets.ClassificationCorpus(tasks_base_path / "imdb")
    label_dict = corpus.make_label_dictionary()

    embeddings: TokenEmbeddings = FlairEmbeddings("news-forward-fast")
    document_embeddings: DocumentRNNEmbeddings = DocumentRNNEmbeddings(
        [embeddings], 128, 1, False
    )
    model = TextClassifier(document_embeddings, label_dict, False)

    trainer = ModelTrainer(model, corpus)
    trainer.train(results_base_path, max_epochs=2, shuffle=False, checkpoint=True)

    trainer = ModelTrainer.load_checkpoint(results_base_path / "checkpoint.pt", corpus)
    trainer.train(results_base_path, max_epochs=2, shuffle=False, checkpoint=True)

    # clean up results directory
    shutil.rmtree(results_base_path)

def test_train_load_use_classifier_multi_label(results_base_path, tasks_base_path):
    corpus = flair.datasets.ClassificationCorpus(tasks_base_path / "multi_class")
    label_dict = corpus.make_label_dictionary()

    word_embedding: WordEmbeddings = WordEmbeddings("turian")
    document_embeddings = DocumentRNNEmbeddings(
        embeddings=[word_embedding],
        hidden_size=32,
        reproject_words=False,
        bidirectional=False,
    )
    model: TextClassifier = TextClassifier(document_embeddings, label_dict, multi_label=True)

    trainer = ModelTrainer(model, corpus)
    trainer.train(
        results_base_path,
        mini_batch_size=1,
        max_epochs=100,
        shuffle=False,
        checkpoint=False,
    )

    sentence = Sentence("apple tv")
    for s in model.predict(sentence):
        for l in s.labels:
            print(l)
            assert l.value is not None
            assert 0.0 <= l.score <= 1.0
            assert type(l.score) is float

    sentence = Sentence("apple tv")
    for s in model.predict(sentence):
        assert "apple" in sentence.get_label_names()
        assert "tv" in sentence.get_label_names()
        for l in s.labels:
            print(l)
            assert l.value is not None
            assert 0.0 <= l.score <= 1.0
            assert type(l.score) is float

    loaded_model = TextClassifier.load(results_base_path / "final-model.pt")
    sentence = Sentence("I love Berlin")
    sentence_empty = Sentence(" ")
    loaded_model.predict(sentence)
    loaded_model.predict([sentence, sentence_empty])
    loaded_model.predict([sentence_empty])

    # clean up results directory
    shutil.rmtree(results_base_path)

def test_train_classifier_with_sampler(results_base_path, tasks_base_path):
    corpus = flair.datasets.ClassificationCorpus(tasks_base_path / "imdb")
    label_dict = corpus.make_label_dictionary()

    word_embedding: WordEmbeddings = WordEmbeddings("turian")
    document_embeddings: DocumentRNNEmbeddings = DocumentRNNEmbeddings(
        [word_embedding], 32, 1, False, 64, False, False
    )
    model: TextClassifier = TextClassifier(document_embeddings, label_dict, False)

    trainer = ModelTrainer(model, corpus)
    trainer.train(
        results_base_path,
        max_epochs=2,
        shuffle=False,
        sampler=ImbalancedClassificationDatasetSampler,
    )

    sentence = Sentence("Berlin is a really nice city.")
    for s in model.predict(sentence):
        for l in s.labels:
            assert l.value is not None
            assert 0.0 <= l.score <= 1.0
            assert type(l.score) is float

    loaded_model = TextClassifier.load(results_base_path / "final-model.pt")

    # clean up results directory
    shutil.rmtree(results_base_path)

def test_train_load_use_classifier_multi_label(results_base_path, tasks_base_path):
    corpus = flair.datasets.ClassificationCorpus(
        tasks_base_path / "multi_class", label_type="topic"
    )
    label_dict = corpus.make_label_dictionary(label_type="topic")

    model: TextClassifier = TextClassifier(
        document_embeddings=document_embeddings,
        label_dictionary=label_dict,
        label_type="topic",
        multi_label=True,
    )

    trainer = ModelTrainer(model, corpus)
    trainer.train(
        results_base_path,
        mini_batch_size=1,
        max_epochs=100,
        shuffle=False,
        checkpoint=False,
        train_with_test=True,
        train_with_dev=True,
    )

    sentence = Sentence("apple tv")
    model.predict(sentence)
    for label in sentence.labels:
        print(label)
        assert label.value is not None
        assert 0.0 <= label.score <= 1.0
        assert type(label.score) is float

    sentence = Sentence("apple tv")
    model.predict(sentence)
    assert "apple" in sentence.get_label_names()
    assert "tv" in sentence.get_label_names()
    for label in sentence.labels:
        assert label.value is not None
        assert 0.0 <= label.score <= 1.0
        assert type(label.score) is float

    del trainer, model, corpus

    loaded_model = TextClassifier.load(results_base_path / "final-model.pt")
    sentence = Sentence("I love Berlin")
    sentence_empty = Sentence(" ")
    loaded_model.predict(sentence)
    loaded_model.predict([sentence, sentence_empty])
    loaded_model.predict([sentence_empty])

    # clean up results directory
    shutil.rmtree(results_base_path)
    del loaded_model

def test_train_load_use_classifier_multi_label(results_base_path, tasks_base_path):
    # corpus = NLPTaskDataFetcher.load_corpus('multi_class', base_path=tasks_base_path)
    corpus = NLPTaskDataFetcher.load_classification_corpus(
        data_folder=tasks_base_path / 'multi_class')
    label_dict = corpus.make_label_dictionary()

    glove_embedding: WordEmbeddings = WordEmbeddings('en-glove')
    document_embeddings = DocumentLSTMEmbeddings(embeddings=[glove_embedding],
                                                 hidden_size=32,
                                                 reproject_words=False,
                                                 bidirectional=False)
    model = TextClassifier(document_embeddings, label_dict, multi_label=True)

    trainer = ModelTrainer(model, corpus)
    trainer.train(results_base_path,
                  EvaluationMetric.MICRO_F1_SCORE,
                  max_epochs=100,
                  test_mode=True,
                  checkpoint=False)

    sentence = Sentence('apple tv')
    for s in model.predict(sentence):
        for l in s.labels:
            print(l)
            assert l.value is not None
            assert 0.0 <= l.score <= 1.0
            assert type(l.score) is float

    sentence = Sentence("apple tv")
    for s in model.predict(sentence):
        assert 'apple' in sentence.get_label_names()
        assert 'tv' in sentence.get_label_names()
        for l in s.labels:
            print(l)
            assert l.value is not None
            assert 0.0 <= l.score <= 1.0
            assert type(l.score) is float

    loaded_model = TextClassifier.load_from_file(results_base_path / 'final-model.pt')
    sentence = Sentence('I love Berlin')
    sentence_empty = Sentence(' ')
    loaded_model.predict(sentence)
    loaded_model.predict([sentence, sentence_empty])
    loaded_model.predict([sentence_empty])

    # clean up results directory
    shutil.rmtree(results_base_path)

def main(args, model_dir):
    logger.info('Args = {}'.format(args))
    corpus = CorpusLoader().load_corpus(CORPUS_PATH[args.corpus])
    tokenizer = TokenizerFactory().tokenizer(args.corpus)
    logger.info('Loaded corpus: {}'.format(corpus))

    logger.info('Get sentences...')
    train_sents, _ = flair_utils.standoff_to_flair_sents(corpus.train, tokenizer, verbose=True)
    dev_sents, _ = flair_utils.standoff_to_flair_sents(corpus.dev, tokenizer, verbose=True)
    test_sents, test_docs = flair_utils.standoff_to_flair_sents(corpus.test, tokenizer, verbose=True)

    train_sents = train_sents + dev_sents
    train_sents_filtered = list(filter(lambda sent: not _ignore_sentence(sent), train_sents))

    sample_size = int(len(train_sents_filtered) * args.train_sample_frac)
    rs = RandomState(seed=args.random_seed)
    train_sents_sample = rs.choice(train_sents_filtered, replace=False, size=sample_size).tolist()
    logger.info('Train with fraction of training data: {} sents out of {} sentences ({}%)',
                sample_size, len(train_sents_filtered), args.train_sample_frac)

    # We need to pass some dev data, otherwise flair raises a ZeroDivisionError
    # See: https://github.com/zalandoresearch/flair/issues/1139
    # We just split the training sample into half and instruct Flair to train_with_dev (see below).
    half = len(train_sents_sample) // 2
    flair_corpus = flair_utils.FilteredCorpus(train=train_sents_sample[:half],
                                              dev=train_sents_sample[half:],
                                              test=test_sents,
                                              ignore_sentence=_ignore_sentence)
    logger.info(flair_corpus)

    logger.info('Train model...')
    tagger = run_bilstmcrf.get_model(flair_corpus,
                                     corpus_name=args.corpus,
                                     embedding_lang=args.embedding_lang,
                                     pooled_contextual_embeddings=True)

    trainer = ModelTrainer(tagger, flair_corpus)
    trainer.train(join(model_dir, 'flair'),
                  max_epochs=150,
                  monitor_train=False,
                  train_with_dev=True,
                  save_final_model=args.save_final_model)

    logger.info('Make predictions...')
    run_bilstmcrf.make_predictions(tagger, flair_corpus)

    logger.info('Start evaluation...')
    evaluator = Evaluator(gold=corpus.test,
                          predicted=flair_utils.flair_sents_to_standoff(test_sents, test_docs))
    entity_level_metric = evaluator.entity_level()
    logger.info('\n{}', entity_level_metric)

    entity_level_metric.to_csv(join(model_dir, 'scores_entity.csv'))
    evaluator.token_level().to_csv(join(model_dir, 'scores_token.csv'))
    evaluator.token_level_blind().to_csv(join(model_dir, 'scores_token_blind.csv'))
    logger.info('Done.')

def test_text_classifier_multi(results_base_path, tasks_base_path):
    flair.set_seed(123)

    corpus = ClassificationCorpus(
        tasks_base_path / "trivial" / "trivial_text_classification_multi",
        label_type="city",
    )
    label_dict = corpus.make_label_dictionary(label_type="city")

    model: TextClassifier = TextClassifier(
        document_embeddings=DocumentPoolEmbeddings([turian_embeddings], fine_tune_mode="linear"),
        label_dictionary=label_dict,
        label_type="city",
        multi_label=True,
    )

    trainer = ModelTrainer(model, corpus)
    trainer.train(results_base_path, mini_batch_size=2, max_epochs=50, shuffle=True)

    # check if model can predict
    sentence = Sentence("this is Berlin")
    sentence_empty = Sentence(" ")

    model.predict(sentence)
    model.predict([sentence, sentence_empty])
    model.predict([sentence_empty])

    # load model
    loaded_model = TextClassifier.load(results_base_path / "final-model.pt")

    # check if model predicts correct label
    sentence = Sentence("this is Berlin")
    sentence_double = Sentence("this is Berlin and pizza")

    loaded_model.predict([sentence, sentence_double])

    values = []
    for label in sentence_double.labels:
        assert label.value is not None
        assert 0.0 <= label.score <= 1.0
        assert type(label.score) is float
        values.append(label.value)

    assert "Berlin" in values
    assert "pizza" in values

    # check if loaded model successfully fit the training data
    result: Result = loaded_model.evaluate(corpus.test, gold_label_type="city")
    print(result.classification_report)
    assert result.classification_report["micro avg"]["f1-score"] == 1.0

    del loaded_model

def trainFlairClassifier(df, columns, trainNameCsv, testNameCsv, devNameCsv, classifierFileName):
    ids = df['id'].tolist()
    nSamples = len(ids)

    # 70/20/10 split into train/test/dev
    idx70 = int(nSamples * 0.7)
    idx90 = int(nSamples * 0.9)
    train_ids = ids[:idx70]
    test_ids = ids[idx70:idx90]
    dev_ids = ids[idx90:]

    with TemporaryDirectory() as temp_dir:
        # join with os.path.join so the path separator is not silently dropped
        trainCsv = os.path.join(temp_dir, trainNameCsv)
        testCsv = os.path.join(temp_dir, testNameCsv)
        devCsv = os.path.join(temp_dir, devNameCsv)

        df[df['id'].isin(train_ids)].to_csv(trainCsv, columns=columns, sep='\t', index=False, header=False)
        df[df['id'].isin(test_ids)].to_csv(testCsv, columns=columns, sep='\t', index=False, header=False)
        df[df['id'].isin(dev_ids)].to_csv(devCsv, columns=columns, sep='\t', index=False, header=False)

        corpus = NLPTaskDataFetcher.load_classification_corpus(
            temp_dir, train_file=trainCsv, test_file=testCsv, dev_file=devCsv)

        word_embeddings = [
            WordEmbeddings('glove'),
            FlairEmbeddings('news-forward-fast'),
            FlairEmbeddings('news-backward-fast')
        ]
        document_embeddings = DocumentLSTMEmbeddings(
            word_embeddings,
            hidden_size=512,
            reproject_words=True,
            reproject_words_dimension=256)

        classifier = TextClassifier(
            document_embeddings,
            label_dictionary=corpus.make_label_dictionary(),
            multi_label=False)

        trainer = ModelTrainer(classifier, corpus)
        trainer.train(temp_dir, max_epochs=50)

        classifier.save(classifierFileName)

def run_zero_shot(train_tweets, train_y, val_tweets, val_y):
    """
    Performs the training of the zero shot learning model

    @param train_tweets: the tweets that will be used for training
    @param train_y: the training labels
    @param val_tweets: the tweets that will be used for validation
    @param val_y: the validation labels
    @return: None
    """
    # 1. Load our pre-trained TARS model for English
    print("Zero shot")
    # download https://nlp.informatik.hu-berlin.de/resources/models/tars-base/tars-base.pt
    tars = TARSClassifier.load(
        os.path.join(os.path.dirname(__file__), "..", "..", "saved_models", "tars-base.pt"))

    # 2. build Flair sentence datasets for training and validation
    train_tweets["output"] = train_y.iloc[:]
    train = train_tweets.apply(create_sentences, axis=1).tolist()
    train = SentenceDataset(train)

    val_tweets["output"] = val_y.iloc[:]
    val = val_tweets.apply(create_sentences, axis=1).tolist()
    val = SentenceDataset(val)

    corpus = Corpus(train=train, test=val)

    # 3. make the model aware of the new task's labels
    tars.add_and_switch_to_new_task(
        "POSITIVE_NEGATIVE", label_dictionary=corpus.make_label_dictionary())

    trainer = ModelTrainer(tars, corpus)

    # 4. train model
    trainer.train(
        base_path='../../data/zero_shot',  # path to store the model artifacts
        learning_rate=0.02,  # use very small learning rate
        mini_batch_size=16,  # small mini-batch size since corpus is tiny
        max_epochs=10,  # terminate after 10 epochs
    )
    print("DONE TRAINING")

    # reload the final model from the same base_path it was trained into
    tars = TARSClassifier.load('../../data/zero_shot/final-model.pt')

    val_tweets["pred"] = val_tweets.apply(predict_few_shot, args=(tars,), axis=1)
    val_tweets["pred"] = val_tweets["pred"].apply(lambda x: 1 if x == "positive" else -1)

    pred = pd.DataFrame(list(val_tweets["pred"]), columns=['Prediction'])
    pred.index += 1
    pred.insert(0, 'Id', pred.index)
    pred.to_csv("../../predictions/zero_shot_pred.csv", index=False)

def main(args):
    args = parser.parse_args()

    # 1. get the corpus
    corpus: TaggedCorpus = NLPTaskDataFetcher.load_classification_corpus(
        args.data_dir[0],
        train_file='train.txt',
        dev_file='dev.txt',
        test_file='test.txt')

    # 2. create the label dictionary
    label_dict = corpus.make_label_dictionary()

    # 3. make a list of word embeddings
    word_embeddings = [
        WordEmbeddings('glove'),
        # comment in flair embeddings for state-of-the-art results
        # FlairEmbeddings('news-forward'),
        # FlairEmbeddings('news-backward'),
        # ELMoEmbeddings()
    ]

    # 4. init document embedding by passing list of word embeddings
    document_embeddings: DocumentLSTMEmbeddings = DocumentLSTMEmbeddings(
        word_embeddings,
        hidden_size=128,
        reproject_words=True,
        reproject_words_dimension=64,
    )

    # 5. create the text classifier
    classifier = TextClassifier(document_embeddings, label_dictionary=label_dict, multi_label=False)

    # 6. initialize the text classifier trainer
    trainer = ModelTrainer(classifier, corpus)

    # 7. start the training
    model_out = 'resources/classifiers/sentence-classification/glove'
    trainer.train(model_out,
                  learning_rate=0.1,
                  mini_batch_size=32,
                  anneal_factor=0.5,
                  patience=5,
                  max_epochs=100)

    # 8. plot training curves (optional)
    from flair.visual.training_curves import Plotter
    plotter = Plotter()
    plotter.plot_training_curves(join(model_out, 'loss.tsv'))
    plotter.plot_weights(join(model_out, 'weights.txt'))

def train(self):
    corpus = NLPTaskDataFetcher.load_classification_corpus(
        Path(self.corpus_path),
        test_file="test_clean_text.txt",
        dev_file="dev_clean_text.txt",
        train_file="train_clean_text.txt")

    embeddings = [WordEmbeddings(self.word_emb_path),
                  FlairEmbeddings('polish-forward'),
                  FlairEmbeddings('polish-backward')]
    document_embeddings = DocumentRNNEmbeddings(embeddings,
                                                hidden_size=self.hidden_size,
                                                bidirectional=True)

    classifier = TextClassifier(document_embeddings,
                                label_dictionary=corpus.make_label_dictionary(),
                                multi_label=False)

    trainer = ModelTrainer(classifier, corpus)
    trainer.train(self.model_path,
                  evaluation_metric=EvaluationMetric.MACRO_F1_SCORE,
                  max_epochs=self.epochs)

def fit(self, X, y):
    """ Build feature vectors and train FLAIR model.

        Parameters
        ----------
        X : list(list(str))
            list of sentences. Sentences are tokenized into list of words.
        y : list(list(str))
            list of list of BIO tags.

        Returns
        -------
        self
    """
    log.info("Creating FLAIR corpus...")
    Xtrain, Xval, ytrain, yval = train_test_split(X, y, test_size=0.1)
    sents_train = self._convert_to_flair(Xtrain, ytrain)
    sents_val = self._convert_to_flair(Xval, yval)
    corpus_train = Corpus(sents_train, sents_val, [], name="train-corpus")

    tag_dict = corpus_train.make_tag_dictionary(tag_type="ner")
    if self.embeddings is None:
        embedding_types = [
            WordEmbeddings("glove"),
            CharacterEmbeddings()
        ]
        self.embeddings = StackedEmbeddings(embeddings=embedding_types)

    log.info("Building FLAIR NER...")
    self.model_ = SequenceTagger(hidden_size=self.hidden_dim,
                                 embeddings=self.embeddings,
                                 tag_dictionary=tag_dict,
                                 tag_type="ner",
                                 use_crf=self.use_crf,
                                 use_rnn=self.use_rnn,
                                 rnn_layers=self.num_rnn_layers,
                                 dropout=self.dropout,
                                 word_dropout=self.word_dropout,
                                 locked_dropout=self.locked_dropout)

    log.info("Training FLAIR NER...")
    opt = torch.optim.SGD if self.optimizer == "sgd" else torch.optim.Adam
    trainer = ModelTrainer(self.model_, corpus_train, opt)
    trainer.train(base_path=self.basedir,
                  learning_rate=self.learning_rate,
                  mini_batch_size=self.batch_size,
                  max_epochs=self.max_iter)
    return self

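# Hedged usage sketch (illustrative; not from the original source): fit() expects
# parallel lists of tokenized sentences and BIO tag sequences, as the docstring
# above describes. The toy data and the wrapper name `FlairNERTagger` are
# assumptions introduced only for illustration.
X_toy = [["John", "lives", "in", "Berlin"], ["Mary", "works", "at", "Google"]]
y_toy = [["B-PER", "O", "O", "B-LOC"], ["B-PER", "O", "O", "B-ORG"]]
# tagger = FlairNERTagger(...)  # hypothetical constructor; configuration not shown here
# tagger.fit(X_toy, y_toy)
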
def train(self):
    from flair.data import Corpus
    from flair.datasets import SentenceDataset
    from flair.data import Sentence

    self.classes = utils.read_class_titles(settings.CAT_DEPTH)
    self.classes['NOCAT'] = 'NOCAT'

    train = SentenceDataset([
        Sentence(row['titlen']).add_label('law_topic', self.classes[row['cat1']])
        for i, row in self.df_train.iterrows()
    ])

    # make a corpus with train and test split
    self.corpus = Corpus(train=train, dev=train)

    # 1. load base TARS
    tars = self._load_pretained_model()

    # 2. make the model aware of the desired set of labels from the new corpus
    tars.add_and_switch_to_new_task(
        "LAW_TOPIC", label_dictionary=self.corpus.make_label_dictionary())

    # 3. initialize the text classifier trainer with your corpus
    from flair.trainers import ModelTrainer
    trainer = ModelTrainer(tars, self.corpus)

    # 4. train model
    path = settings.WORKING_DIR
    trainer.train(
        base_path=path,  # path to store the model artifacts
        learning_rate=5e-2,  # 5ep, 0.2 bad; 5ep with 0.1 looks ok.
        mini_batch_size=settings.MINIBATCH,
        # mini_batch_chunk_size=1,
        mini_batch_chunk_size=4,  # optionally set this if transformer is too much for your machine
        max_epochs=settings.EPOCHS,  # terminate after 10 epochs
        train_with_dev=False,
        save_final_model=False,
        param_selection_mode=True,  # True to avoid model saves
        shuffle=False,  # Already done
    )

    # from flair.models.text_classification_model import TARSClassifier
    # self.model = TARSClassifier.load(
    #     os.path.join(path, 'best-model.pt')
    # )
    self.model = tars

def classify(data, labels, test, train, validation):
    train_data = [k for k in data.keys() if k in train]
    train_labels = [labels[k] for k in train_data]
    train_data = [data[k] for k in train_data]

    test_data = [k for k in data.keys() if k in test]
    test_labels = [labels[k] for k in test_data]
    test_data = [data[k] for k in test_data]

    validation_data = [k for k in data.keys() if k in validation]
    validation_labels = [labels[k] for k in validation_data]
    validation_data = [data[k] for k in validation_data]

    save_training_files(train_data, train_labels, test_data, test_labels,
                        validation_data, validation_labels)

    corpus = NLPTaskDataFetcher.load_classification_corpus(
        Path('./'),
        test_file='test.txt',
        dev_file='dev.txt',
        train_file='train.txt')
    word_embeddings = [
        WordEmbeddings('pl'),
        FlairEmbeddings('polish-forward'),
        FlairEmbeddings('polish-backward')
    ]
    doc_embeddings = DocumentRNNEmbeddings(word_embeddings,
                                           hidden_size=512,
                                           reproject_words=True,
                                           reproject_words_dimension=256)
    classifier = TextClassifier(
        doc_embeddings,
        label_dictionary=corpus.make_label_dictionary(),
        multi_label=False)

    trainer = ModelTrainer(classifier, corpus)
    trainer.train('./', max_epochs=25)

    classifier = TextClassifier.load_from_file('./best-model.pt')

    validation_data = [Sentence(x) for x in validation_data]
    for x in validation_data:
        classifier.predict(x)
    predicted = [int(x.labels[0].value) for x in validation_data]

    remove_training_files()

    precision, recall, f1, _ = precision_recall_fscore_support(
        validation_labels, predicted, average='binary')
    # the first value returned by precision_recall_fscore_support is precision,
    # not accuracy, so it is reported under its proper name
    return {
        'precision': round(precision, 3),
        'recall': round(recall, 3),
        'f1': round(f1, 3)
    }

def fit(self, corpus: Corpus, model_path: str):
    self.model = TARSClassifier(
        task_name="ChemicalUnderstanding",
        label_dictionary=corpus.make_label_dictionary(),
    )

    trainer = ModelTrainer(self.model, corpus)
    trainer.train(
        base_path=model_path,
        learning_rate=0.02,
        mini_batch_size=16,
        mini_batch_chunk_size=4,
        max_epochs=10,
    )

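# Hedged usage sketch (illustrative; not part of the original source): reloading
# the TARS model that fit() stored under model_path and predicting on new text.
# The example sentence is an assumption.
def predict_after_fit(model_path: str, text: str):
    from flair.data import Sentence
    from flair.models import TARSClassifier

    tars = TARSClassifier.load(model_path + '/final-model.pt')
    sentence = Sentence(text)
    tars.predict(sentence)
    return sentence.labels
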
def train_model(self, corpus, classifier, step_num, optimizer_state=None, epoch=1, lr=1e-3):
    trainer = ModelTrainer(classifier, corpus, optimizer=AdamW, optimizer_state=optimizer_state)
    result = trainer.train(
        f'{self.experiment_name}/{step_num}/',
        learning_rate=lr,
        min_learning_rate=1e-8,
        mini_batch_size=32,
        anneal_factor=0.5,
        patience=5,
        max_epochs=epoch,
        embeddings_storage_mode=self.embeddings_storage_mode,
        weight_decay=1e-4,
    )
    # delete the saved models from three steps back to bound disk usage
    os.system(f'rm {self.experiment_name}/{step_num - 3}/best-model.pt')
    os.system(f'rm {self.experiment_name}/{step_num - 3}/final-model.pt')
    return classifier, result['optimizer_state_dict']

def train(self):
    tox_corpus = FlairTox21().to_corpus()

    self.model = TARSClassifier(
        task_name="Toxicity",
        label_dictionary=tox_corpus.make_label_dictionary(),
        document_embeddings="distilbert-base-uncased",
    )

    trainer = ModelTrainer(self.model, tox_corpus)
    trainer.train(
        base_path=get_path("model") / self.filename,
        learning_rate=0.02,
        mini_batch_size=1,
        max_epochs=10,
    )

def test_train_resume_text_classification_training(results_base_path, tasks_base_path):
    corpus = NLPTaskDataFetcher.load_corpus(NLPTask.IMDB, base_path=tasks_base_path)
    label_dict = corpus.make_label_dictionary()

    embeddings: TokenEmbeddings = FlairEmbeddings('news-forward-fast', use_cache=False)
    document_embeddings: DocumentLSTMEmbeddings = DocumentLSTMEmbeddings([embeddings], 128, 1, False)
    model = TextClassifier(document_embeddings, label_dict, False)

    trainer = ModelTrainer(model, corpus)
    trainer.train(results_base_path, max_epochs=2, test_mode=True, checkpoint=True)

    trainer = ModelTrainer.load_from_checkpoint(
        results_base_path / 'checkpoint.pt', 'TextClassifier', corpus)
    trainer.train(results_base_path, max_epochs=2, test_mode=True, checkpoint=True)

    # clean up results directory
    shutil.rmtree(results_base_path)

def train(self, train_data):
    flair_logger = logging.getLogger("flair")
    handler = RequestsHandler()
    flair_logger.addHandler(handler)
    # avoid shadowing the built-in filter()
    listen_filter = ListenFilter("filter", self.args)
    flair_logger.addFilter(listen_filter)

    trainer = ModelTrainer(self.model, self.corpus)
    trainer.train(self.model_base_path(),
                  learning_rate=0.1,
                  mini_batch_size=32,
                  max_epochs=self.args.iter if self.args.max_iter else 100,
                  train_with_dev=True,
                  monitor_test=True,
                  embeddings_storage_mode="gpu")

def train_ner(device_category):
    """ Training the sequence labeling model """
    columns = {0: 'text', 1: 'ner'}
    training_file = os.path.join(
        root_path, 'part_extraction/data/{}.conll'.format(device_category))
    data_folder = os.path.join(root_path, 'part_extraction/data')
    corpus = ColumnCorpus(data_folder, columns, train_file=training_file)
    print(len(corpus.train))

    tag_type = 'ner'
    tag_dictionary = corpus.make_tag_dictionary(tag_type=tag_type)
    print(tag_dictionary.idx2item)

    embedding_types = [
        WordEmbeddings('glove'),
        # comment in this line to use character embeddings
        # CharacterEmbeddings(),
        FlairEmbeddings('news-forward'),
        FlairEmbeddings('news-backward'),
    ]
    embeddings = StackedEmbeddings(embeddings=embedding_types)

    tagger = SequenceTagger(hidden_size=256,
                            embeddings=embeddings,
                            tag_dictionary=tag_dictionary,
                            tag_type=tag_type,
                            use_crf=True)

    trainer = ModelTrainer(tagger, corpus)

    # 7. start training
    trainer.train(ner_models,
                  learning_rate=0.1,
                  mini_batch_size=32,
                  max_epochs=150)

    trainer.model.save('{}/{}.pt'.format(ner_models, device_category))

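# Hedged usage sketch (illustrative; not part of the original source): loading the
# tagger saved by train_ner() and pulling out tagged spans. `ner_models` is the
# same module-level directory used above; the example text is an assumption.
def extract_entities(device_category, text):
    from flair.data import Sentence
    from flair.models import SequenceTagger

    tagger = SequenceTagger.load('{}/{}.pt'.format(ner_models, device_category))
    sentence = Sentence(text)
    tagger.predict(sentence)
    return [(span.text, span.tag) for span in sentence.get_spans('ner')]
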
def handle(self, *args, **options):
    file = options.get('file') or 'annotated_sentences'
    model_folder = options.get('model_folder') or 'model-var'

    columns = {0: 'text', 1: 'var'}
    data_folder = 'data/txt'
    corpus = ColumnCorpus(data_folder, columns, train_file=f'{file}.txt')

    tag_type = 'var'
    tag_dictionary = corpus.make_tag_dictionary(tag_type=tag_type)

    embedding_types = [
        WordEmbeddings('glove'),
        # comment in this line to use character embeddings
        # CharacterEmbeddings(),
        # comment in these lines to use flair embeddings
        # FlairEmbeddings('news-forward'),
        # FlairEmbeddings('news-backward'),
        TransformerWordEmbeddings('bert-base-uncased'),
    ]
    embeddings = StackedEmbeddings(embeddings=embedding_types)

    tagger = SequenceTagger(hidden_size=256,
                            embeddings=embeddings,
                            tag_dictionary=tag_dictionary,
                            tag_type=tag_type,
                            use_crf=True)

    trainer = ModelTrainer(tagger, corpus)
    trainer.train(f'data/models/taggers/{model_folder}',
                  learning_rate=0.1,
                  mini_batch_size=32,
                  max_epochs=150)

    self.stdout.write(self.style.SUCCESS('Successfully trained model on dataset file.'))

def start(self) -> None:
    self.stacked_embeddings = self._get_stacked_embeddings()

    description = self.experiment.description.replace(" ", "_")
    batch_size = self.experiment.batch_size
    max_epochs = self.experiment.max_epochs
    embeddings_storage_mode = self.experiment.embeddings_storage_mode
    train_with_dev = self.experiment.train_with_dev

    tagger, corpus = self._get_sequence_tagger()
    trainer = ModelTrainer(tagger, corpus)
    trainer.train(
        f"resources/taggers/experiment_{description}_{self.number}",
        learning_rate=0.1,
        mini_batch_size=batch_size,
        max_epochs=max_epochs,
        embeddings_storage_mode=embeddings_storage_mode,
        train_with_dev=train_with_dev,
    )

def test_train_resume_sequence_tagging_training(results_base_path, tasks_base_path):
    corpus = NLPTaskDataFetcher.load_corpora([NLPTask.FASHION, NLPTask.GERMEVAL],
                                             base_path=tasks_base_path)
    tag_dictionary = corpus.make_tag_dictionary('ner')

    embeddings = WordEmbeddings('glove')
    model: SequenceTagger = SequenceTagger(hidden_size=256,
                                           embeddings=embeddings,
                                           tag_dictionary=tag_dictionary,
                                           tag_type='ner',
                                           use_crf=False)

    trainer = ModelTrainer(model, corpus)
    trainer.train(results_base_path, max_epochs=2, test_mode=True, checkpoint=True)

    trainer = ModelTrainer.load_from_checkpoint(
        results_base_path / 'checkpoint.pt', 'SequenceTagger', corpus)
    trainer.train(results_base_path, max_epochs=2, test_mode=True, checkpoint=True)

    # clean up results directory
    shutil.rmtree(results_base_path)

def test_train_classifier_with_sampler(results_base_path, tasks_base_path):
    corpus = flair.datasets.ClassificationCorpus(tasks_base_path / 'imdb')
    label_dict = corpus.make_label_dictionary()

    word_embedding = WordEmbeddings('turian')
    document_embeddings = DocumentRNNEmbeddings([word_embedding], 32, 1, False, 64, False, False)
    model = TextClassifier(document_embeddings, label_dict, False)

    trainer = ModelTrainer(model, corpus)
    trainer.train(results_base_path,
                  max_epochs=2,
                  shuffle=False,
                  sampler=ImbalancedClassificationDatasetSampler)

    sentence = Sentence('Berlin is a really nice city.')
    for s in model.predict(sentence):
        for l in s.labels:
            assert l.value is not None
            assert 0.0 <= l.score <= 1.0
            assert type(l.score) is float

    loaded_model = TextClassifier.load(results_base_path / 'final-model.pt')

    # clean up results directory
    shutil.rmtree(results_base_path)

def test_train_resume_text_classification_training(results_base_path, tasks_base_path):
    corpus = flair.datasets.ClassificationCorpus(tasks_base_path / 'imdb')
    label_dict = corpus.make_label_dictionary()

    embeddings = FlairEmbeddings('news-forward-fast')
    document_embeddings = DocumentRNNEmbeddings([embeddings], 128, 1, False)
    model = TextClassifier(document_embeddings, label_dict, False)

    trainer = ModelTrainer(model, corpus)
    trainer.train(results_base_path, max_epochs=2, shuffle=False, checkpoint=True)

    checkpoint = TextClassifier.load_checkpoint(results_base_path / 'checkpoint.pt')
    trainer = ModelTrainer.load_from_checkpoint(checkpoint, corpus)
    trainer.train(results_base_path, max_epochs=2, shuffle=False, checkpoint=True)

    # clean up results directory
    shutil.rmtree(results_base_path)

def fine_tune(self):
    if isinstance(self.document_embedding, TransformerDocumentEmbeddings):
        corpus = TREC_6()
        label_dict = corpus.make_label_dictionary()
        classifier = TextClassifier(self.document_embedding, label_dictionary=label_dict)
        trainer = ModelTrainer(classifier, corpus, optimizer=Adam)

        # 6. start the training
        trainer.train(
            'resources/taggers/trec',
            learning_rate=3e-5,  # use very small learning rate
            mini_batch_size=16,
            mini_batch_chunk_size=4,  # optionally set this if transformer is too much for your machine
            max_epochs=5,  # terminate after 5 epochs
        )
    else:
        raise UserWarning("No fine tuning for this embedding type implemented")

def train_sentiment_model(rootdir, train, dev, test, num_epochs, device, outputdir):
    flair.device = torch.device(device)

    corpus = ClassificationCorpus(rootdir,
                                  train_file=train,
                                  dev_file=dev,
                                  test_file=test,
                                  in_memory=False)
    label_dict = corpus.make_label_dictionary()

    # init Flair embeddings
    flair_forward_embedding = FlairEmbeddings('multi-forward')
    flair_backward_embedding = FlairEmbeddings('multi-backward')
    optional_embedding = ELMoEmbeddings('original')

    word_embeddings = list(filter(None, [
        optional_embedding,
        FlairEmbeddings('news-forward'),
        FlairEmbeddings('news-backward'),
    ]))

    # Initialize document embedding by passing list of word embeddings
    #
    # Note this will kick off model generation that will take a long time (several hours)
    # This will produce final-model.pt and best-model.pt files which represent a stored trained model.
    document_embeddings = DocumentRNNEmbeddings(
        word_embeddings,
        hidden_size=512,
        reproject_words=True,
        reproject_words_dimension=256,
    )

    classifier = TextClassifier(document_embeddings,
                                label_dictionary=label_dict,
                                multi_label=False)

    trainer = ModelTrainer(classifier, corpus)
    trainer.train(outputdir, max_epochs=num_epochs)

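# Hedged usage sketch (illustrative; not part of the original source): scoring new
# text with the classifier that train_sentiment_model() wrote to outputdir. The
# example text is an assumption.
def predict_sentiment(outputdir, text):
    from flair.data import Sentence
    from flair.models import TextClassifier

    classifier = TextClassifier.load(outputdir + '/final-model.pt')
    sentence = Sentence(text)
    classifier.predict(sentence)
    label = sentence.labels[0]
    return label.value, label.score
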
def test_train_load_use_classifier_with_sampler(results_base_path, tasks_base_path):
    corpus = flair.datasets.ClassificationCorpus(tasks_base_path / "imdb", label_type="topic")
    label_dict = corpus.make_label_dictionary(label_type="topic")

    model: TextClassifier = TextClassifier(
        document_embeddings=document_embeddings,
        label_dictionary=label_dict,
        label_type="topic",
        multi_label=False,
    )

    trainer = ModelTrainer(model, corpus)
    trainer.train(
        results_base_path,
        max_epochs=2,
        shuffle=False,
        sampler=ImbalancedClassificationDatasetSampler,
    )

    sentence = Sentence("Berlin is a really nice city.")
    model.predict(sentence)
    for label in sentence.labels:
        assert label.value is not None
        assert 0.0 <= label.score <= 1.0
        assert type(label.score) is float

    del trainer, model, corpus

    loaded_model = TextClassifier.load(results_base_path / "final-model.pt")
    sentence = Sentence("I love Berlin")
    sentence_empty = Sentence(" ")
    loaded_model.predict(sentence)
    loaded_model.predict([sentence, sentence_empty])
    loaded_model.predict([sentence_empty])

    # clean up results directory
    shutil.rmtree(results_base_path)
    del loaded_model