def train():
    """Train a text classifier on the SST corpus with stacked GloVe + Flair embeddings."""
    corpus: Corpus = ClassificationCorpus(
        sst_folder,
        test_file='test.csv',
        dev_file='dev.csv',
        train_file='sst_dev.csv',
    )
    label_dict = corpus.make_label_dictionary()

    # Stack Flair string-embeddings with optional embeddings
    base_embedding = WordEmbeddings('glove')
    embedding_stack = [
        base_embedding,
        FlairEmbeddings('news-forward-fast'),
        FlairEmbeddings('news-backward-fast'),
    ]
    # drop any None entries so optional embeddings can be disabled
    word_embeddings = list(filter(None, embedding_stack))

    # Initialize document embedding by passing list of word embeddings
    document_embeddings = DocumentRNNEmbeddings(
        word_embeddings,
        hidden_size=512,
        reproject_words=True,
        reproject_words_dimension=256,
    )

    # Define classifier
    classifier = TextClassifier(
        document_embeddings,
        label_dictionary=label_dict,
        multi_label=False,
    )

    trainer = ModelTrainer(classifier, corpus)
    trainer.train(model_path, max_epochs=10, train_with_dev=False)
def train():
    """Train a Flair text classifier from FastText-formatted files under ./data."""
    # load training data in FastText format
    corpus = NLPTaskDataFetcher.load_classification_corpus(
        Path('./'),
        test_file='./data/test.txt',
        train_file='./data/train.txt',
    )

    # Combine different embeddings:
    # GloVe word embeddings + Flair contextual string embeddings
    word_embeddings = [
        WordEmbeddings('glove'),
        FlairEmbeddings('news-forward-fast'),
        FlairEmbeddings('news-backward-fast'),
    ]

    # use LSTM based method for combining the different embeddings
    document_embeddings = DocumentLSTMEmbeddings(
        word_embeddings,
        hidden_size=512,
        reproject_words=True,
        reproject_words_dimension=256,
    )

    classifier = TextClassifier(
        document_embeddings,
        label_dictionary=corpus.make_label_dictionary(),
        multi_label=False,
    )

    trainer = ModelTrainer(classifier, corpus)
    trainer.train('./models', max_epochs=10)
def optimize_lr():
    """Run the learning-rate finder for the text classifier and plot the curve."""
    corpus, label_dictionary = load_corpus()

    token_embeddings = [
        WordEmbeddings('glove'),
        FlairEmbeddings('news-forward'),
        FlairEmbeddings('news-backward'),
    ]
    document_embeddings = DocumentRNNEmbeddings(
        token_embeddings,
        hidden_size=512,
        reproject_words=True,
        reproject_words_dimension=256,
        bidirectional=True,
    )

    classifier = TextClassifier(
        document_embeddings,
        label_dictionary=label_dictionary,
        multi_label=False,
    )
    trainer = ModelTrainer(classifier, corpus)

    # 7. find learning rate
    learning_rate_tsv = trainer.find_learning_rate(
        'resources/classifiers/', 'learning_rate.tsv')

    # 8. plot the learning rate finder curve
    from flair.visual.training_curves import Plotter
    plotter = Plotter()
    plotter.plot_learning_rate(learning_rate_tsv)
def run_splits(word_embeddings, embeddings_name):
    """Train one classifier per cross-validation split (splits 1..5).

    :param word_embeddings: list of token embeddings to combine
    :param embeddings_name: subdirectory name under each split for model output
    """
    for split_idx in range(1, 6):
        print('##########')
        print('Split', str(split_idx))
        print('##########')

        data_folder = '<path_to_splits>/split_' + str(split_idx) + '/'
        corpus = ClassificationCorpus(
            data_folder,
            test_file='test.csv',
            dev_file='dev.csv',
            train_file='train.csv',
        )

        document_embeddings = DocumentLSTMEmbeddings(
            word_embeddings,
            hidden_size=512,
            reproject_words=True,
            reproject_words_dimension=256,
        )
        classifier = TextClassifier(
            document_embeddings,
            label_dictionary=corpus.make_label_dictionary(),
            multi_label=False,
        )

        trainer = ModelTrainer(classifier, corpus)
        trainer.train(data_folder + '/' + embeddings_name, max_epochs=150)
def train(self,
          learning_rate: float = 0.1,
          mini_batch_size: int = 16,
          anneal_factor: float = 0.5,
          patience: int = 5,
          max_epochs: int = 10):
    """Build the corpus and train a pooled-GloVe text classifier.

    :param learning_rate: initial SGD learning rate
    :param mini_batch_size: sentences per training batch
    :param anneal_factor: factor applied to the learning rate on plateau
    :param patience: epochs without improvement before annealing
    :param max_epochs: maximum number of training epochs
    """
    self.make_corpus()
    corpus = ClassificationCorpus(
        self.output_data_path,
        train_file='train.txt',
        dev_file='dev.txt',
        test_file='test.txt',
    )
    label_dictionary = corpus.make_label_dictionary()

    # Mean-pool GloVe token embeddings into one document embedding.
    document_pool = DocumentPoolEmbeddings([WordEmbeddings('glove')])
    classifier = TextClassifier(document_pool, label_dictionary=label_dictionary)

    trainer = ModelTrainer(classifier, corpus)
    trainer.train(
        self.model_path,
        learning_rate=learning_rate,
        mini_batch_size=mini_batch_size,
        anneal_factor=anneal_factor,
        patience=patience,
        max_epochs=max_epochs,
    )
def test_train_charlm_nocache_load_use_classifier(results_base_path, tasks_base_path):
    """Train on IMDB with uncached Flair embeddings, then reload and predict."""
    corpus = NLPTaskDataFetcher.load_corpus(NLPTask.IMDB, base_path=tasks_base_path)
    label_dict = corpus.make_label_dictionary()

    glove_embedding: TokenEmbeddings = FlairEmbeddings(
        'news-forward-fast', use_cache=False)
    document_embeddings: DocumentLSTMEmbeddings = DocumentLSTMEmbeddings(
        [glove_embedding], 128, 1, False, 64, False, False)

    model = TextClassifier(document_embeddings, label_dict, False)
    trainer = ModelTrainer(model, corpus)
    trainer.train(results_base_path, max_epochs=2, test_mode=True)

    sentence = Sentence("Berlin is a really nice city.")
    for predicted in model.predict(sentence):
        for label in predicted.labels:
            assert label.value is not None
            assert 0.0 <= label.score <= 1.0
            assert type(label.score) is float

    loaded_model = TextClassifier.load_from_file(results_base_path / 'final-model.pt')
    sentence = Sentence('I love Berlin')
    sentence_empty = Sentence(' ')
    loaded_model.predict(sentence)
    loaded_model.predict([sentence, sentence_empty])
    loaded_model.predict([sentence_empty])

    # clean up results directory
    shutil.rmtree(results_base_path)
def test_train_charlm_load_use_classifier():
    """Train an IMDB classifier with CharLM embeddings, then reload and predict."""
    corpus = NLPTaskDataFetcher.fetch_data(NLPTask.IMDB)
    label_dict = corpus.make_label_dictionary()

    glove_embedding: TokenEmbeddings = CharLMEmbeddings('news-forward-fast')
    document_embeddings: DocumentLSTMEmbeddings = DocumentLSTMEmbeddings(
        [glove_embedding], 128, 1, False, 64, False, False)

    model = TextClassifier(document_embeddings, label_dict, False)
    trainer = TextClassifierTrainer(model, corpus, label_dict, False)
    trainer.train('./results', max_epochs=2)

    sentence = Sentence("Berlin is a really nice city.")
    for predicted in model.predict(sentence):
        for label in predicted.labels:
            assert label.value is not None
            assert 0.0 <= label.score <= 1.0
            assert type(label.score) is float

    loaded_model = TextClassifier.load_from_file('./results/final-model.pt')
    sentence = Sentence('I love Berlin')
    sentence_empty = Sentence(' ')
    loaded_model.predict(sentence)
    loaded_model.predict([sentence, sentence_empty])
    loaded_model.predict([sentence_empty])

    # clean up results directory
    shutil.rmtree('./results')
def test_train_charlm_load_use_classifier(results_base_path, tasks_base_path):
    """Train an IMDB classifier with Flair embeddings, then reload and predict."""
    corpus = NLPTaskDataFetcher.load_corpus('imdb', base_path=tasks_base_path)
    label_dict = corpus.make_label_dictionary()

    glove_embedding = FlairEmbeddings('news-forward-fast')
    document_embeddings = DocumentLSTMEmbeddings(
        [glove_embedding], 128, 1, False, 64, False, False)

    model = TextClassifier(document_embeddings, label_dict, False)
    trainer = ModelTrainer(model, corpus)
    trainer.train(results_base_path, EvaluationMetric.MACRO_F1_SCORE,
                  max_epochs=2, test_mode=True)

    sentence = Sentence('Berlin is a really nice city.')
    for predicted in model.predict(sentence):
        for label in predicted.labels:
            assert label.value is not None
            assert 0.0 <= label.score <= 1.0
            assert type(label.score) is float

    loaded_model = TextClassifier.load_from_file(results_base_path / 'final-model.pt')
    sentence = Sentence('I love Berlin')
    sentence_empty = Sentence(' ')
    loaded_model.predict(sentence)
    loaded_model.predict([sentence, sentence_empty])
    loaded_model.predict([sentence_empty])

    shutil.rmtree(results_base_path)
def train():
    """Train a GloVe-based RNN text classifier on the SST-5 corpus."""
    # Get the SST-5 corpus
    corpus: Corpus = SENTEVAL_SST_GRANULAR()

    # create the label dictionary
    label_dict = corpus.make_label_dictionary()

    # make a list of word embeddings ( Using Glove for testing )
    word_embeddings = [WordEmbeddings('glove')]

    # initialize document embedding by passing list of word embeddings
    document_embeddings = DocumentRNNEmbeddings(word_embeddings, hidden_size=256)

    # create the text classifier
    classifier = TextClassifier(document_embeddings, label_dictionary=label_dict)

    # initialize the text classifier trainer
    trainer = ModelTrainer(classifier, corpus)

    # start the training
    trainer.train(
        'resources/taggers/trec',
        learning_rate=0.1,
        mini_batch_size=32,
        anneal_factor=0.5,
        patience=5,
        embeddings_storage_mode='gpu',
        max_epochs=15,
    )
def test_train_charlm_nocache_load_use_classifier(results_base_path, tasks_base_path):
    """Train on IMDB with uncached Flair embeddings, then reload and predict."""
    corpus = flair.datasets.ClassificationCorpus(tasks_base_path / "imdb")
    label_dict = corpus.make_label_dictionary()

    embedding: TokenEmbeddings = FlairEmbeddings("news-forward-fast", use_cache=False)
    document_embeddings: DocumentRNNEmbeddings = DocumentRNNEmbeddings(
        [embedding], 128, 1, False, 64, False, False)

    model = TextClassifier(document_embeddings, label_dict, False)
    trainer = ModelTrainer(model, corpus)
    trainer.train(results_base_path, max_epochs=2, shuffle=False)

    sentence = Sentence("Berlin is a really nice city.")
    for predicted in model.predict(sentence):
        for label in predicted.labels:
            assert label.value is not None
            assert 0.0 <= label.score <= 1.0
            assert type(label.score) is float

    loaded_model = TextClassifier.load(results_base_path / "final-model.pt")
    sentence = Sentence("I love Berlin")
    sentence_empty = Sentence(" ")
    loaded_model.predict(sentence)
    loaded_model.predict([sentence, sentence_empty])
    loaded_model.predict([sentence_empty])

    # clean up results directory
    shutil.rmtree(results_base_path)
def test_train_classifier_with_sampler(results_base_path, tasks_base_path):
    """Train with the imbalanced-class sampler, then reload the saved model."""
    corpus = flair.datasets.ClassificationCorpus(tasks_base_path / "imdb")
    label_dict = corpus.make_label_dictionary()

    # NOTE(review): document_embeddings is expected to come from an enclosing
    # fixture/module scope — confirm it is defined where this test runs.
    model: TextClassifier = TextClassifier(
        document_embeddings, label_dict, multi_label=False)

    trainer = ModelTrainer(model, corpus)
    trainer.train(
        results_base_path,
        max_epochs=2,
        shuffle=False,
        sampler=ImbalancedClassificationDatasetSampler,
    )

    sentence = Sentence("Berlin is a really nice city.")
    for predicted in model.predict(sentence):
        for label in predicted.labels:
            assert label.value is not None
            assert 0.0 <= label.score <= 1.0
            assert type(label.score) is float

    del trainer, model, corpus
    loaded_model = TextClassifier.load(results_base_path / "final-model.pt")

    # clean up results directory
    shutil.rmtree(results_base_path)
    del loaded_model
def test_train_resume_text_classification_training(results_base_path, tasks_base_path):
    """Train with checkpointing, then resume training from the checkpoint."""
    corpus = flair.datasets.ClassificationCorpus(tasks_base_path / "imdb")
    label_dict = corpus.make_label_dictionary()

    # document_embeddings: DocumentRNNEmbeddings = DocumentRNNEmbeddings(
    #     [flair_embeddings], 128, 1, False
    # )
    model = TextClassifier(document_embeddings, label_dict, multi_label=False)

    trainer = ModelTrainer(model, corpus)
    trainer.train(results_base_path, max_epochs=2, shuffle=False, checkpoint=True)
    del trainer, model

    # resume from the saved checkpoint and train for two more epochs
    trainer = ModelTrainer.load_checkpoint(results_base_path / "checkpoint.pt", corpus)
    trainer.train(results_base_path, max_epochs=2, shuffle=False, checkpoint=True)

    # clean up results directory
    shutil.rmtree(results_base_path)
    del trainer
def test_train_load_use_classifier_multi_label(results_base_path, tasks_base_path):
    """Train a multi-label classifier and verify predictions survive save/load."""
    # corpus = NLPTaskDataFetcher.load_corpus('multi_class', base_path=tasks_base_path)
    corpus = NLPTaskDataFetcher.load_classification_corpus(
        data_folder=tasks_base_path / "multi_class")
    label_dict = corpus.make_label_dictionary()

    word_embedding: WordEmbeddings = WordEmbeddings("turian")
    document_embeddings = DocumentRNNEmbeddings(
        embeddings=[word_embedding],
        hidden_size=32,
        reproject_words=False,
        bidirectional=False,
    )

    model = TextClassifier(document_embeddings, label_dict, multi_label=True)
    trainer = ModelTrainer(model, corpus)
    trainer.train(
        results_base_path,
        EvaluationMetric.MICRO_F1_SCORE,
        mini_batch_size=1,
        max_epochs=100,
        test_mode=True,
        checkpoint=False,
    )

    sentence = Sentence("apple tv")
    for predicted in model.predict(sentence):
        for label in predicted.labels:
            print(label)
            assert label.value is not None
            assert 0.0 <= label.score <= 1.0
            assert type(label.score) is float

    sentence = Sentence("apple tv")
    for predicted in model.predict(sentence):
        assert "apple" in sentence.get_label_names()
        assert "tv" in sentence.get_label_names()
        for label in predicted.labels:
            print(label)
            assert label.value is not None
            assert 0.0 <= label.score <= 1.0
            assert type(label.score) is float

    loaded_model = TextClassifier.load_from_file(results_base_path / "final-model.pt")
    sentence = Sentence("I love Berlin")
    sentence_empty = Sentence(" ")
    loaded_model.predict(sentence)
    loaded_model.predict([sentence, sentence_empty])
    loaded_model.predict([sentence_empty])

    # clean up results directory
    shutil.rmtree(results_base_path)
def test_train_charlm_load_use_classifier(results_base_path, tasks_base_path):
    """Train an IMDB classifier with Flair embeddings, then reload and predict."""
    corpus = NLPTaskDataFetcher.load_corpus("imdb", base_path=tasks_base_path)
    label_dict = corpus.make_label_dictionary()

    embedding: TokenEmbeddings = FlairEmbeddings("news-forward-fast")
    document_embeddings: DocumentRNNEmbeddings = DocumentRNNEmbeddings(
        [embedding], 128, 1, False, 64, False, False)

    model = TextClassifier(document_embeddings, label_dict, False)
    trainer = ModelTrainer(model, corpus)
    trainer.train(results_base_path, EvaluationMetric.MACRO_F1_SCORE,
                  max_epochs=2, test_mode=True)

    sentence = Sentence("Berlin is a really nice city.")
    for predicted in model.predict(sentence):
        for label in predicted.labels:
            assert label.value is not None
            assert 0.0 <= label.score <= 1.0
            assert type(label.score) is float

    loaded_model = TextClassifier.load_from_file(results_base_path / "final-model.pt")
    sentence = Sentence("I love Berlin")
    sentence_empty = Sentence(" ")
    loaded_model.predict(sentence)
    loaded_model.predict([sentence, sentence_empty])
    loaded_model.predict([sentence_empty])

    # clean up results directory
    shutil.rmtree(results_base_path)
def test_train_resume_text_classification_training(results_base_path, tasks_base_path):
    """Train with checkpointing enabled, then resume from the checkpoint file."""
    corpus = NLPTaskDataFetcher.load_corpus('imdb', base_path=tasks_base_path)
    label_dict = corpus.make_label_dictionary()

    embeddings: TokenEmbeddings = FlairEmbeddings('news-forward-fast', use_cache=False)
    document_embeddings: DocumentLSTMEmbeddings = DocumentLSTMEmbeddings(
        [embeddings], 128, 1, False)

    model = TextClassifier(document_embeddings, label_dict, False)
    trainer = ModelTrainer(model, corpus)
    trainer.train(results_base_path, max_epochs=2, test_mode=True, checkpoint=True)

    # resume from checkpoint and train again
    trainer = ModelTrainer.load_from_checkpoint(
        results_base_path / 'checkpoint.pt', 'TextClassifier', corpus)
    trainer.train(results_base_path, max_epochs=2, test_mode=True, checkpoint=True)

    # clean up results directory
    shutil.rmtree(results_base_path)
def test_train_resume_classifier(results_base_path, tasks_base_path):
    """Train for 2 epochs with checkpointing, then resume the checkpoint to epoch 4."""
    corpus = flair.datasets.ClassificationCorpus(
        tasks_base_path / "imdb", label_type="topic")
    label_dict = corpus.make_label_dictionary(label_type="topic")

    # NOTE(review): document_embeddings is expected to come from an enclosing
    # fixture/module scope — confirm it is defined where this test runs.
    model = TextClassifier(
        document_embeddings=document_embeddings,
        label_dictionary=label_dict,
        multi_label=False,
        label_type="topic",
    )

    # train model for 2 epochs
    trainer = ModelTrainer(model, corpus)
    trainer.train(results_base_path, max_epochs=2, shuffle=False, checkpoint=True)
    del model

    # load the checkpoint model and train until epoch 4
    checkpoint_model = TextClassifier.load(results_base_path / "checkpoint.pt")
    with pytest.warns(UserWarning):
        trainer.resume(model=checkpoint_model, max_epochs=4)

    del trainer
def test_train_load_use_classifier(results_base_path, tasks_base_path):
    """Train a topic classifier, check its predictions, then reload it from disk."""
    corpus = flair.datasets.ClassificationCorpus(
        tasks_base_path / "imdb", label_type="topic")
    label_dict = corpus.make_label_dictionary(label_type="topic")

    # NOTE(review): document_embeddings is expected to come from an enclosing
    # fixture/module scope — confirm it is defined where this test runs.
    model: TextClassifier = TextClassifier(
        document_embeddings=document_embeddings,
        label_dictionary=label_dict,
        label_type="topic",
        multi_label=False,
    )

    trainer = ModelTrainer(model, corpus)
    trainer.train(results_base_path, max_epochs=2, shuffle=False)

    sentence = Sentence("Berlin is a really nice city.")
    model.predict(sentence)
    for predicted_label in sentence.labels:
        assert predicted_label.value is not None
        assert 0.0 <= predicted_label.score <= 1.0
        assert type(predicted_label.score) is float

    del trainer, model, corpus
    loaded_model = TextClassifier.load(results_base_path / "final-model.pt")

    sentence = Sentence("I love Berlin")
    sentence_empty = Sentence(" ")
    loaded_model.predict(sentence)
    loaded_model.predict([sentence, sentence_empty])
    loaded_model.predict([sentence_empty])

    del loaded_model
def train_model(data_dir, max_epochs):
    """Train a GloVe-based RNN text classifier, reporting progress via Streamlit.

    :param data_dir: folder containing the classification corpus
    :param max_epochs: maximum number of training epochs
    """
    st.write('Creating word corpus for training...')
    corpus = ClassificationCorpus(data_dir)
    label_dict = corpus.make_label_dictionary()
    st.write('Done')

    st.write('Load and create Embeddings for text data...')
    word_embeddings = [
        WordEmbeddings('glove'),
        # FlairEmbeddings('news-forward'),
        # FlairEmbeddings('news-backward')
    ]
    document_embeddings = DocumentRNNEmbeddings(
        word_embeddings,
        hidden_size=512,
        reproject_words=True,
        reproject_words_dimension=256,
    )
    st.write('Done')

    st.write('Preparing')
    classifier = TextClassifier(document_embeddings, label_dictionary=label_dict)
    trainer = ModelTrainer(classifier, corpus)
    trainer.train(
        'model-saves',
        learning_rate=0.1,
        mini_batch_size=32,
        anneal_factor=0.5,
        patience=8,
        max_epochs=max_epochs,
        checkpoint=True,
    )
    st.write('Model Training Finished!')
def test_train_load_use_classifier_with_prob(results_base_path, tasks_base_path):
    """Train a classifier and exercise multi_class_prob prediction before/after reload."""
    corpus = flair.datasets.ClassificationCorpus(tasks_base_path / "imdb")
    label_dict = corpus.make_label_dictionary()

    word_embedding: WordEmbeddings = WordEmbeddings("turian")
    document_embeddings: DocumentRNNEmbeddings = DocumentRNNEmbeddings(
        [word_embedding], 128, 1, False, 64, False, False)

    model = TextClassifier(document_embeddings, label_dict, False)
    trainer = ModelTrainer(model, corpus)
    trainer.train(results_base_path, max_epochs=2, shuffle=False)

    sentence = Sentence("Berlin is a really nice city.")
    for predicted in model.predict(sentence, multi_class_prob=True):
        for label in predicted.labels:
            assert label.value is not None
            assert 0.0 <= label.score <= 1.0
            assert type(label.score) is float

    loaded_model = TextClassifier.load(results_base_path / "final-model.pt")
    sentence = Sentence("I love Berlin")
    sentence_empty = Sentence(" ")
    loaded_model.predict(sentence, multi_class_prob=True)
    loaded_model.predict([sentence, sentence_empty], multi_class_prob=True)
    loaded_model.predict([sentence_empty], multi_class_prob=True)

    # clean up results directory
    shutil.rmtree(results_base_path)
def test_train_resume_text_classification_training(results_base_path, tasks_base_path):
    """Train with checkpointing, then resume training via a loaded checkpoint."""
    corpus = flair.datasets.ClassificationCorpus(tasks_base_path / "imdb")
    label_dict = corpus.make_label_dictionary()

    embeddings: TokenEmbeddings = FlairEmbeddings("news-forward-fast", use_cache=False)
    document_embeddings: DocumentRNNEmbeddings = DocumentRNNEmbeddings(
        [embeddings], 128, 1, False)

    model = TextClassifier(document_embeddings, label_dict, False)
    trainer = ModelTrainer(model, corpus)
    trainer.train(results_base_path, max_epochs=2, shuffle=False, checkpoint=True)

    # resume: load checkpoint, rebuild trainer, train again
    checkpoint = TextClassifier.load_checkpoint(results_base_path / "checkpoint.pt")
    trainer = ModelTrainer.load_from_checkpoint(checkpoint, corpus)
    trainer.train(results_base_path, max_epochs=2, shuffle=False, checkpoint=True)

    # clean up results directory
    shutil.rmtree(results_base_path)
def test_train_resume_classifier(results_base_path, tasks_base_path):
    """Train a topic classifier with checkpointing, then resume from checkpoint."""
    corpus = flair.datasets.ClassificationCorpus(
        tasks_base_path / "imdb", label_type="topic")
    label_dict = corpus.make_label_dictionary(label_type="topic")

    # NOTE(review): document_embeddings is expected to come from an enclosing
    # fixture/module scope — confirm it is defined where this test runs.
    model = TextClassifier(
        document_embeddings=document_embeddings,
        label_dictionary=label_dict,
        multi_label=False,
        label_type="topic",
    )

    trainer = ModelTrainer(model, corpus)
    trainer.train(results_base_path, max_epochs=2, shuffle=False, checkpoint=True)
    del trainer, model

    # resume training from the saved checkpoint
    trainer = ModelTrainer.load_checkpoint(results_base_path / "checkpoint.pt", corpus)
    trainer.train(results_base_path, max_epochs=2, shuffle=False, checkpoint=True)

    # clean up results directory
    shutil.rmtree(results_base_path)
    del trainer
def train(args):
    """Train.

    Builds a column corpus, a joint sequence tagger + text classifier pair,
    and runs the JointModelTrainer with the hyper-parameters from args.
    """
    start_time = time.time()

    column_format = {i: col for i, col in enumerate(args.data_columns)}
    corpus: Corpus = ClassColumnCorpus(
        args.data_dir,
        column_format,
        train_file=args.train_file,
        dev_file=args.dev_file,
        comment_symbol=args.comment_symbol,
        label_symbol=args.label_symbol,
    )

    # the tag type is given by the last data column
    tag_type = args.data_columns[-1]
    tag_dict = corpus.make_tag_dictionary(tag_type=tag_type)
    label_dict = corpus.make_label_dictionary()
    vocab = corpus.make_vocab_dictionary().get_items()

    embeddings = utils.init_embeddings(vocab, args)

    # model 1: token-level tagger
    model1: SequenceTagger = SequenceTagger(
        hidden_size=args.hidden_size,
        embeddings=embeddings,
        tag_dictionary=tag_dict,
        tag_type=tag_type,
        column_format=column_format,
        use_crf=True,
        use_attn=args.use_attn,
        attn_type=args.attn_type,
        num_heads=args.num_heads,
        scaling=args.scaling,
        pooling_operation=args.pooling_operation,
        use_sent_query=args.use_sent_query,
    )

    # model 2: sentence-level classifier sharing the same embeddings
    document_embeddings = DocumentRNNEmbeddings(
        [embeddings],
        hidden_size=args.hidden_size,
    )
    model2 = TextClassifier(document_embeddings, label_dictionary=label_dict)

    utils.init_joint_models(model1, model2, args)

    trainer: JointModelTrainer = JointModelTrainer(
        model1, model2, corpus, utils.optim_method(args.optim))
    trainer.train(
        args.model_dir,
        mini_batch_size=args.mini_batch_size,
        max_epochs=args.max_epochs,
        anneal_factor=args.anneal_factor,
        learning_rate=args.learning_rate,
        patience=args.patience,
        min_learning_rate=args.min_learning_rate,
        embeddings_storage_mode=args.embeddings_storage_mode,
        gamma=args.gamma,
    )

    logger.info("End of training: time %.1f min", (time.time() - start_time) / 60)
def test_text_classifier_transformer_finetune(results_base_path, tasks_base_path):
    """Fine-tune a transformer classifier on a trivial corpus and check it fits."""
    flair.set_seed(123)

    corpus = ClassificationCorpus(
        tasks_base_path / "trivial" / "trivial_text_classification_single",
        label_type="city",
    )
    label_dict = corpus.make_label_dictionary(label_type="city")

    model: TextClassifier = TextClassifier(
        document_embeddings=TransformerDocumentEmbeddings("distilbert-base-uncased"),
        label_dictionary=label_dict,
        label_type="city",
        multi_label=False,
    )

    trainer = ModelTrainer(model, corpus)
    trainer.fine_tune(
        results_base_path,
        mini_batch_size=2,
        max_epochs=10,
        shuffle=True,
        learning_rate=0.5e-5,
        num_workers=2,
    )

    # check if model can predict
    sentence = Sentence("this is Berlin")
    sentence_empty = Sentence(" ")
    model.predict(sentence)
    model.predict([sentence, sentence_empty])
    model.predict([sentence_empty])

    # load model
    loaded_model = TextClassifier.load(results_base_path / "final-model.pt")

    # check if model predicts correct label
    sentence = Sentence("this is Berlin")
    sentence_empty = Sentence(" ")
    loaded_model.predict([sentence, sentence_empty])

    values = []
    for label in sentence.labels:
        assert label.value is not None
        assert 0.0 <= label.score <= 1.0
        assert type(label.score) is float
        values.append(label.value)
    assert "Berlin" in values

    # check if loaded model successfully fit the training data
    result: Result = loaded_model.evaluate(corpus.test, gold_label_type="city")
    assert result.classification_report["micro avg"]["f1-score"] == 1.0

    del loaded_model
def test_train_load_use_classifier_multi_label(results_base_path, tasks_base_path):
    """Train a multi-label topic classifier and verify predictions survive save/load."""
    corpus = flair.datasets.ClassificationCorpus(
        tasks_base_path / "multi_class", label_type="topic")
    label_dict = corpus.make_label_dictionary(label_type="topic")

    # NOTE(review): document_embeddings is expected to come from an enclosing
    # fixture/module scope — confirm it is defined where this test runs.
    model: TextClassifier = TextClassifier(
        document_embeddings=document_embeddings,
        label_dictionary=label_dict,
        label_type="topic",
        multi_label=True,
    )

    trainer = ModelTrainer(model, corpus)
    trainer.train(
        results_base_path,
        mini_batch_size=1,
        max_epochs=100,
        shuffle=False,
        checkpoint=False,
        train_with_test=True,
        train_with_dev=True,
    )

    sentence = Sentence("apple tv")
    model.predict(sentence)
    for predicted_label in sentence.labels:
        print(predicted_label)
        assert predicted_label.value is not None
        assert 0.0 <= predicted_label.score <= 1.0
        assert type(predicted_label.score) is float

    sentence = Sentence("apple tv")
    model.predict(sentence)
    assert "apple" in sentence.get_label_names()
    assert "tv" in sentence.get_label_names()
    for predicted_label in sentence.labels:
        assert predicted_label.value is not None
        assert 0.0 <= predicted_label.score <= 1.0
        assert type(predicted_label.score) is float

    del trainer, model, corpus
    loaded_model = TextClassifier.load(results_base_path / "final-model.pt")

    sentence = Sentence("I love Berlin")
    sentence_empty = Sentence(" ")
    loaded_model.predict(sentence)
    loaded_model.predict([sentence, sentence_empty])
    loaded_model.predict([sentence_empty])

    # clean up results directory
    shutil.rmtree(results_base_path)
    del loaded_model
def __init__(
    self,
    task_name: str,
    label_dictionary: Dictionary,
    label_type: str,
    embeddings: str = 'bert-base-uncased',
    num_negative_labels_to_sample: int = 2,
    prefix: bool = True,
    **tagger_args,
):
    """
    Initializes a TARSClassifier.

    :param task_name: a string depicting the name of the task
    :param label_dictionary: dictionary of labels you want to predict
    :param label_type: label type of the task (forwarded to
        add_and_switch_to_new_task)
    :param embeddings: name of the pre-trained transformer model e.g.,
        'bert-base-uncased' etc (an already-built TransformerDocumentEmbeddings
        instance is also accepted)
    :param num_negative_labels_to_sample: number of negative labels to sample
        for each positive labels against a sentence during training. Defaults to
        2 negative labels for each positive label. The model would sample all
        the negative labels if None is passed. That slows down the training
        considerably.
    :param prefix: stored on the instance; presumably controls whether the task
        name prefixes label verbalizations — TODO confirm against usage
    :param tagger_args: additional keyword arguments forwarded to the internal
        TextClassifier
    """
    super(TARSClassifier, self).__init__()

    from flair.embeddings import TransformerDocumentEmbeddings

    # Accept either a model name string or a ready-made embedding object.
    if not isinstance(embeddings, TransformerDocumentEmbeddings):
        embeddings = TransformerDocumentEmbeddings(
            model=embeddings,
            fine_tune=True,
            layers='-1',
            layer_mean=False,
        )

    # prepare TARS dictionary: the internal model makes a binary
    # 'True'/'False' decision per candidate label
    tars_dictionary = Dictionary(add_unk=False)
    tars_dictionary.add_item('False')
    tars_dictionary.add_item('True')

    # initialize a bare-bones text classifier (the original comment said
    # "sequence tagger", but this is a TextClassifier)
    self.tars_model = TextClassifier(
        document_embeddings=embeddings,
        label_dictionary=tars_dictionary,
        label_type=self.static_label_type,
        **tagger_args,
    )

    # transformer separator placed between label verbalization and text
    self.separator = str(self.tars_embeddings.tokenizer.sep_token)
    if self.tars_embeddings.tokenizer._bos_token:
        self.separator += str(self.tars_embeddings.tokenizer.bos_token)

    self.prefix = prefix
    self.num_negative_labels_to_sample = num_negative_labels_to_sample

    # Store task specific labels since TARS can handle multiple tasks
    self.add_and_switch_to_new_task(task_name, label_dictionary, label_type)
def test_train_load_use_classifier_multi_label(results_base_path, tasks_base_path):
    """Train a multi-label classifier on the multi_class corpus and reload it."""
    # corpus = NLPTaskDataFetcher.load_corpus('multi_class', base_path=tasks_base_path)
    corpus = NLPTaskDataFetcher.load_classification_corpus(
        data_folder=tasks_base_path / 'multi_class')
    label_dict = corpus.make_label_dictionary()

    glove_embedding: WordEmbeddings = WordEmbeddings('en-glove')
    document_embeddings = DocumentLSTMEmbeddings(
        embeddings=[glove_embedding],
        hidden_size=32,
        reproject_words=False,
        bidirectional=False,
    )

    model = TextClassifier(document_embeddings, label_dict, multi_label=True)
    trainer = ModelTrainer(model, corpus)
    trainer.train(results_base_path, EvaluationMetric.MICRO_F1_SCORE,
                  max_epochs=100, test_mode=True, checkpoint=False)

    sentence = Sentence('apple tv')
    for predicted in model.predict(sentence):
        for label in predicted.labels:
            print(label)
            assert label.value is not None
            assert 0.0 <= label.score <= 1.0
            assert type(label.score) is float

    sentence = Sentence("apple tv")
    for predicted in model.predict(sentence):
        assert 'apple' in sentence.get_label_names()
        assert 'tv' in sentence.get_label_names()
        for label in predicted.labels:
            print(label)
            assert label.value is not None
            assert 0.0 <= label.score <= 1.0
            assert type(label.score) is float

    loaded_model = TextClassifier.load_from_file(results_base_path / 'final-model.pt')
    sentence = Sentence('I love Berlin')
    sentence_empty = Sentence(' ')
    loaded_model.predict(sentence)
    loaded_model.predict([sentence, sentence_empty])
    loaded_model.predict([sentence_empty])

    # clean up results directory
    shutil.rmtree(results_base_path)
def test_text_classifier_multi(results_base_path, tasks_base_path):
    """Train a multi-label classifier on a trivial corpus and check it fits."""
    flair.set_seed(123)

    corpus = ClassificationCorpus(
        tasks_base_path / "trivial" / "trivial_text_classification_multi",
        label_type="city",
    )
    label_dict = corpus.make_label_dictionary(label_type="city")

    model: TextClassifier = TextClassifier(
        document_embeddings=DocumentPoolEmbeddings(
            [turian_embeddings], fine_tune_mode="linear"),
        label_dictionary=label_dict,
        label_type="city",
        multi_label=True,
    )

    trainer = ModelTrainer(model, corpus)
    trainer.train(results_base_path, mini_batch_size=2, max_epochs=50, shuffle=True)

    # check if model can predict
    sentence = Sentence("this is Berlin")
    sentence_empty = Sentence(" ")
    model.predict(sentence)
    model.predict([sentence, sentence_empty])
    model.predict([sentence_empty])

    # load model
    loaded_model = TextClassifier.load(results_base_path / "final-model.pt")

    # check if model predicts correct labels
    sentence = Sentence("this is Berlin")
    sentence_double = Sentence("this is Berlin and pizza")
    loaded_model.predict([sentence, sentence_double])

    values = []
    for label in sentence_double.labels:
        assert label.value is not None
        assert 0.0 <= label.score <= 1.0
        assert type(label.score) is float
        values.append(label.value)
    assert "Berlin" in values
    assert "pizza" in values

    # check if loaded model successfully fit the training data
    result: Result = loaded_model.evaluate(corpus.test, gold_label_type="city")
    print(result.classification_report)
    assert result.classification_report["micro avg"]["f1-score"] == 1.0

    del loaded_model
def trainFlairClassifier(df, columns, trainNameCsv, testNameCsv, devNameCsv,
                         classifierFileName):
    """Split df by row order (70/20/10), train a Flair text classifier on the
    splits, and save the trained model.

    :param df: DataFrame with an 'id' column plus the columns to export
    :param columns: columns written to the intermediate tab-separated files
    :param trainNameCsv: file name for the training split (name only, no path)
    :param testNameCsv: file name for the test split
    :param devNameCsv: file name for the dev split
    :param classifierFileName: path where the trained model is saved
    """
    import os

    ids = df['id'].tolist()
    nSamples = len(ids)
    # NOTE(review): despite the names idx80/idx90, the train split is 70%.
    idx80 = int(nSamples * 0.7)
    idx90 = int(nSamples * 0.9)
    train_ids = ids[:idx80]
    test_ids = ids[idx80:idx90]
    dev_ids = ids[idx90:]

    with TemporaryDirectory() as temp_dir:
        # BUG FIX: the original concatenated temp_dir + name with no path
        # separator, so the split files were created *next to* the temporary
        # directory (and never cleaned up) instead of inside it.
        trainCsv = os.path.join(temp_dir, trainNameCsv)
        testCsv = os.path.join(temp_dir, testNameCsv)
        devCsv = os.path.join(temp_dir, devNameCsv)

        df[df['id'].isin(train_ids)].to_csv(
            trainCsv, columns=columns, sep='\t', index=False, header=False)
        df[df['id'].isin(test_ids)].to_csv(
            testCsv, columns=columns, sep='\t', index=False, header=False)
        df[df['id'].isin(dev_ids)].to_csv(
            devCsv, columns=columns, sep='\t', index=False, header=False)

        corpus = NLPTaskDataFetcher.load_classification_corpus(
            temp_dir, train_file=trainCsv, test_file=testCsv, dev_file=devCsv)

        word_embeddings = [
            WordEmbeddings('glove'),
            FlairEmbeddings('news-forward-fast'),
            FlairEmbeddings('news-backward-fast'),
        ]
        document_embeddings = DocumentLSTMEmbeddings(
            word_embeddings,
            hidden_size=512,
            reproject_words=True,
            reproject_words_dimension=256)

        classifier = TextClassifier(
            document_embeddings,
            label_dictionary=corpus.make_label_dictionary(),
            multi_label=False)

        trainer = ModelTrainer(classifier, corpus)
        trainer.train(temp_dir, max_epochs=50)

        classifier.save(classifierFileName)
def main(args):
    """Train a GloVe sentence classifier end-to-end and plot training curves."""
    # NOTE(review): the incoming `args` parameter is immediately replaced by a
    # fresh parse of sys.argv — confirm whether callers rely on passing args in.
    args = parser.parse_args()

    # 1. get the corpus
    corpus: TaggedCorpus = NLPTaskDataFetcher.load_classification_corpus(
        args.data_dir[0],
        train_file='train.txt',
        dev_file='dev.txt',
        test_file='test.txt',
    )

    # 2. create the label dictionary
    label_dict = corpus.make_label_dictionary()

    # 3. make a list of word embeddings
    word_embeddings = [
        WordEmbeddings('glove'),
        # comment in flair embeddings for state-of-the-art results
        # FlairEmbeddings('news-forward'),
        # FlairEmbeddings('news-backward'),
        # ELMoEmbeddings()
    ]

    # 4. init document embedding by passing list of word embeddings
    document_embeddings: DocumentLSTMEmbeddings = DocumentLSTMEmbeddings(
        word_embeddings,
        hidden_size=128,
        reproject_words=True,
        reproject_words_dimension=64,
    )

    # 5. create the text classifier
    classifier = TextClassifier(
        document_embeddings, label_dictionary=label_dict, multi_label=False)

    # 6. initialize the text classifier trainer
    trainer = ModelTrainer(classifier, corpus)

    # 7. start the training
    model_out = 'resources/classifiers/sentence-classification/glove'
    trainer.train(
        model_out,
        learning_rate=0.1,
        mini_batch_size=32,
        anneal_factor=0.5,
        patience=5,
        max_epochs=100,
    )

    # 8. plot training curves (optional)
    from flair.visual.training_curves import Plotter
    plotter = Plotter()
    plotter.plot_training_curves(join(model_out, 'loss.tsv'))
    plotter.plot_weights(join(model_out, 'weights.txt'))
def test_labels_to_indices(tasks_base_path):
    """_labels_to_indices must map each training sentence's first label to its
    dictionary index."""
    corpus = flair.datasets.ClassificationCorpus(tasks_base_path / "ag_news")
    label_dict = corpus.make_label_dictionary()

    # NOTE(review): document_embeddings is expected to come from an enclosing
    # fixture/module scope — confirm it is defined where this test runs.
    model = TextClassifier(document_embeddings, label_dict, multi_label=False)

    result = model._labels_to_indices(corpus.train)

    for position in range(len(corpus.train)):
        gold_value = corpus.train[position].labels[0].value
        expected_index = label_dict.get_idx_for_item(gold_value)
        assert result[position].item() == expected_index