def test_init_tars_and_switch(tasks_base_path):
    """Check that TARSClassifier reports the right class count after each task switch."""
    # test corpus
    imdb_corpus = ClassificationCorpus(tasks_base_path / "imdb")

    # create a TARS classifier on the two-class IMDB label dictionary
    classifier = TARSClassifier(
        task_name='2_CLASS',
        label_dictionary=imdb_corpus.make_label_dictionary(label_type='class'),
        label_type='class',
    )
    # the initial task has two classes
    assert len(classifier.get_current_label_dictionary()) == 2

    # a single label may be passed as a plain string
    classifier.add_and_switch_to_new_task('1_CLASS', 'one class', "testlabel")
    assert len(classifier.get_current_label_dictionary()) == 1

    # labels may be passed as a list
    classifier.add_and_switch_to_new_task('3_CLASS', ['list 1', 'list 2', 'list 3'], "testlabel")
    assert len(classifier.get_current_label_dictionary()) == 3

    # labels may be passed as a set
    classifier.add_and_switch_to_new_task('4_CLASS', {'set 1', 'set 2', 'set 3', 'set 4'}, "testlabel")
    assert len(classifier.get_current_label_dictionary()) == 4

    # labels may be passed as a flair Dictionary
    classifier.add_and_switch_to_new_task(
        '2_CLASS_AGAIN', imdb_corpus.make_label_dictionary(label_type='class'), "testlabel"
    )
    assert len(classifier.get_current_label_dictionary()) == 2
def run_splits(word_embeddings, embeddings_name, base_path='<path_to_splits>'):
    """Train a TextClassifier on each of the five pre-made data splits.

    :param word_embeddings: list of token-level embeddings fed to the document LSTM
    :param embeddings_name: subdirectory name (under each split folder) for model artifacts
    :param base_path: directory containing the split_1 .. split_5 folders
                      (defaults to the original placeholder for backward compatibility)
    """
    for i in range(1, 6):
        print('##########')
        print('Split', str(i))
        print('##########')
        # fix: the base path was hard-coded; it is now a parameter with the old
        # placeholder as default, and the doubled '/' in the output path is removed
        # (data_folder already ends with '/')
        data_folder = base_path + '/split_' + str(i) + '/'
        corpus = ClassificationCorpus(data_folder,
                                      test_file='test.csv',
                                      dev_file='dev.csv',
                                      train_file='train.csv')
        document_embeddings = DocumentLSTMEmbeddings(
            word_embeddings,
            hidden_size=512,
            reproject_words=True,
            reproject_words_dimension=256)
        classifier = TextClassifier(
            document_embeddings,
            label_dictionary=corpus.make_label_dictionary(),
            multi_label=False)
        trainer = ModelTrainer(classifier, corpus)
        trainer.train(data_folder + embeddings_name, max_epochs=150)
def train(self, learning_rate: float = 0.1, mini_batch_size: int = 16,
          anneal_factor: float = 0.5, patience: int = 5, max_epochs: int = 10):
    """Build the corpus files and train a pooled-GloVe text classifier.

    :param learning_rate: initial SGD learning rate
    :param mini_batch_size: sentences per training batch
    :param anneal_factor: factor applied to the learning rate on plateau
    :param patience: epochs without improvement before annealing
    :param max_epochs: maximum number of training epochs
    """
    # write train/dev/test files to self.output_data_path
    self.make_corpus()

    data = ClassificationCorpus(
        self.output_data_path,
        train_file='train.txt',
        dev_file='dev.txt',
        test_file='test.txt',
    )
    labels = data.make_label_dictionary()

    # mean-pool plain GloVe word vectors into a single document embedding
    doc_embeddings = DocumentPoolEmbeddings([WordEmbeddings('glove')])
    model = TextClassifier(doc_embeddings, label_dictionary=labels)

    ModelTrainer(model, data).train(
        self.model_path,
        learning_rate=learning_rate,
        mini_batch_size=mini_batch_size,
        anneal_factor=anneal_factor,
        patience=patience,
        max_epochs=max_epochs,
    )
def test_train_tars(tasks_base_path):
    """One-epoch TARS training smoke test followed by a prediction call."""
    # tiny test corpus
    data = ClassificationCorpus(tasks_base_path / "imdb_underscore")

    # TARS classifier on a tiny transformer backbone
    model = TARSClassifier(embeddings="sshleifer/tiny-distilroberta-base")

    # TARS handles multiple tasks, so one must be registered before training
    model.add_and_switch_to_new_task(
        task_name="question 2_CLASS",
        label_dictionary=data.make_label_dictionary(label_type='class'),
        label_type='class',
    )

    # train with a very small learning rate for a single epoch
    ModelTrainer(model, data).train(
        base_path='resources/taggers/trec',  # where model artifacts are stored
        learning_rate=0.02,
        mini_batch_size=1,
        max_epochs=1,  # terminate after one epoch
    )

    # the trained model must predict without errors
    sentence = Sentence("This is great!")
    model.predict(sentence)
def test_train_tars(tasks_base_path, results_base_path):
    """One-epoch TARS training smoke test writing artifacts to results_base_path."""
    # tiny test corpus
    data = ClassificationCorpus(tasks_base_path / "imdb_underscore")

    # TARS classifier on a tiny transformer backbone
    model = TARSClassifier(embeddings="sshleifer/tiny-distilroberta-base")

    # TARS supports multiple tasks, so one must be registered before training
    model.add_and_switch_to_new_task(
        task_name="question 2_CLASS",
        label_dictionary=data.make_label_dictionary(label_type="class"),
        label_type="class",
    )

    # train briefly with a very small learning rate
    ModelTrainer(model, data).train(
        base_path=results_base_path,
        learning_rate=0.02,
        mini_batch_size=1,
        max_epochs=1,
    )

    # the trained model must predict without errors
    sentence = Sentence("This is great!")
    model.predict(sentence)
def train_model(data_dir, max_epochs):
    """Train a GloVe + document-RNN text classifier, reporting progress via streamlit.

    :param data_dir: directory containing the classification corpus
    :param max_epochs: maximum number of training epochs
    """
    st.write('Creating word corpus for training...')
    corpus = ClassificationCorpus(data_dir)
    labels = corpus.make_label_dictionary()
    st.write('Done')

    st.write('Load and create Embeddings for text data...')
    # plain GloVe token embeddings, reprojected through a document RNN
    doc_embeddings = DocumentRNNEmbeddings(
        [WordEmbeddings('glove')],
        hidden_size=512,
        reproject_words=True,
        reproject_words_dimension=256,
    )
    st.write('Done')

    st.write('Preparing')
    model = TextClassifier(doc_embeddings, label_dictionary=labels)
    ModelTrainer(model, corpus).train(
        'model-saves',
        learning_rate=0.1,
        mini_batch_size=32,
        anneal_factor=0.5,
        patience=8,
        max_epochs=max_epochs,
        checkpoint=True,
    )
    st.write('Model Training Finished!')
def test_text_classifier_transformer_finetune(results_base_path, tasks_base_path):
    """Fine-tune a DistilBERT classifier on a trivial corpus and verify it fits it perfectly."""
    flair.set_seed(123)

    corpus = ClassificationCorpus(
        tasks_base_path / "trivial" / "trivial_text_classification_single",
        label_type="city",
    )
    labels = corpus.make_label_dictionary(label_type="city")

    classifier: TextClassifier = TextClassifier(
        document_embeddings=TransformerDocumentEmbeddings("distilbert-base-uncased"),
        label_dictionary=labels,
        label_type="city",
        multi_label=False,
    )

    ModelTrainer(classifier, corpus).fine_tune(
        results_base_path,
        mini_batch_size=2,
        max_epochs=10,
        shuffle=True,
        learning_rate=0.5e-5,
        num_workers=2,
    )

    # the freshly trained model must predict without errors, including on empty input
    sentence = Sentence("this is Berlin")
    sentence_empty = Sentence(" ")
    classifier.predict(sentence)
    classifier.predict([sentence, sentence_empty])
    classifier.predict([sentence_empty])

    # reload the stored model
    loaded_model = TextClassifier.load(results_base_path / "final-model.pt")

    # check that the loaded model predicts the correct label
    sentence = Sentence("this is Berlin")
    sentence_empty = Sentence(" ")
    loaded_model.predict([sentence, sentence_empty])

    values = []
    for label in sentence.labels:
        assert label.value is not None
        assert 0.0 <= label.score <= 1.0
        assert type(label.score) is float
        values.append(label.value)
    assert "Berlin" in values

    # the loaded model must fit the training data perfectly
    result: Result = loaded_model.evaluate(corpus.test, gold_label_type="city")
    assert result.classification_report["micro avg"]["f1-score"] == 1.0

    del loaded_model
def test_text_classifier_multi(results_base_path, tasks_base_path):
    """Train a multi-label classifier on a trivial corpus and verify it fits it perfectly."""
    flair.set_seed(123)

    corpus = ClassificationCorpus(
        tasks_base_path / "trivial" / "trivial_text_classification_multi",
        label_type="city",
    )
    labels = corpus.make_label_dictionary(label_type="city")

    classifier: TextClassifier = TextClassifier(
        document_embeddings=DocumentPoolEmbeddings([turian_embeddings], fine_tune_mode="linear"),
        label_dictionary=labels,
        label_type="city",
        multi_label=True,
    )

    ModelTrainer(classifier, corpus).train(results_base_path, mini_batch_size=2, max_epochs=50, shuffle=True)

    # the trained model must predict without errors, including on empty input
    sentence = Sentence("this is Berlin")
    sentence_empty = Sentence(" ")
    classifier.predict(sentence)
    classifier.predict([sentence, sentence_empty])
    classifier.predict([sentence_empty])

    # reload the stored model
    loaded_model = TextClassifier.load(results_base_path / "final-model.pt")

    # check that the loaded model predicts both labels for a two-topic sentence
    sentence = Sentence("this is Berlin")
    sentence_double = Sentence("this is Berlin and pizza")
    loaded_model.predict([sentence, sentence_double])

    values = []
    for label in sentence_double.labels:
        assert label.value is not None
        assert 0.0 <= label.score <= 1.0
        assert type(label.score) is float
        values.append(label.value)
    assert "Berlin" in values
    assert "pizza" in values

    # the loaded model must fit the training data perfectly
    result: Result = loaded_model.evaluate(corpus.test, gold_label_type="city")
    print(result.classification_report)
    assert result.classification_report["micro avg"]["f1-score"] == 1.0

    del loaded_model
def test_init_tars_and_switch(tasks_base_path):
    """TARS must report the correct class count after every task switch."""
    imdb = ClassificationCorpus(tasks_base_path / "imdb")

    model = TARSClassifier(
        task_name="2_CLASS",
        label_dictionary=imdb.make_label_dictionary(label_type="class"),
        label_type="class",
    )
    # initial task: two classes
    assert len(model.get_current_label_dictionary()) == 2

    # a single label given as a plain string
    model.add_and_switch_to_new_task("1_CLASS", "one class", "testlabel")
    assert len(model.get_current_label_dictionary()) == 1

    # three labels given as a list
    model.add_and_switch_to_new_task("3_CLASS", ["list 1", "list 2", "list 3"], "testlabel")
    assert len(model.get_current_label_dictionary()) == 3

    # four labels given as a set
    model.add_and_switch_to_new_task("4_CLASS", {"set 1", "set 2", "set 3", "set 4"}, "testlabel")
    assert len(model.get_current_label_dictionary()) == 4

    # two labels given as a flair Dictionary
    model.add_and_switch_to_new_task("2_CLASS_AGAIN", imdb.make_label_dictionary(label_type="class"), "testlabel")
    assert len(model.get_current_label_dictionary()) == 2
def train_sentiment_model(rootdir, train, dev, test, num_epochs, device, outputdir):
    """Train a document-RNN sentiment classifier.

    :param rootdir: directory containing the corpus files
    :param train: train split file name inside rootdir
    :param dev: dev split file name inside rootdir
    :param test: test split file name inside rootdir
    :param num_epochs: maximum number of training epochs
    :param device: torch device string, e.g. 'cuda:0' or 'cpu'
    :param outputdir: directory where model artifacts are written
    """
    flair.device = torch.device(device)

    corpus = ClassificationCorpus(rootdir, train_file=train, dev_file=dev, test_file=test,
                                  in_memory=False)
    label_dict = corpus.make_label_dictionary()

    # fix: the original also instantiated FlairEmbeddings('multi-forward') and
    # FlairEmbeddings('multi-backward') into locals that were never used — they only
    # caused needless model loading, so they have been removed.
    optional_embedding = ELMoEmbeddings('original')
    word_embeddings = list(filter(None, [
        optional_embedding,
        FlairEmbeddings('news-forward'),
        FlairEmbeddings('news-backward'),
    ]))

    # Initialize document embedding by passing list of word embeddings
    #
    # Note this will kick off model generation that will take a long time (several hours)
    # This will produce final-model.pt and best-model.pt files which represent a stored trained model.
    document_embeddings = DocumentRNNEmbeddings(
        word_embeddings,
        hidden_size=512,
        reproject_words=True,
        reproject_words_dimension=256,
    )

    classifier = TextClassifier(document_embeddings, label_dictionary=label_dict, multi_label=False)
    trainer = ModelTrainer(classifier, corpus)
    trainer.train(outputdir, max_epochs=num_epochs)
def _train_model(self):
    # type: () -> None
    """Train a stacked ELMo + Flair document classifier on this instance's data splits."""
    corpus = ClassificationCorpus(
        Path(__path_to_base__),
        test_file=os.path.basename(self.path_to_test),
        dev_file=os.path.basename(self.path_to_dev),
        train_file=os.path.basename(self.path_to_train),
    )

    # stack contextual string embeddings with ELMo token embeddings
    token_embeddings = [
        ELMoEmbeddings('original'),
        FlairEmbeddings('news-forward-fast'),
        FlairEmbeddings('news-backward-fast'),
    ]
    doc_embeddings = DocumentRNNEmbeddings(
        token_embeddings,
        hidden_size=512,
        reproject_words=True,
        reproject_words_dimension=256,
    )

    model = TextClassifier(
        doc_embeddings,
        label_dictionary=corpus.make_label_dictionary(),
        multi_label=False,
    )
    ModelTrainer(model, corpus).train(__path_to_base__, max_epochs=10)
def test_text_classifier_multi(results_base_path, tasks_base_path):
    """Train one epoch and check the training log contains the expected sections."""
    flair.set_seed(123)
    flair_embeddings = FlairEmbeddings("news-forward-fast")

    corpus = ClassificationCorpus(
        tasks_base_path / "trivial" / "trivial_text_classification_single",
        label_type="city",
    )
    label_dict = corpus.make_label_dictionary(label_type="city")

    model: TextClassifier = TextClassifier(
        document_embeddings=DocumentPoolEmbeddings([flair_embeddings], fine_tune_mode="linear"),
        label_dictionary=label_dict,
        label_type="city",
    )

    ModelTrainer(model, corpus).train(results_base_path, mini_batch_size=2, max_epochs=1, shuffle=True)
    del model

    # the trainer must have written a log file ...
    log_path = results_base_path / "training.log"
    assert log_path.exists()
    log_lines = log_path.read_text(encoding="utf-8").split("\n")

    # ... containing each of these section markers
    for needle in [
        "Device: ",
        "Corpus: ",
        "Parameters:",
        "- learning_rate: ",
        "- patience: ",
        "Embeddings storage mode:",
        "epoch 1 - iter",
        "EPOCH 1 done: loss",
        "Results:",
    ]:
        assert any(needle in line for line in log_lines), needle
OptimizationValue,
)
# NOTE(review): the closing ")" above ends an import statement that begins
# outside this chunk — presumably `from flair.hyperparameter.param_selection
# import ...`; confirm against the full file.

if __name__ == "__main__":
    # Hyper-parameter search setup for four downsampled German corpora.
    data_folder = Path("..", "classification", "data", "downsampled", "flair")
    for c in ["dramen", "romane", "zeitung", "wikipedia"]:
        # per-corpus file names of the downsampled train/val/test splits
        test_file = f"{c}-downsampled-test-flair.txt"
        dev_file = f"{c}-downsampled-val-flair.txt"
        train_file = f"{c}-downsampled-train-flair.txt"
        corpus = ClassificationCorpus(data_folder, test_file=test_file, dev_file=dev_file,
                                      train_file=train_file)
        label_dict = corpus.make_label_dictionary()

        # define the search space (hp is presumably hyperopt's `hp` module — verify import)
        search_space = SearchSpace()
        # embeddings are fixed to German BERT; only one choice is offered
        search_space.add(
            Parameter.EMBEDDINGS,
            hp.choice,
            options=[[BertEmbeddings("bert-base-german-cased")]],
        )
        search_space.add(Parameter.HIDDEN_SIZE, hp.choice, options=[32, 64, 128])
        search_space.add(Parameter.RNN_LAYERS, hp.choice, options=[1, 2])
        search_space.add(Parameter.DROPOUT, hp.uniform, low=0.0, high=0.5)
        search_space.add(Parameter.LEARNING_RATE, hp.choice, options=[0.05, 0.1, 0.15, 0.2])
def trainer(file_path: Path, filenames: Tuple[str, str, str], checkpoint: str,
            stack: str, n_epochs: int) -> None:
    """Train sentiment model using Flair NLP library:
    https://github.com/zalandoresearch/flair/blob/master/resources/docs/TUTORIAL_7_TRAINING_A_MODEL.md

    To help provide added context, we can stack Glove, Bert or ELMo embeddings
    along with Flair embeddings.

    :param file_path: directory holding the corpus files; also used as output dir
    :param filenames: (train, dev, test) file names inside file_path
    :param checkpoint: path to a trainer checkpoint to resume from (falsy trains from scratch)
    :param stack: extra embedding to stack: glove/fasttext/elmo/bert/bert-multi/bpe
    :param n_epochs: maximum number of training epochs
    """
    # pip install flair allennlp
    # (unused imports EvaluationMetric and DocumentPoolEmbeddings removed)
    from flair.datasets import ClassificationCorpus
    from flair.embeddings import FlairEmbeddings, DocumentRNNEmbeddings
    from flair.models import TextClassifier
    from flair.trainers import ModelTrainer
    from flair.visual.training_curves import Plotter

    # choose the optional embedding to stack with the Flair string embeddings
    if stack == "glove":
        from flair.embeddings import WordEmbeddings
        stacked_embedding = WordEmbeddings('glove')
    elif stack == "fasttext":
        from flair.embeddings import WordEmbeddings
        stacked_embedding = WordEmbeddings('it')
    elif stack == "elmo":
        from flair.embeddings import ELMoEmbeddings
        stacked_embedding = ELMoEmbeddings('original')
    elif stack == "bert":
        from flair.embeddings import BertEmbeddings
        stacked_embedding = BertEmbeddings('bert-base-uncased')
    elif stack == "bert-multi":
        from flair.embeddings import BertEmbeddings
        stacked_embedding = BertEmbeddings('bert-base-multilingual-uncased')
    elif stack == 'bpe':
        from flair.embeddings import BytePairEmbeddings
        stacked_embedding = BytePairEmbeddings('it')
    else:
        stacked_embedding = None

    # Define and Load corpus from the provided dataset
    train, dev, test = filenames
    corpus = ClassificationCorpus(
        file_path,
        train_file=train,
        dev_file=dev,
        test_file=test,
    )

    # Create label dictionary from provided labels in data
    label_dict = corpus.make_label_dictionary()

    # Stack Flair string-embeddings with optional embeddings
    word_embeddings = list(
        filter(None, [
            stacked_embedding,
            FlairEmbeddings('it-forward'),
            FlairEmbeddings('it-backward'),
        ]))

    # Initialize document embedding by passing list of word embeddings
    document_embeddings = DocumentRNNEmbeddings(
        word_embeddings,
        hidden_size=256,
        reproject_words=True,
        dropout=0.5,
        reproject_words_dimension=256,
    )

    # Define classifier
    classifier = TextClassifier(document_embeddings,
                                label_dictionary=label_dict,
                                multi_label=True)

    if not checkpoint:
        trainer = ModelTrainer(classifier, corpus)
    else:
        # If checkpoint file is defined, resume training
        trainer = ModelTrainer.load_checkpoint(checkpoint, corpus)

    # Begin training (enable checkpointing to continue training at a later time, if desired)
    trainer.train(
        file_path,
        max_epochs=n_epochs,
        checkpoint=True,
    )

    # Plot curves and store weights and losses.
    # fix: file_path is a pathlib.Path, so the original `file_path + '/loss.tsv'`
    # raised TypeError (Path does not support +); use the / operator instead.
    plotter = Plotter()
    plotter.plot_training_curves(file_path / 'loss.tsv')
    plotter.plot_weights(file_path / 'weights.txt')
# NOTE(review): this fragment starts mid-script — `embedding`, `corpus` and
# `PATH` are defined earlier, outside this chunk; verify against the full file.
word_embeddings = [
    embedding,
    # FlairEmbeddings('news-forward',use_cache=True),
    # FlairEmbeddings('news-backward',use_cache=True),
]

#apply document LSTM to the stacked embeddings
document_embeddings = DocumentRNNEmbeddings(
    word_embeddings,
    # hidden_size=512,
    # reproject_words=True,
    # reproject_words_dimension=256,
)

#build model
classifier = TextClassifier(document_embeddings,
                            label_dictionary=corpus.make_label_dictionary(),
                            multi_label=False)
trainer = ModelTrainer(classifier, corpus)

#specify parameters and train model
trainer.train(PATH/'models/', max_epochs=3,checkpoint=True, learning_rate=1e-1)

# load the best checkpoint written during training (Google Drive path suggests
# this script was run in Colab)
classifier = TextClassifier.load('/content/drive/My Drive/emnlp/models/best-model.pt')

"""## Dev Set Prediction"""

# paths for dev-set prediction; left empty here — fill in before running
dev_folder = ""  # if not adjust these variables accordingly
dev_template_labels_file = ""
task_SLC_output_file = ""