def test_train_resume_text_classification_training(results_base_path, tasks_base_path):
    corpus = flair.datasets.ClassificationCorpus(tasks_base_path / "imdb")
    label_dict = corpus.make_label_dictionary()

    # document_embeddings: DocumentRNNEmbeddings = DocumentRNNEmbeddings(
    #     [flair_embeddings], 128, 1, False
    # )
    # NOTE: document_embeddings is assumed to be defined elsewhere
    # (e.g. a fixture, or by commenting the block above back in)
    model = TextClassifier(document_embeddings, label_dict, multi_label=False)

    trainer = ModelTrainer(model, corpus)
    trainer.train(results_base_path, max_epochs=2, shuffle=False, checkpoint=True)
    del trainer, model

    # resume training from the checkpoint written by the first run
    trainer = ModelTrainer.load_checkpoint(results_base_path / "checkpoint.pt", corpus)
    trainer.train(results_base_path, max_epochs=2, shuffle=False, checkpoint=True)

    # clean up results directory
    shutil.rmtree(results_base_path)
    del trainer

def test_train_resume_classifier(results_base_path, tasks_base_path):
    corpus = flair.datasets.ClassificationCorpus(tasks_base_path / "imdb", label_type="topic")
    label_dict = corpus.make_label_dictionary(label_type="topic")

    model = TextClassifier(
        document_embeddings=document_embeddings,
        label_dictionary=label_dict,
        multi_label=False,
        label_type="topic",
    )

    trainer = ModelTrainer(model, corpus)
    trainer.train(results_base_path, max_epochs=2, shuffle=False, checkpoint=True)
    del trainer, model

    # resume training from the saved checkpoint
    trainer = ModelTrainer.load_checkpoint(results_base_path / "checkpoint.pt", corpus)
    trainer.train(results_base_path, max_epochs=2, shuffle=False, checkpoint=True)

    # clean up results directory
    shutil.rmtree(results_base_path)
    del trainer

def test_train_resume_sequence_tagging_training(results_base_path, tasks_base_path):
    corpus_1 = flair.datasets.ColumnCorpus(
        data_folder=tasks_base_path / "fashion", column_format={0: "text", 2: "ner"}
    )
    corpus_2 = flair.datasets.GERMEVAL(base_path=tasks_base_path)
    corpus = MultiCorpus([corpus_1, corpus_2])
    tag_dictionary = corpus.make_tag_dictionary("ner")

    model: SequenceTagger = SequenceTagger(
        hidden_size=64,
        embeddings=turian_embeddings,
        tag_dictionary=tag_dictionary,
        tag_type="ner",
        use_crf=False,
    )

    trainer = ModelTrainer(model, corpus)
    trainer.train(results_base_path, max_epochs=2, shuffle=False, checkpoint=True)
    del trainer, model

    # resume training from the saved checkpoint
    trainer = ModelTrainer.load_checkpoint(results_base_path / "checkpoint.pt", corpus)
    trainer.train(results_base_path, max_epochs=2, shuffle=False, checkpoint=True)

    # clean up results directory
    shutil.rmtree(results_base_path)
    del trainer

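# The three tests above all exercise the same save/resume cycle: train with
# checkpoint=True, discard the trainer, then rebuild it from checkpoint.pt.
# A minimal standalone sketch of that pattern, assuming the flair 0.4/0.5-era
# API used throughout this file (ModelTrainer.load_checkpoint as a classmethod)
# and a FastText-format classification corpus at a hypothetical data/imdb path:
from pathlib import Path

import flair.datasets
from flair.embeddings import DocumentRNNEmbeddings, FlairEmbeddings
from flair.models import TextClassifier
from flair.trainers import ModelTrainer

results_path = Path("resources/classifiers/resume-demo")  # hypothetical output directory

corpus = flair.datasets.ClassificationCorpus(Path("data/imdb"))
label_dict = corpus.make_label_dictionary()

document_embeddings = DocumentRNNEmbeddings([FlairEmbeddings("news-forward-fast")], hidden_size=128)
model = TextClassifier(document_embeddings, label_dict, multi_label=False)

# first run: checkpoint=True makes the trainer write checkpoint.pt after each epoch
trainer = ModelTrainer(model, corpus)
trainer.train(results_path, max_epochs=2, checkpoint=True)

# later run: restore model, optimizer and epoch count, then continue training;
# raise max_epochs so the resumed run has epochs left to train
trainer = ModelTrainer.load_checkpoint(results_path / "checkpoint.pt", corpus)
trainer.train(results_path, max_epochs=4, checkpoint=True)
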
import os
from typing import List

from flair.data import Sentence
from flair.datasets import ColumnCorpus
from flair.embeddings import (
    CharacterEmbeddings,
    FlairEmbeddings,
    StackedEmbeddings,
    TokenEmbeddings,
    WordEmbeddings,
)
from flair.models import SequenceTagger
from flair.trainers import ModelTrainer


def train(args, tag_type):
    """Train the NER model.

    Parameters
    ----------
    args : argparse.Namespace
        Arguments passed to the parser on the CLI.
    tag_type : str
        Tag type to train on, e.g. 'ner'.
    """
    data_dir = args.input_dir + '/data'
    corpus = ColumnCorpus(data_folder=data_dir,
                          column_format={0: 'text', 1: 'ner'},
                          train_file=args.train_file,
                          test_file=args.test_file,
                          dev_file=args.dev_file)
    # print(corpus.train[0])
    # print(corpus)

    # tag_type = 'ner'
    tag_dictionary = corpus.make_tag_dictionary(tag_type=tag_type)
    # print(tag_dictionary)

    if args.character_embeddings:
        embedding_types: List[TokenEmbeddings] = [
            WordEmbeddings('glove'),
            CharacterEmbeddings(),
            FlairEmbeddings(args.flair_model_name_or_path_forward),
            FlairEmbeddings(args.flair_model_name_or_path_backward),
        ]
    else:
        embedding_types: List[TokenEmbeddings] = [
            WordEmbeddings('glove'),
            FlairEmbeddings(args.flair_model_name_or_path_forward),
            FlairEmbeddings(args.flair_model_name_or_path_backward),
        ]

    embeddings: StackedEmbeddings = StackedEmbeddings(embeddings=embedding_types)

    # initialize sequence tagger
    tagger: SequenceTagger = SequenceTagger(hidden_size=256,
                                            embeddings=embeddings,
                                            tag_dictionary=tag_dictionary,
                                            tag_type=tag_type,
                                            use_crf=True)

    # initialize trainer
    trainer: ModelTrainer = ModelTrainer(tagger, corpus)

    if args.train_or_predict == "continue_train":
        print("continue training")
        checkpoint = '/Users/titashneogi/workspace/NLP/NER/data/flair/cumulative_model/checkpoint.pt'
        trainer = ModelTrainer.load_checkpoint(checkpoint, corpus)

    # start training
    trainer.train(args.model_dir,
                  learning_rate=args.train_learning_rate,
                  mini_batch_size=args.per_gpu_batch_size,
                  max_epochs=args.num_train_epochs,
                  embeddings_storage_mode=args.embeddings_storage_mode)

    model = SequenceTagger.load(args.model_dir + '/final-model.pt')

    if args.predict_file:
        with open(os.path.join(data_dir, args.predict_file), 'r') as f:
            str_file = f.read()
        sentence = Sentence(str_file)
        model.predict(sentence)
        print(sentence.to_tagged_string())

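# A hypothetical invocation of train() above. The argument names mirror the
# attributes the function reads from args; the defaults here are illustrative,
# not taken from the original script.
import argparse

parser = argparse.ArgumentParser()
parser.add_argument('--input_dir', default='.')
parser.add_argument('--model_dir', default='models/ner')
parser.add_argument('--train_file', default='train.txt')
parser.add_argument('--dev_file', default='dev.txt')
parser.add_argument('--test_file', default='test.txt')
parser.add_argument('--predict_file', default=None)
parser.add_argument('--character_embeddings', action='store_true')
parser.add_argument('--flair_model_name_or_path_forward', default='news-forward')
parser.add_argument('--flair_model_name_or_path_backward', default='news-backward')
parser.add_argument('--train_or_predict', default='train')
parser.add_argument('--train_learning_rate', type=float, default=0.1)
parser.add_argument('--per_gpu_batch_size', type=int, default=32)
parser.add_argument('--num_train_epochs', type=int, default=10)
parser.add_argument('--embeddings_storage_mode', default='cpu')
args = parser.parse_args()

train(args, tag_type='ner')
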
from torch.optim import Adam

from flair.data import Corpus
from flair.datasets import ClassificationCorpus
from flair.embeddings import (
    BertEmbeddings,
    CharacterEmbeddings,
    DocumentRNNEmbeddings,
    FlairEmbeddings,
    TransformerDocumentEmbeddings,
    WordEmbeddings,
)
from flair.models import TextClassifier
from flair.trainers import ModelTrainer


def train(review_category,
          params,
          update_model=False,
          learning_rate=0.01,
          embeddings_storage_mode='gpu',
          checkpoint=True,
          batch_growth_annealing=True,
          weight_decay=1e-4,
          shuffle=True,
          train_with_dev=True,
          mini_batch_size=2,
          maxi_batch_size=128,
          anneal_factor=0.5,
          patience=2,
          max_epochs=150):
    review_category = str(review_category)

    print('loading training corpus from %s' % params.data_folder)
    corpus: Corpus = ClassificationCorpus(params.data_folder,
                                          train_file=review_category + '_train.txt',
                                          test_file=review_category + '_test.txt',
                                          dev_file=review_category + '_dev.txt')
    label_dict = corpus.make_label_dictionary()
    print('labels: ', label_dict)

    if eval(params.transformer):
        print('initializing transformer document embeddings using %s ...' % params.transformer_pretrain_lm)
        # initialize transformer document embeddings (many models are available)
        document_embeddings = TransformerDocumentEmbeddings(params.transformer_pretrain_lm, fine_tune=True)
    else:
        print('initializing document embeddings')
        word_embeddings = [
            WordEmbeddings('glove'),
            # comment in this line to use character embeddings
            CharacterEmbeddings(),
            # comment in these lines to use flair embeddings
            FlairEmbeddings('news-forward'),
            FlairEmbeddings('news-backward'),
            BertEmbeddings(),
            # TransformerXLEmbeddings(),
            # RoBERTaEmbeddings(),
            # XLNetEmbeddings()
        ]
        # can choose between many RNN types (GRU by default; change via the rnn_type parameter)
        document_embeddings: DocumentRNNEmbeddings = DocumentRNNEmbeddings(word_embeddings,
                                                                           hidden_size=512,
                                                                           reproject_words=True,
                                                                           reproject_words_dimension=256)

    if not update_model:
        print('building review_analysis classifier ...')
        # create the text classifier
        classifier = TextClassifier(document_embeddings, label_dictionary=label_dict)
        # initialize the text classifier trainer
        print("initializing review_analysis classifier's trainer")
        trainer = ModelTrainer(classifier, corpus, optimizer=Adam)
    else:
        # continue training from an earlier checkpoint
        checkpoint_path = params.checkpoint_dir + '/%s/checkpoint.pt' % review_category
        print('loading checkpoint from %s' % checkpoint_path)
        trainer = ModelTrainer.load_checkpoint(checkpoint_path, corpus)

    # train the model
    print("training the review_category: %s model ..." % review_category)
    try:
        trainer.train(params.checkpoint_dir + '/%s' % review_category,
                      learning_rate=learning_rate,
                      embeddings_storage_mode=embeddings_storage_mode,
                      checkpoint=checkpoint,
                      batch_growth_annealing=batch_growth_annealing,
                      weight_decay=weight_decay,
                      shuffle=shuffle,
                      train_with_dev=train_with_dev,
                      mini_batch_size=mini_batch_size,
                      maxi_batch_size=maxi_batch_size,
                      anneal_factor=anneal_factor,
                      patience=patience,
                      max_epochs=max_epochs)
    except Exception:
        # retry with gradient accumulation if the full batch does not fit in memory
        print('chunking batch ... by %d' % params.mini_batch_chunk_size)
        trainer.train(params.checkpoint_dir + '/%s' % review_category,
                      learning_rate=learning_rate,
                      embeddings_storage_mode=embeddings_storage_mode,
                      checkpoint=checkpoint,
                      batch_growth_annealing=batch_growth_annealing,
                      weight_decay=weight_decay,
                      shuffle=shuffle,
                      train_with_dev=train_with_dev,
                      mini_batch_size=mini_batch_size,
                      maxi_batch_size=maxi_batch_size,
                      anneal_factor=anneal_factor,
                      patience=patience,
                      max_epochs=max_epochs,
                      mini_batch_chunk_size=params.mini_batch_chunk_size)

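# A hypothetical call to train() above; params only needs the attributes the
# function reads, so a SimpleNamespace stands in for the real config object
# (all values illustrative).
from types import SimpleNamespace

params = SimpleNamespace(
    data_folder='data/reviews',
    transformer='False',  # note: the function eval()s this string
    transformer_pretrain_lm='distilbert-base-uncased',
    checkpoint_dir='checkpoints',
    mini_batch_chunk_size=4,
)

train('electronics', params, update_model=False, max_epochs=5)
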
pre_process(down_sample=1.0, equal_sets=True)

corpus = CSVClassificationCorpus(data_folder='data',
                                 column_name_map={0: "label", 1: "text"},
                                 delimiter='\t',
                                 skip_header=True,
                                 test_file='test.tsv',
                                 dev_file='dev.tsv',
                                 train_file='train.tsv')

# resume from a checkpoint if one exists, otherwise build a fresh classifier
if path.isfile('results/checkpoint.pt'):
    print("Starting from checkpoint")
    trainer = ModelTrainer.load_checkpoint('results/checkpoint.pt', corpus)
else:
    word_embeddings = [
        WordEmbeddings('glove'),
        FlairEmbeddings('news-forward'),
        FlairEmbeddings('news-backward')
    ]
    document_embeddings = DocumentRNNEmbeddings(
        embeddings=word_embeddings,
        hidden_size=512,
        reproject_words=True,
        reproject_words_dimension=256,
        rnn_type="LSTM")
    weights = load_weights()
    classifier = TextClassifier(
        document_embeddings=document_embeddings,

from pathlib import Path
from typing import Tuple


def trainer(file_path: Path, filenames: Tuple[str, str, str], checkpoint: str,
            stack: str, n_epochs: int) -> None:
    """Train sentiment model using Flair NLP library:
    https://github.com/zalandoresearch/flair/blob/master/resources/docs/TUTORIAL_7_TRAINING_A_MODEL.md

    To help provide added context, we can stack Glove, Bert or ELMo embeddings
    along with Flair embeddings.
    """
    # pip install flair allennlp
    from flair.datasets import ClassificationCorpus
    from flair.embeddings import FlairEmbeddings, DocumentRNNEmbeddings, DocumentPoolEmbeddings
    from flair.models import TextClassifier
    from flair.trainers import ModelTrainer
    from flair.training_utils import EvaluationMetric
    from flair.visual.training_curves import Plotter

    if stack == "glove":
        from flair.embeddings import WordEmbeddings
        stacked_embedding = WordEmbeddings('glove')
    elif stack == "fasttext":
        from flair.embeddings import WordEmbeddings
        stacked_embedding = WordEmbeddings('it')
    elif stack == "elmo":
        from flair.embeddings import ELMoEmbeddings
        stacked_embedding = ELMoEmbeddings('original')
    elif stack == "bert":
        from flair.embeddings import BertEmbeddings
        stacked_embedding = BertEmbeddings('bert-base-uncased')
    elif stack == "bert-multi":
        from flair.embeddings import BertEmbeddings
        stacked_embedding = BertEmbeddings('bert-base-multilingual-uncased')
    elif stack == 'bpe':
        from flair.embeddings import BytePairEmbeddings
        stacked_embedding = BytePairEmbeddings('it')
    else:
        stacked_embedding = None

    # Define and load corpus from the provided dataset
    train, dev, test = filenames
    corpus = ClassificationCorpus(
        file_path,
        train_file=train,
        dev_file=dev,
        test_file=test,
    )

    # Create label dictionary from provided labels in data
    label_dict = corpus.make_label_dictionary()

    # Stack Flair string-embeddings with optional embeddings
    word_embeddings = list(filter(None, [
        stacked_embedding,
        FlairEmbeddings('it-forward'),
        FlairEmbeddings('it-backward'),
    ]))

    # Initialize document embedding by passing list of word embeddings
    document_embeddings = DocumentRNNEmbeddings(
        word_embeddings,
        hidden_size=256,
        reproject_words=True,
        dropout=0.5,
        reproject_words_dimension=256,
    )
    # document_embeddings = DocumentPoolEmbeddings([
    #     stacked_embedding,
    #     FlairEmbeddings('it-forward'),
    #     FlairEmbeddings('it-backward')], pooling='mean')

    # Define classifier
    classifier = TextClassifier(document_embeddings,
                                label_dictionary=label_dict,
                                multi_label=True)

    if not checkpoint:
        trainer = ModelTrainer(classifier, corpus)
    else:
        # If a checkpoint file is given, resume training from it
        # checkpoint = classifier.load_checkpoint(Path(checkpoint))
        trainer = ModelTrainer.load_checkpoint(checkpoint, corpus)

    # Begin training (enable checkpointing to continue training at a later time, if desired)
    trainer.train(
        file_path,
        max_epochs=n_epochs,
        checkpoint=True,
    )

    # Plot curves and store weights and losses
    plotter = Plotter()
    plotter.plot_training_curves(file_path / 'loss.tsv')
    plotter.plot_weights(file_path / 'weights.txt')

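# A hypothetical call to trainer() above; file_path must hold the FastText-format
# train/dev/test files and also receives the training output (paths illustrative):
from pathlib import Path

trainer(
    file_path=Path("models/sentiment-it"),
    filenames=("train.txt", "dev.txt", "test.txt"),
    checkpoint=None,     # or "models/sentiment-it/checkpoint.pt" to resume training
    stack="bert-multi",  # one of: glove, fasttext, elmo, bert, bert-multi, bpe
    n_epochs=10,
)
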
document_embeddings = DocumentPoolEmbeddings(
    word_embeddings,
    pooling='mean',
    fine_tune_mode='nonlinear',
)

# choose classifier type
tagger: TextClassifier = TextClassifier(
    document_embeddings=document_embeddings,
    label_dictionary=label_dictionary,
    multi_label=False)

# define model: resume from a checkpoint if one exists, otherwise start fresh
checkpoint = os.path.join(tagger_output_directory, 'checkpoint.pt')
if os.path.isfile(checkpoint):
    trainer = ModelTrainer.load_checkpoint(checkpoint, corpus)
else:
    trainer = ModelTrainer(tagger, corpus)

# train model
trainer.train(
    tagger_output_directory,
    checkpoint=True,
    learning_rate=0.7,
    dropout=0.25,
    mini_batch_size=64,  # decrease to prevent graphics card memory errors; increase to improve learning speed
    monitor_test=False,
    monitor_train=False,
    patience=2,  # after how many unsuccessful epochs should we start annealing the learning rate

    column_name_map={0: 'label', 1: 'text'},
    skip_header=False,
    delimiter='\t',
    in_memory=False,
    max_tokens_per_doc=1000 * 10)

import torch
import flair

# run evaluation on the first GPU
flair.device = torch.device('cuda:0')

from flair.models import TextClassifier
from flair.trainers import ModelTrainer
from flair.datasets import DataLoader

# rebuild the trainer from the checkpoint and evaluate its model on the test split
mt = ModelTrainer.load_checkpoint(os.path.join(logs_path, "checkpoint.pt"), corpus)
test_results, test_loss = mt.model.evaluate(
    DataLoader(
        corpus.test,
        batch_size=4,
        num_workers=4,
    ),
    out_path=os.path.join(logs_path, "test.tsv"),
    embedding_storage_mode="none",
)

with open(os.path.join(logs_path, "test.txt"), "w") as f:
    f.write(str(test_results.main_score) + "\n\n")
    f.write(str(test_results.log_header) + "\n")
    f.write(str(test_results.log_line) + "\n\n")