def train(self,
          base_path: str,
          learning_rate: float = 0.1,
          mini_batch_size: int = 32,
          max_epochs: int = 100,
          anneal_factor: float = 0.5,
          patience: int = 4,
          train_with_dev: bool = False,
          embeddings_in_memory: bool = True,
          checkpoint: bool = False,
          save_final_model: bool = True,
          anneal_with_restarts: bool = False):

    evaluation_method = 'F1'
    if self.model.tag_type in ['pos', 'upos']:
        evaluation_method = 'accuracy'
    log.info('Evaluation method: {}'.format(evaluation_method))

    loss_txt = init_output_file(base_path, 'loss.tsv')
    with open(loss_txt, 'a') as f:
        f.write('EPOCH\tTIMESTAMP\tTRAIN_LOSS\t{}\tDEV_LOSS\t{}\tTEST_LOSS\t{}\n'.format(
            Metric.tsv_header('TRAIN'), Metric.tsv_header('DEV'),
            Metric.tsv_header('TEST')))

    weight_extractor = WeightExtractor(base_path)

    optimizer = torch.optim.SGD(self.model.parameters(), lr=learning_rate)

    # annealing scheduler
    anneal_mode = 'min' if train_with_dev else 'max'
    scheduler = ReduceLROnPlateau(optimizer,
                                  factor=anneal_factor,
                                  patience=patience,
                                  mode=anneal_mode,
                                  verbose=True)

    train_data = self.corpus.train

    # if training also uses dev data, include it in the training set
    if train_with_dev:
        train_data.extend(self.corpus.dev)

    # at any point you can hit Ctrl + C to break out of training early
    try:
        previous_learning_rate = learning_rate

        for epoch in range(max_epochs):
            log.info('-' * 100)

            bad_epochs = scheduler.num_bad_epochs
            for group in optimizer.param_groups:
                learning_rate = group['lr']

            # reload last best model if annealing with restarts is enabled
            if learning_rate != previous_learning_rate and anneal_with_restarts and \
                    os.path.exists(base_path + "/best-model.pt"):
                log.info('resetting to best model')
                self.model.load_from_file(base_path + "/best-model.pt")

            previous_learning_rate = learning_rate

            # stop training if learning rate becomes too small
            if learning_rate < 0.001:
                log.info('learning rate too small - quitting training!')
                break

            if not self.test_mode:
                random.shuffle(train_data)

            batches = [train_data[x:x + mini_batch_size]
                       for x in range(0, len(train_data), mini_batch_size)]

            self.model.train()

            current_loss: float = 0
            seen_sentences = 0
            modulo = max(1, int(len(batches) / 10))

            for batch_no, batch in enumerate(batches):
                batch: List[Sentence] = batch

                optimizer.zero_grad()

                # compute the loss, backpropagate the gradients and update
                # the parameters by calling optimizer.step()
                loss = self.model.neg_log_likelihood(batch)

                current_loss += loss.item()
                seen_sentences += len(batch)

                loss.backward()
                torch.nn.utils.clip_grad_norm_(self.model.parameters(), 5.0)
                optimizer.step()

                if not embeddings_in_memory:
                    self.clear_embeddings_in_batch(batch)

                if batch_no % modulo == 0:
                    log.info("epoch {0} - iter {1}/{2} - loss {3:.8f}".format(
                        epoch + 1, batch_no, len(batches),
                        current_loss / seen_sentences))
                    iteration = epoch * len(batches) + batch_no
                    weight_extractor.extract_weights(self.model.state_dict(),
                                                     iteration)

            current_loss /= len(train_data)

            # switch to eval mode
            self.model.eval()

            # if checkpointing is enabled, save the model after every epoch
            if checkpoint:
                self.model.save(base_path + "/checkpoint.pt")

            log.info('-' * 100)

            dev_score = dev_metric = None
            if not train_with_dev:
                dev_score, dev_metric = self.evaluate(
                    self.corpus.dev,
                    base_path,
                    evaluation_method=evaluation_method,
                    embeddings_in_memory=embeddings_in_memory)

            test_score, test_metric = self.evaluate(
                self.corpus.test,
                base_path,
                evaluation_method=evaluation_method,
                embeddings_in_memory=embeddings_in_memory)

            # anneal against train loss if training with dev, otherwise anneal against dev score
            if train_with_dev:
                scheduler.step(current_loss)
            else:
                scheduler.step(dev_score)

            # logging info
            log.info("EPOCH {0}: lr {1:.4f} - bad epochs {2}".format(
                epoch + 1, learning_rate, bad_epochs))
            if not train_with_dev:
                log.info("{0:<4}: f-score {1:.4f} - acc {2:.4f} - tp {3} - fp {4} - fn {5} - tn {6}".format(
                    'DEV', dev_metric.f_score(), dev_metric.accuracy(),
                    dev_metric._tp, dev_metric._fp, dev_metric._fn, dev_metric._tn))
            log.info("{0:<4}: f-score {1:.4f} - acc {2:.4f} - tp {3} - fp {4} - fn {5} - tn {6}".format(
                'TEST', test_metric.f_score(), test_metric.accuracy(),
                test_metric._tp, test_metric._fp, test_metric._fn, test_metric._tn))

            with open(loss_txt, 'a') as f:
                dev_metric_str = dev_metric.to_tsv() if dev_metric is not None else Metric.to_empty_tsv()
                f.write('{}\t{:%H:%M:%S}\t{}\t{}\t{}\t{}\t{}\t{}\n'.format(
                    epoch, datetime.datetime.now(), '_', Metric.to_empty_tsv(),
                    '_', dev_metric_str, '_', test_metric.to_tsv()))

            # if we use dev data, remember the best model based on the dev evaluation score
            if not train_with_dev and dev_score == scheduler.best:
                self.model.save(base_path + "/best-model.pt")

        # if we do not use dev data for model selection, save the final model
        if save_final_model:
            self.model.save(base_path + "/final-model.pt")

    except KeyboardInterrupt:
        log.info('-' * 100)
        log.info('Exiting from training early.')
        log.info('Saving model ...')
        self.model.save(base_path + "/final-model.pt")
        log.info('Done.')
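
# ---------------------------------------------------------------------------
# Usage sketch for the sequence tagger trainer above. The snippet appears to
# come from an early version of the Flair library; the imports, class names
# and corpus loader below (SequenceTaggerTrainer, NLPTaskDataFetcher,
# load_corpus) are assumptions based on that library's API and may differ
# between versions.
# ---------------------------------------------------------------------------
from flair.data_fetcher import NLPTaskDataFetcher, NLPTask
from flair.embeddings import WordEmbeddings
from flair.models import SequenceTagger
from flair.trainers import SequenceTaggerTrainer

# load a POS corpus, so the trainer picks 'accuracy' as its evaluation method
corpus = NLPTaskDataFetcher.load_corpus(NLPTask.UD_ENGLISH)
tag_dictionary = corpus.make_tag_dictionary(tag_type='upos')

tagger = SequenceTagger(hidden_size=256,
                        embeddings=WordEmbeddings('glove'),
                        tag_dictionary=tag_dictionary,
                        tag_type='upos')

# annealing multiplies the learning rate by anneal_factor (0.5) after
# `patience` bad epochs; training stops once the rate falls below 0.001
trainer = SequenceTaggerTrainer(tagger, corpus)
trainer.train('resources/taggers/example-upos',
              learning_rate=0.1,
              mini_batch_size=32,
              max_epochs=100)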
def train(self,
          base_path: str,
          learning_rate: float = 0.1,
          mini_batch_size: int = 32,
          max_epochs: int = 50,
          anneal_factor: float = 0.5,
          patience: int = 5,
          train_with_dev: bool = False,
          embeddings_in_memory: bool = False,
          checkpoint: bool = False,
          save_final_model: bool = True,
          anneal_with_restarts: bool = False,
          eval_on_train: bool = True):
    """
    Trains a text classification model using the training data of the corpus.
    :param base_path: the directory to which any results are written
    :param learning_rate: the learning rate
    :param mini_batch_size: the mini batch size
    :param max_epochs: the maximum number of epochs to train
    :param anneal_factor: learning rate will be decreased by this factor
    :param patience: number of 'bad' epochs before the learning rate is decreased
    :param train_with_dev: whether the dev data set should be added to the training data
    :param embeddings_in_memory: whether embeddings should be kept in memory
    :param checkpoint: whether the model should be saved after every epoch
    :param save_final_model: whether the final model should be saved
    :param anneal_with_restarts: whether the best model should be reloaded whenever the learning rate changes
    :param eval_on_train: whether evaluation metrics should also be computed on the training data
    """
    loss_txt = init_output_file(base_path, 'loss.tsv')
    with open(loss_txt, 'a') as f:
        f.write('EPOCH\tTIMESTAMP\tTRAIN_LOSS\t{}\tDEV_LOSS\t{}\tTEST_LOSS\t{}\n'.format(
            Metric.tsv_header('TRAIN'), Metric.tsv_header('DEV'),
            Metric.tsv_header('TEST')))

    weight_extractor = WeightExtractor(base_path)

    optimizer = torch.optim.SGD(self.model.parameters(), lr=learning_rate)

    # annealing scheduler
    anneal_mode = 'min' if train_with_dev else 'max'
    scheduler = ReduceLROnPlateau(optimizer,
                                  factor=anneal_factor,
                                  patience=patience,
                                  mode=anneal_mode)

    train_data = self.corpus.train

    # if training also uses dev data, include it in the training set
    if train_with_dev:
        train_data.extend(self.corpus.dev)

    # at any point you can hit Ctrl + C to break out of training early
    try:
        previous_learning_rate = learning_rate

        for epoch in range(max_epochs):
            log.info('-' * 100)

            bad_epochs = scheduler.num_bad_epochs
            for group in optimizer.param_groups:
                learning_rate = group['lr']

            # reload last best model if annealing with restarts is enabled
            if learning_rate != previous_learning_rate and anneal_with_restarts and \
                    os.path.exists(base_path + "/best-model.pt"):
                log.info('Resetting to best model ...')
                self.model.load_from_file(base_path + "/best-model.pt")

            previous_learning_rate = learning_rate

            # stop training if learning rate becomes too small
            if learning_rate < 0.001:
                log.info('Learning rate too small - quitting training!')
                break

            if not self.test_mode:
                random.shuffle(train_data)

            self.model.train()

            batches = [train_data[x:x + mini_batch_size]
                       for x in range(0, len(train_data), mini_batch_size)]

            current_loss: float = 0
            seen_sentences = 0
            modulo = max(1, int(len(batches) / 10))

            for batch_no, batch in enumerate(batches):
                scores = self.model.forward(batch)
                loss = self.model.calculate_loss(scores, batch)

                optimizer.zero_grad()
                loss.backward()
                torch.nn.utils.clip_grad_norm_(self.model.parameters(), 5.0)
                optimizer.step()

                seen_sentences += len(batch)
                current_loss += loss.item()

                clear_embeddings(batch, also_clear_word_embeddings=not embeddings_in_memory)

                if batch_no % modulo == 0:
                    log.info("epoch {0} - iter {1}/{2} - loss {3:.8f}".format(
                        epoch + 1, batch_no, len(batches),
                        current_loss / seen_sentences))
                    iteration = epoch * len(batches) + batch_no
                    weight_extractor.extract_weights(self.model.state_dict(),
                                                     iteration)

            current_loss /= len(train_data)

            self.model.eval()

            # if checkpointing is enabled, save the model after every epoch
            if checkpoint:
                self.model.save(base_path + "/checkpoint.pt")

            log.info('-' * 100)
            log.info("EPOCH {0}: lr {1:.4f} - bad epochs {2}".format(
                epoch + 1, learning_rate, bad_epochs))

            dev_metric = train_metric = None
            dev_loss = '_'
            train_loss = current_loss

            if eval_on_train:
                train_metric, train_loss = self._calculate_evaluation_results_for(
                    'TRAIN', self.corpus.train, embeddings_in_memory, mini_batch_size)

            if not train_with_dev:
                dev_metric, dev_loss = self._calculate_evaluation_results_for(
                    'DEV', self.corpus.dev, embeddings_in_memory, mini_batch_size)

            with open(loss_txt, 'a') as f:
                train_metric_str = train_metric.to_tsv() if train_metric is not None else Metric.to_empty_tsv()
                dev_metric_str = dev_metric.to_tsv() if dev_metric is not None else Metric.to_empty_tsv()
                f.write('{}\t{:%H:%M:%S}\t{}\t{}\t{}\t{}\t{}\t{}\n'.format(
                    epoch, datetime.datetime.now(), train_loss, train_metric_str,
                    dev_loss, dev_metric_str, '_', Metric.to_empty_tsv()))

            # anneal against train loss if training with dev, otherwise anneal against dev score
            if train_with_dev:
                scheduler.step(current_loss)
            else:
                scheduler.step(dev_metric.f_score())

            # if we use dev data, remember the best model based on the dev evaluation
            # score (checking train_with_dev first also avoids dereferencing a
            # train_metric that is None when eval_on_train is disabled)
            if not train_with_dev and dev_metric.f_score() == scheduler.best:
                self.model.save(base_path + "/best-model.pt")

        if save_final_model:
            self.model.save(base_path + "/final-model.pt")

        log.info('-' * 100)
        log.info('Testing using best model ...')

        self.model.eval()

        if os.path.exists(base_path + "/best-model.pt"):
            self.model = TextClassifier.load_from_file(base_path + "/best-model.pt")

        test_metric, test_loss = self.evaluate(self.corpus.test,
                                               mini_batch_size=mini_batch_size,
                                               eval_class_metrics=True,
                                               embeddings_in_memory=embeddings_in_memory,
                                               metric_name='TEST')

        test_metric.print()
        self.model.train()

        log.info('-' * 100)

    except KeyboardInterrupt:
        log.info('-' * 100)
        log.info('Exiting from training early.')
        log.info('Saving model ...')
        with open(base_path + "/final-model.pt", 'wb') as model_save_file:
            # the file is closed automatically when the with-block exits
            torch.save(self.model, model_save_file, pickle_protocol=4)
        log.info('Done.')
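
# ---------------------------------------------------------------------------
# Usage sketch for the text classification variant above. As before, the
# class names and constructor signatures (TextClassifier,
# TextClassifierTrainer, DocumentLSTMEmbeddings) are assumptions based on
# the Flair API of that era, not confirmed by this snippet.
# ---------------------------------------------------------------------------
from flair.data_fetcher import NLPTaskDataFetcher, NLPTask
from flair.embeddings import WordEmbeddings, DocumentLSTMEmbeddings
from flair.models import TextClassifier
from flair.trainers import TextClassifierTrainer

corpus = NLPTaskDataFetcher.load_corpus(NLPTask.AG_NEWS)
label_dictionary = corpus.make_label_dictionary()

document_embeddings = DocumentLSTMEmbeddings([WordEmbeddings('glove')],
                                             hidden_size=128)
classifier = TextClassifier(document_embeddings,
                            label_dictionary=label_dictionary,
                            multi_label=False)

# eval_on_train=False skips the extra evaluation pass over the training data
# that this trainer otherwise runs after every epoch
trainer = TextClassifierTrainer(classifier, corpus, label_dictionary)
trainer.train('resources/classifiers/example-ag-news',
              max_epochs=50,
              eval_on_train=False)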
def train(self,
          base_path: Union[Path, str],
          evaluation_metric: EvaluationMetric = EvaluationMetric.MICRO_F1_SCORE,
          learning_rate: float = 0.1,
          mini_batch_size: int = 32,
          eval_mini_batch_size: int = None,
          max_epochs: int = 100,
          anneal_factor: float = 0.5,
          patience: int = 3,
          anneal_against_train_loss: bool = True,
          train_with_dev: bool = False,
          monitor_train: bool = False,
          embeddings_in_memory: bool = True,
          checkpoint: bool = False,
          save_final_model: bool = True,
          anneal_with_restarts: bool = False,
          test_mode: bool = False,
          param_selection_mode: bool = False,
          **kwargs) -> dict:

    if eval_mini_batch_size is None:
        eval_mini_batch_size = mini_batch_size

    # cast string to Path
    if type(base_path) is str:
        base_path = Path(base_path)

    add_file_handler(log, base_path / 'training.log')

    log_line(log)
    log.info(f'Evaluation method: {evaluation_metric.name}')

    if not param_selection_mode:
        loss_txt = init_output_file(base_path, 'loss.tsv')
        with open(loss_txt, 'a') as f:
            f.write(f'EPOCH\tTIMESTAMP\tBAD_EPOCHS\tLEARNING_RATE\tTRAIN_LOSS\t{Metric.tsv_header("TRAIN")}'
                    f'\tDEV_LOSS\t{Metric.tsv_header("DEV")}\tTEST_LOSS\t{Metric.tsv_header("TEST")}\n')

        weight_extractor = WeightExtractor(base_path)

    optimizer = self.optimizer(self.model.parameters(), lr=learning_rate, **kwargs)
    if self.optimizer_state is not None:
        optimizer.load_state_dict(self.optimizer_state)

    # annealing scheduler
    anneal_mode = 'min' if anneal_against_train_loss else 'max'
    if isinstance(optimizer, (AdamW, SGDW)):
        scheduler = ReduceLRWDOnPlateau(optimizer,
                                        factor=anneal_factor,
                                        patience=patience,
                                        mode=anneal_mode,
                                        verbose=True)
    else:
        scheduler = ReduceLROnPlateau(optimizer,
                                      factor=anneal_factor,
                                      patience=patience,
                                      mode=anneal_mode,
                                      verbose=True)
    if self.scheduler_state is not None:
        scheduler.load_state_dict(self.scheduler_state)

    train_data = self.corpus.train

    # if training also uses dev data, include it in the training set
    if train_with_dev:
        train_data.extend(self.corpus.dev)

    dev_score_history = []
    dev_loss_history = []
    train_loss_history = []

    # at any point you can hit Ctrl + C to break out of training early
    try:
        previous_learning_rate = learning_rate

        for epoch in range(self.epoch, max_epochs + self.epoch):
            log_line(log)

            # not every scheduler tracks bad epochs
            try:
                bad_epochs = scheduler.num_bad_epochs
            except AttributeError:
                bad_epochs = 0

            for group in optimizer.param_groups:
                learning_rate = group['lr']

            # reload last best model if annealing with restarts is enabled
            if learning_rate != previous_learning_rate and anneal_with_restarts and \
                    (base_path / 'best-model.pt').exists():
                log.info('resetting to best model')
                self.model.load_from_file(base_path / 'best-model.pt')

            previous_learning_rate = learning_rate

            # stop training if learning rate becomes too small
            if learning_rate < 0.0001:
                log_line(log)
                log.info('learning rate too small - quitting training!')
                log_line(log)
                break

            if not test_mode:
                random.shuffle(train_data)

            batches = [train_data[x:x + mini_batch_size]
                       for x in range(0, len(train_data), mini_batch_size)]

            self.model.train()

            train_loss: float = 0
            seen_sentences = 0
            modulo = max(1, int(len(batches) / 10))

            for batch_no, batch in enumerate(batches):
                loss = self.model.forward_loss(batch)

                optimizer.zero_grad()
                loss.backward()
                torch.nn.utils.clip_grad_norm_(self.model.parameters(), 5.0)
                optimizer.step()

                seen_sentences += len(batch)
                train_loss += loss.item()

                clear_embeddings(batch, also_clear_word_embeddings=not embeddings_in_memory)

                if batch_no % modulo == 0:
                    log.info(f'epoch {epoch + 1} - iter {batch_no}/{len(batches)} - loss '
                             f'{train_loss / seen_sentences:.8f}')
                    iteration = epoch * len(batches) + batch_no
                    if not param_selection_mode:
                        weight_extractor.extract_weights(self.model.state_dict(), iteration)

            train_loss /= len(train_data)

            self.model.eval()

            log_line(log)
            log.info(f'EPOCH {epoch + 1} done: loss {train_loss:.4f} - lr {learning_rate:.4f} '
                     f'- bad epochs {bad_epochs}')

            dev_metric = None
            dev_loss = '_'

            train_metric = None
            if monitor_train:
                train_metric, train_loss = self._calculate_evaluation_results_for(
                    'TRAIN', self.corpus.train, evaluation_metric,
                    embeddings_in_memory, eval_mini_batch_size)

            if not train_with_dev:
                dev_metric, dev_loss = self._calculate_evaluation_results_for(
                    'DEV', self.corpus.dev, evaluation_metric,
                    embeddings_in_memory, eval_mini_batch_size)

            if not param_selection_mode:
                test_metric, test_loss = self._calculate_evaluation_results_for(
                    'TEST', self.corpus.test, evaluation_metric,
                    embeddings_in_memory, eval_mini_batch_size,
                    base_path / 'test.tsv')

                with open(loss_txt, 'a') as f:
                    train_metric_str = train_metric.to_tsv() if train_metric is not None else Metric.to_empty_tsv()
                    dev_metric_str = dev_metric.to_tsv() if dev_metric is not None else Metric.to_empty_tsv()
                    test_metric_str = test_metric.to_tsv() if test_metric is not None else Metric.to_empty_tsv()
                    f.write(f'{epoch}\t{datetime.datetime.now():%H:%M:%S}\t{bad_epochs}\t{learning_rate:.4f}\t'
                            f'{train_loss}\t{train_metric_str}\t{dev_loss}\t{dev_metric_str}\t_\t{test_metric_str}\n')

            # calculate scores using dev data if available
            dev_score = 0.
            if not train_with_dev:
                if evaluation_metric == EvaluationMetric.MACRO_ACCURACY:
                    dev_score = dev_metric.macro_avg_accuracy()
                elif evaluation_metric == EvaluationMetric.MICRO_ACCURACY:
                    dev_score = dev_metric.micro_avg_accuracy()
                elif evaluation_metric == EvaluationMetric.MACRO_F1_SCORE:
                    dev_score = dev_metric.macro_avg_f_score()
                else:
                    dev_score = dev_metric.micro_avg_f_score()

                # append dev score to score history
                dev_score_history.append(dev_score)
                dev_loss_history.append(dev_loss.item())

            # anneal against train loss if requested, otherwise anneal against dev score
            current_score = train_loss if anneal_against_train_loss else dev_score
            scheduler.step(current_score)

            train_loss_history.append(train_loss)

            # if checkpointing is enabled, save the model after every epoch
            if checkpoint and not param_selection_mode:
                self.model.save_checkpoint(base_path / 'checkpoint.pt',
                                           optimizer.state_dict(),
                                           scheduler.state_dict(),
                                           epoch + 1,
                                           train_loss)

            # if we use dev data, remember the best model based on the dev evaluation score
            if not train_with_dev and not param_selection_mode and current_score == scheduler.best:
                self.model.save(base_path / 'best-model.pt')

        # if we do not use dev data for model selection, save the final model
        if save_final_model and not param_selection_mode:
            self.model.save(base_path / 'final-model.pt')

    except KeyboardInterrupt:
        log_line(log)
        log.info('Exiting from training early.')
        if not param_selection_mode:
            log.info('Saving model ...')
            self.model.save(base_path / 'final-model.pt')
            log.info('Done.')

    # test best model on test data
    final_score = self.final_test(base_path, embeddings_in_memory,
                                  evaluation_metric, eval_mini_batch_size)

    return {
        'test_score': final_score,
        'dev_score_history': dev_score_history,
        'train_loss_history': train_loss_history,
        'dev_loss_history': dev_loss_history,
    }
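
# ---------------------------------------------------------------------------
# Usage sketch for the newer trainer above, which returns a result dict and
# supports checkpointing. The ModelTrainer and EvaluationMetric import paths
# are assumptions based on Flair 0.4-style APIs; `tagger` and `corpus` are
# reused from the first sketch.
# ---------------------------------------------------------------------------
from pathlib import Path

from flair.trainers import ModelTrainer
from flair.training_utils import EvaluationMetric

trainer = ModelTrainer(tagger, corpus)

result = trainer.train(Path('resources/taggers/example'),
                       evaluation_metric=EvaluationMetric.MICRO_F1_SCORE,
                       learning_rate=0.1,
                       mini_batch_size=32,
                       max_epochs=100,
                       checkpoint=True)  # writes checkpoint.pt after every epoch

# the returned dict bundles the final test score and the per-epoch curves
print(result['test_score'])
print(result['dev_score_history'])
print(result['train_loss_history'])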
from __future__ import absolute_import
from pathlib import Path