Example #1
    def train(self,
              base_path: str,
              learning_rate: float = 0.1,
              mini_batch_size: int = 32,
              max_epochs: int = 50,
              anneal_factor: float = 0.5,
              patience: int = 5,
              train_with_dev: bool = False,
              embeddings_in_memory: bool = False,
              checkpoint: bool = False,
              save_final_model: bool = True,
              anneal_with_restarts: bool = False,
              eval_on_train: bool = True):
        """
        Trains a text classification model using the training data of the corpus.
        :param base_path: the directory to which any results are written
        :param learning_rate: the initial learning rate
        :param mini_batch_size: the mini-batch size
        :param max_epochs: the maximum number of epochs to train
        :param anneal_factor: the factor by which the learning rate is decreased on a plateau
        :param patience: number of 'bad' epochs before the learning rate is decreased
        :param train_with_dev: whether the dev data set should also be used for training
        :param embeddings_in_memory: whether embeddings should be kept in memory
        :param checkpoint: whether the model should be saved after every epoch
        :param save_final_model: whether the final model should be saved
        :param anneal_with_restarts: whether the best model should be reloaded once the learning rate changes
        :param eval_on_train: whether evaluation metrics should also be computed on the training data set
        """

        loss_txt = init_output_file(base_path, 'loss.tsv')
        with open(loss_txt, 'a') as f:
            f.write(
                'EPOCH\tTIMESTAMP\tTRAIN_LOSS\t{}\tDEV_LOSS\t{}\tTEST_LOSS\t{}\n'
                .format(Metric.tsv_header('TRAIN'), Metric.tsv_header('DEV'),
                        Metric.tsv_header('TEST')))

        weight_extractor = WeightExtractor(base_path)

        optimizer = torch.optim.SGD(self.model.parameters(), lr=learning_rate)

        anneal_mode = 'min' if train_with_dev else 'max'
        scheduler: ReduceLROnPlateau = ReduceLROnPlateau(optimizer,
                                                         factor=anneal_factor,
                                                         patience=patience,
                                                         mode=anneal_mode)

        train_data = self.corpus.train

        # if training also uses dev data, include in training set
        if train_with_dev:
            train_data.extend(self.corpus.dev)

        # At any point you can hit Ctrl + C to break out of training early.
        try:
            previous_learning_rate = learning_rate

            for epoch in range(max_epochs):
                log.info('-' * 100)

                bad_epochs = scheduler.num_bad_epochs
                for group in optimizer.param_groups:
                    learning_rate = group['lr']

                # reload last best model if annealing with restarts is enabled
                if learning_rate != previous_learning_rate and anneal_with_restarts and \
                        os.path.exists(base_path + "/best-model.pt"):
                    log.info('Resetting to best model ...')
                    self.model.load_from_file(base_path + "/best-model.pt")

                previous_learning_rate = learning_rate

                # stop training if learning rate becomes too small
                if learning_rate < 0.001:
                    log.info('Learning rate too small - quitting training!')
                    break

                if not self.test_mode:
                    random.shuffle(train_data)

                self.model.train()

                batches = [
                    train_data[x:x + mini_batch_size]
                    for x in range(0, len(train_data), mini_batch_size)
                ]

                current_loss: float = 0
                seen_sentences = 0
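                # log progress roughly ten times per epoch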
                modulo = max(1, int(len(batches) / 10))

                for batch_no, batch in enumerate(batches):
                    scores = self.model.forward(batch)
                    loss = self.model.calculate_loss(scores, batch)

                    optimizer.zero_grad()
                    loss.backward()
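                    # clip gradients to a max norm of 5.0 to keep updates stable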
                    torch.nn.utils.clip_grad_norm_(self.model.parameters(),
                                                   5.0)
                    optimizer.step()

                    seen_sentences += len(batch)
                    current_loss += loss.item()

                    clear_embeddings(
                        batch,
                        also_clear_word_embeddings=not embeddings_in_memory)

                    if batch_no % modulo == 0:
                        log.info(
                            "epoch {0} - iter {1}/{2} - loss {3:.8f}".format(
                                epoch + 1, batch_no, len(batches),
                                current_loss / seen_sentences))
                        iteration = epoch * len(batches) + batch_no
                        weight_extractor.extract_weights(
                            self.model.state_dict(), iteration)

                current_loss /= len(train_data)

                self.model.eval()

                # if checkpointing is enabled, save the model after each epoch
                if checkpoint:
                    self.model.save(base_path + "/checkpoint.pt")

                log.info('-' * 100)
                log.info("EPOCH {0}: lr {1:.4f} - bad epochs {2}".format(
                    epoch + 1, learning_rate, bad_epochs))

                dev_metric = train_metric = None
                dev_loss = '_'
                train_loss = current_loss

                if eval_on_train:
                    train_metric, train_loss = self._calculate_evaluation_results_for(
                        'TRAIN', self.corpus.train, embeddings_in_memory,
                        mini_batch_size)

                if not train_with_dev:
                    dev_metric, dev_loss = self._calculate_evaluation_results_for(
                        'DEV', self.corpus.dev, embeddings_in_memory,
                        mini_batch_size)

                with open(loss_txt, 'a') as f:
                    train_metric_str = train_metric.to_tsv(
                    ) if train_metric is not None else Metric.to_empty_tsv()
                    dev_metric_str = dev_metric.to_tsv(
                    ) if dev_metric is not None else Metric.to_empty_tsv()
                    f.write('{}\t{:%H:%M:%S}\t{}\t{}\t{}\t{}\t{}\t{}\n'.format(
                        epoch, datetime.datetime.now(), train_loss,
                        train_metric_str, dev_loss, dev_metric_str, '_',
                        Metric.to_empty_tsv()))

                # anneal against train loss if training with dev, otherwise anneal against dev score
                if train_with_dev:
                    scheduler.step(current_loss)
                else:
                    scheduler.step(dev_metric.f_score())

                # if we use dev data, remember best model based on dev evaluation score
                if not train_with_dev:
                    current_score = dev_metric.f_score()
                    if current_score == scheduler.best:
                        self.model.save(base_path + "/best-model.pt")

            if save_final_model:
                self.model.save(base_path + "/final-model.pt")

            log.info('-' * 100)
            log.info('Testing using best model ...')

            self.model.eval()

            if os.path.exists(base_path + "/best-model.pt"):
                self.model = TextClassifier.load_from_file(base_path +
                                                           "/best-model.pt")

            test_metric, test_loss = self.evaluate(
                self.corpus.test,
                mini_batch_size=mini_batch_size,
                eval_class_metrics=True,
                embeddings_in_memory=embeddings_in_memory,
                metric_name='TEST')

            test_metric.print()
            self.model.train()

            log.info('-' * 100)

        except KeyboardInterrupt:
            log.info('-' * 100)
            log.info('Exiting from training early.')
            log.info('Saving model ...')
            with open(base_path + "/final-model.pt", 'wb') as model_save_file:
                torch.save(self.model, model_save_file, pickle_protocol=4)
            log.info('Done.')
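
A minimal usage sketch for the train() method in Example #1. The keyword arguments come directly from the signature above; how the trainer object itself is constructed depends on the surrounding library, so the constructor shown in the comment is an assumption, not something taken from the snippet.

# Hedged usage sketch: `trainer` is assumed to be an instance of the class that
# defines the train() method above, wrapping a model and a corpus.
# trainer = TextClassifierTrainer(model, corpus)   # assumed constructor, not shown above

trainer.train('resources/classifier',   # base_path: loss.tsv, checkpoint.pt and model files go here
              learning_rate=0.1,
              mini_batch_size=32,
              max_epochs=50,
              anneal_factor=0.5,         # halve the learning rate on a plateau
              patience=5,                # after 5 'bad' epochs without improvement
              train_with_dev=False,      # keep the dev set for model selection
              embeddings_in_memory=False,
              checkpoint=True,           # write checkpoint.pt after every epoch
              save_final_model=True,
              anneal_with_restarts=False,
              eval_on_train=True)
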
Example #2
    def train(
        self,
        base_path: str,
        learning_rate: float = 0.1,
        mini_batch_size: int = 32,
        max_epochs: int = 100,
        anneal_factor: float = 0.5,
        patience: int = 4,
        train_with_dev: bool = False,
        embeddings_in_memory: bool = True,
        checkpoint: bool = False,
        save_final_model: bool = True,
        anneal_with_restarts: bool = False,
    ):

        evaluation_method = 'F1'
        if self.model.tag_type in ['pos', 'upos']:
            evaluation_method = 'accuracy'
        log.info('Evaluation method: {}'.format(evaluation_method))

        loss_txt = init_output_file(base_path, 'loss.tsv')
        with open(loss_txt, 'a') as f:
            f.write(
                'EPOCH\tTIMESTAMP\tTRAIN_LOSS\t{}\tDEV_LOSS\t{}\tTEST_LOSS\t{}\n'
                .format(Metric.tsv_header('TRAIN'), Metric.tsv_header('DEV'),
                        Metric.tsv_header('TEST')))

        weight_extractor = WeightExtractor(base_path)

        optimizer = torch.optim.SGD(self.model.parameters(), lr=learning_rate)

        # annealing scheduler
        anneal_mode = 'min' if train_with_dev else 'max'
        scheduler = ReduceLROnPlateau(optimizer,
                                      factor=anneal_factor,
                                      patience=patience,
                                      mode=anneal_mode,
                                      verbose=True)

        train_data = self.corpus.train

        # if training also uses dev data, include in training set
        if train_with_dev:
            train_data.extend(self.corpus.dev)

        # At any point you can hit Ctrl + C to break out of training early.
        try:

            previous_learning_rate = learning_rate

            for epoch in range(0, max_epochs):
                log.info('-' * 100)

                bad_epochs = scheduler.num_bad_epochs
                for group in optimizer.param_groups:
                    learning_rate = group['lr']

                # reload last best model if annealing with restarts is enabled
                if learning_rate != previous_learning_rate and anneal_with_restarts and \
                        os.path.exists(base_path + "/best-model.pt"):
                    log.info('resetting to best model')
                    self.model.load_from_file(base_path + "/best-model.pt")

                previous_learning_rate = learning_rate

                # stop training if learning rate becomes too small
                if learning_rate < 0.001:
                    log.info('learning rate too small - quitting training!')
                    break

                if not self.test_mode:
                    random.shuffle(train_data)

                batches = [
                    train_data[x:x + mini_batch_size]
                    for x in range(0, len(train_data), mini_batch_size)
                ]

                self.model.train()

                current_loss: float = 0
                seen_sentences = 0
                modulo = max(1, int(len(batches) / 10))

                for batch_no, batch in enumerate(batches):
                    batch: List[Sentence] = batch

                    optimizer.zero_grad()

                    # compute the loss, backpropagate the gradients, and update the parameters with optimizer.step()
                    loss = self.model.neg_log_likelihood(batch)

                    current_loss += loss.item()
                    seen_sentences += len(batch)

                    loss.backward()
                    torch.nn.utils.clip_grad_norm_(self.model.parameters(),
                                                   5.0)
                    optimizer.step()

                    if not embeddings_in_memory:
                        self.clear_embeddings_in_batch(batch)

                    if batch_no % modulo == 0:
                        log.info(
                            "epoch {0} - iter {1}/{2} - loss {3:.8f}".format(
                                epoch + 1, batch_no, len(batches),
                                current_loss / seen_sentences))
                        iteration = epoch * len(batches) + batch_no
                        weight_extractor.extract_weights(
                            self.model.state_dict(), iteration)

                current_loss /= len(train_data)

                # switch to eval mode
                self.model.eval()

                # if checkpointing is enabled, save the model after each epoch
                if checkpoint:
                    self.model.save(base_path + "/checkpoint.pt")

                log.info('-' * 100)

                dev_score = dev_metric = None
                if not train_with_dev:
                    dev_score, dev_metric = self.evaluate(
                        self.corpus.dev,
                        base_path,
                        evaluation_method=evaluation_method,
                        embeddings_in_memory=embeddings_in_memory)

                test_score, test_metric = self.evaluate(
                    self.corpus.test,
                    base_path,
                    evaluation_method=evaluation_method,
                    embeddings_in_memory=embeddings_in_memory)

                # anneal against train loss if training with dev, otherwise anneal against dev score
                if train_with_dev:
                    scheduler.step(current_loss)
                else:
                    scheduler.step(dev_score)

                # logging info
                log.info("EPOCH {0}: lr {1:.4f} - bad epochs {2}".format(
                    epoch + 1, learning_rate, bad_epochs))
                if not train_with_dev:
                    log.info(
                        "{0:<4}: f-score {1:.4f} - acc {2:.4f} - tp {3} - fp {4} - fn {5} - tn {6}"
                        .format('DEV', dev_metric.f_score(),
                                dev_metric.accuracy(), dev_metric._tp,
                                dev_metric._fp, dev_metric._fn,
                                dev_metric._tn))
                log.info(
                    "{0:<4}: f-score {1:.4f} - acc {2:.4f} - tp {3} - fp {4} - fn {5} - tn {6}"
                    .format('TEST', test_metric.f_score(),
                            test_metric.accuracy(), test_metric._tp,
                            test_metric._fp, test_metric._fn, test_metric._tn))

                with open(loss_txt, 'a') as f:
                    dev_metric_str = dev_metric.to_tsv(
                    ) if dev_metric is not None else Metric.to_empty_tsv()
                    f.write('{}\t{:%H:%M:%S}\t{}\t{}\t{}\t{}\t{}\t{}\n'.format(
                        epoch, datetime.datetime.now(), '_',
                        Metric.to_empty_tsv(), '_', dev_metric_str, '_',
                        test_metric.to_tsv()))

                # if we use dev data, remember best model based on dev evaluation score
                if not train_with_dev and dev_score == scheduler.best:
                    self.model.save(base_path + "/best-model.pt")

            # save the final model if requested
            if save_final_model:
                self.model.save(base_path + "/final-model.pt")

        except KeyboardInterrupt:
            log.info('-' * 100)
            log.info('Exiting from training early.')
            log.info('Saving model ...')
            self.model.save(base_path + "/final-model.pt")
            log.info('Done.')
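
Both examples drive learning-rate annealing the same way: plain SGD wrapped in ReduceLROnPlateau, stepped on the dev score (mode='max') or on the training loss (mode='min' when the dev set is merged into training), with training stopped once the learning rate falls below 0.001. The self-contained sketch below reproduces just that pattern; the linear model and the constant loss/score values are placeholders, not anything computed by the code above.

import torch
from torch.optim.lr_scheduler import ReduceLROnPlateau

model = torch.nn.Linear(10, 2)                    # placeholder for the real model
optimizer = torch.optim.SGD(model.parameters(), lr=0.1)

train_with_dev = False
anneal_mode = 'min' if train_with_dev else 'max'  # minimize train loss vs. maximize dev score
scheduler = ReduceLROnPlateau(optimizer, factor=0.5, patience=4, mode=anneal_mode)

for epoch in range(100):
    learning_rate = optimizer.param_groups[0]['lr']
    if learning_rate < 0.001:                     # same stopping rule as in both examples
        break

    train_loss = 1.0                              # placeholder: compute the real epoch loss here
    dev_score = 0.7                               # placeholder: compute the real dev F1 here

    # anneal against train loss if training with dev, otherwise against dev score
    if train_with_dev:
        scheduler.step(train_loss)
    else:
        scheduler.step(dev_score)

    # scheduler.best tracks the best value seen so far; the examples above save
    # "best-model.pt" whenever the current dev score equals it
    if not train_with_dev and dev_score == scheduler.best:
        pass  # e.g. model.save("best-model.pt") in the examples above
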
Example #3
from __future__ import absolute_import