示例#1
0
文件: train.py 项目: ttslr/BVAE-TTS
def main(args):
    """Run the training loop until ``hp.train_steps`` optimizer steps are done.

    Builds the data loaders, the model, an Adamax optimizer and a summary
    writer, wraps model/optimizer with ``amp.initialize`` (opt level "O1"),
    then repeatedly sweeps ``train_loader``. Every step logs the four loss
    terms; validation and checkpointing run every
    ``hp.iters_per_validation`` / ``hp.iters_per_checkpoint`` steps.

    :param args: Parsed command-line arguments; only ``args.logdir`` is read.
    """
    train_loader, val_loader, _ = prepare_dataloaders(hp)
    model = Model(hp).cuda()
    optimizer = torch.optim.Adamax(model.parameters(), lr=hp.lr)
    writer = get_writer(hp.output_directory, args.logdir)
    model, optimizer = amp.initialize(model, optimizer, opt_level="O1")

    iteration = 0
    model.train()
    print(f"Training Start!!! ({args.logdir})")
    while iteration < (hp.train_steps):
        for batch in train_loader:
            on_gpu = [tensor.cuda() for tensor in batch]
            text_padded, text_lengths, mel_padded, mel_lengths = on_gpu
            losses = model(text_padded, mel_padded, text_lengths, mel_lengths)
            recon_loss, kl_loss, duration_loss, align_loss = losses

            # The KL weight ramps linearly from 0 to 1 over hp.kl_warmup_steps.
            alpha = min(1, iteration/hp.kl_warmup_steps)
            total_loss = recon_loss + alpha*kl_loss + duration_loss + align_loss
            with amp.scale_loss(total_loss, optimizer) as scaled_loss:
                scaled_loss.backward()

            iteration += 1
            lr_scheduling(optimizer, iteration)
            nn.utils.clip_grad_norm_(model.parameters(), hp.grad_clip_thresh)
            optimizer.step()
            model.zero_grad()

            logged = (
                ('train_recon_loss', recon_loss),
                ('train_kl_loss', kl_loss),
                ('train_duration_loss', duration_loss),
                ('train_align_loss', align_loss),
            )
            for tag, value in logged:
                writer.add_scalar(tag, value, global_step=iteration)

            if iteration % (hp.iters_per_validation) == 0:
                validate(model, val_loader, iteration, writer)

            if iteration % (hp.iters_per_checkpoint) == 0:
                save_checkpoint(model, optimizer, hp.lr, iteration, filepath=f'{hp.output_directory}/{args.logdir}')

            # Leave the sweep mid-epoch once the step budget is used up; the
            # outer ``while`` condition then ends training.
            if iteration == (hp.train_steps):
                break
示例#2
0
# Module-level setup for a small training script: data, model, optimizer, loss.
# `device` is also read as a global by `training_epoch` below.
device = "cpu"
# Placeholder paths; both are passed to BioactivityData as-is (None here).
raw_data_path = None
processed_data_path = None
n_epochs = 10
feature_selector = ExampleFeatureSelector()
train_data = BioactivityData(raw_data_path, processed_data_path,
                             feature_selector)
# Batches of 16 samples, served in dataset order (no shuffling).
train_loader = DataLoader(train_data, batch_size=16, shuffle=False)

# No validation data is configured in this script.
valid_data = None
valid_loader = None

# Model input width is taken from the first dimension of the first sample's
# first element (presumably the feature vector — TODO confirm against
# BioactivityData.__getitem__).
input_size = train_data[0][0].shape[0]
model = Model(input_size, dim=200, n_res_blocks=2).to(device)

# Adam with default hyper-parameters and binary cross-entropy loss.
optimizer = torch.optim.Adam(model.parameters())
loss_fn = torch.nn.BCELoss()


def training_epoch(loader, model, opt, loss_fn):
    """Run one optimisation pass over every batch produced by ``loader``.

    For each ``(x, y)`` batch the inputs are moved to the module-level
    ``device``, the model prediction is scored with ``loss_fn`` and one
    optimizer step is taken.

    :param loader: Iterable yielding ``(x, y)`` batches.
    :param model: Callable mapping ``x`` to predictions.
    :param opt: Optimizer exposing ``zero_grad()`` and ``step()``.
    :param loss_fn: Callable ``loss_fn(pred, y)`` returning a value with
        ``backward()``.
    """
    # Iterate the loader directly: the previous ``for iter, (x, y) in loader``
    # tried to unpack each (x, y) batch as (index, (x, y)) without
    # ``enumerate`` and also shadowed the ``iter`` builtin.
    for x, y in loader:
        x, y = x.to(device), y.to(device)
        pred = model(x)

        loss = loss_fn(pred, y)

        opt.zero_grad()
        loss.backward()
        opt.step()

示例#3
0
def main(args):
    """Train one stage of the model, with gradient accumulation.

    For ``args.stage != 0`` the weights are initialised from the final
    checkpoint of the previous stage; stage 0 starts from a fresh model.
    ``iteration`` counts micro-batches; the optimizer steps once every
    ``hparams.accumulation`` micro-batches, and logging, validation and
    checkpointing are expressed in those optimizer steps.

    :param args: Parsed command-line arguments; only ``args.stage`` is read.
    """
    train_loader, val_loader, _ = prepare_dataloaders(hparams, stage=args.stage)

    if args.stage == 0:
        model = nn.DataParallel(Model(hparams)).cuda()
    else:
        checkpoint_path = f"training_log/aligntts/stage{args.stage-1}/checkpoint_{hparams.train_steps[args.stage-1]}"
        saved_weights = torch.load(checkpoint_path)['state_dict']
        # Drop the first 7 characters of every key (presumably the "module."
        # prefix added by nn.DataParallel — TODO confirm).
        state_dict = {name[7:]: weight for name, weight in saved_weights.items()}

        bare_model = Model(hparams).cuda()
        bare_model.load_state_dict(state_dict)
        model = nn.DataParallel(bare_model).cuda()

    criterion = MDNLoss()
    writer = get_writer(hparams.output_directory, f'{hparams.log_directory}/stage{args.stage}')
    optimizer = torch.optim.Adam(model.parameters(),
                                 lr=hparams.lr,
                                 betas=(0.9, 0.98),
                                 eps=1e-09)
    iteration = 0
    loss = 0
    model.train()

    def reached_last_iteration():
        # True once the configured number of micro-batches has been consumed.
        return iteration == (hparams.train_steps[args.stage]*hparams.accumulation)

    print(f'Stage{args.stage} Start!!! ({str(datetime.now())})')
    while True:
        for batch in train_loader:
            tensors = [reorder_batch(x, hparams.n_gpus).cuda() for x in batch]
            if args.stage == 0:
                # Stage-0 batches carry no alignment tensor.
                text_padded, mel_padded, text_lengths, mel_lengths = tensors
                align_padded = None
            else:
                text_padded, mel_padded, align_padded, text_lengths, mel_lengths = tensors

            sub_loss = model(text_padded,
                             mel_padded,
                             align_padded,
                             text_lengths,
                             mel_lengths,
                             criterion,
                             stage=args.stage)
            # Scale so the accumulated gradient matches one large batch.
            sub_loss = sub_loss.mean()/hparams.accumulation
            sub_loss.backward()
            loss += sub_loss.item()
            iteration += 1

            if iteration%hparams.accumulation == 0:
                optimizer_step = iteration//hparams.accumulation
                lr_scheduling(optimizer, optimizer_step)
                nn.utils.clip_grad_norm_(model.parameters(), hparams.grad_clip_thresh)
                optimizer.step()
                model.zero_grad()
                writer.add_scalar('Train loss', loss, optimizer_step)
                loss = 0

            if iteration%(hparams.iters_per_validation*hparams.accumulation) == 0:
                validate(model, criterion, val_loader, iteration, writer, args.stage)

            if iteration%(hparams.iters_per_checkpoint*hparams.accumulation) == 0:
                save_checkpoint(model,
                                optimizer,
                                hparams.lr,
                                iteration//hparams.accumulation,
                                filepath=f'{hparams.output_directory}/{hparams.log_directory}/stage{args.stage}')

            if reached_last_iteration():
                break

        # Re-check after the sweep so the outer loop ends as well.
        if reached_last_iteration():
            break

    print(f'Stage{args.stage} End!!! ({str(datetime.now())})')
示例#4
0
def _scalar_value(tensor):
    """Return the Python number held by a one-element tensor.

    Uses ``tensor.item()`` where available and falls back to the legacy
    ``tensor.data[0]`` access otherwise, so both old and current PyTorch
    releases work (indexing a 0-dim tensor raises on current releases).
    """
    to_item = getattr(tensor, 'item', None)
    if callable(to_item):
        return to_item()
    return tensor.data[0]


def _average_loss(total_loss, total_images):
    """Return ``total_loss / total_images``, or NaN when nothing was trained on."""
    if not total_images:
        return float('nan')
    return total_loss / total_images


def train(train_file, validation_file, batch_size, epoch_limit, file_name,
          gpu_mode):
    """Train ``Model`` on pileup images and save checkpoints after every epoch.

    Each image batch is cut along dimension 2 into windows of ``seq_len``
    rows; windows are randomly sub-sampled based on the sum of their labels
    and the model is optimised with SGD on the kept ones. After every epoch
    the model is validated and both the full model and its ``state_dict``
    are written to disk; final copies are written when training ends.

    :param train_file: Passed to ``PileupDataset`` as the training source.
    :param validation_file: Passed through to ``validate``.
    :param batch_size: Batch size for the training ``DataLoader``; also used
        as the unit for the "could be"/"chosen" counters.
    :param epoch_limit: Number of epochs to run.
    :param file_name: Prefix for every file written by this function.
    :param gpu_mode: When truthy, wrap the model in ``DataParallel`` and move
        the data to CUDA.
    """
    transformations = transforms.Compose([transforms.ToTensor()])

    sys.stderr.write(TextColor.PURPLE + 'Loading data\n' + TextColor.END)
    train_data_set = PileupDataset(train_file, transformations)
    train_loader = DataLoader(train_data_set,
                              batch_size=batch_size,
                              shuffle=True,
                              num_workers=16,
                              pin_memory=gpu_mode)
    sys.stderr.write(TextColor.PURPLE + 'Data loading finished\n' +
                     TextColor.END)

    model = Model()
    if gpu_mode:
        model = torch.nn.DataParallel(model).cuda()

    # Loss and Optimizer
    criterion = nn.CrossEntropyLoss()
    optimizer = torch.optim.SGD(model.parameters(), lr=0.001)

    # Train the Model
    sys.stderr.write(TextColor.PURPLE + 'Training starting\n' + TextColor.END)
    seq_len = 3
    iteration_jump = 1
    for epoch in range(epoch_limit):
        total_loss = 0
        total_images = 0
        total_could_be = 0
        for i, (images, labels) in enumerate(train_loader):
            # NOTE(review): in gpu_mode ``model`` is a DataParallel wrapper;
            # confirm ``init_hidden`` is reachable on it (it may need
            # ``model.module.init_hidden``).
            hidden = model.init_hidden(images.size(0))
            # if batch size not distributable among all GPUs then skip
            # (the GPU count of 8 is hard-coded here)
            if gpu_mode is True and images.size(0) % 8 != 0:
                continue

            images = Variable(images, requires_grad=False)
            labels = Variable(labels, requires_grad=False)
            if gpu_mode:
                images = images.cuda()
                labels = labels.cuda()

            for row in range(0, images.size(2), iteration_jump):
                # segmentation of image. Currently using seq_len
                if row + seq_len > images.size(2):
                    continue

                x = images[:, :, row:row + seq_len, :]
                y = labels[:, row:row + seq_len]

                total_variation = _scalar_value(torch.sum(y))
                total_could_be += batch_size

                # Sub-sample windows: keep only ~5% of windows whose labels
                # sum to zero, and randomly drop windows whose label sum per
                # batch element is below 0.02.
                if total_variation == 0 and random.uniform(0, 1) * 100 > 5:
                    continue
                elif random.uniform(0,
                                    1) < total_variation / batch_size < 0.02:
                    continue

                # Forward + Backward + Optimize
                optimizer.zero_grad()
                outputs = model(x, hidden)
                # Carry the hidden state over to the next window via
                # repackage_hidden (presumably detaching it from the graph —
                # TODO confirm).
                hidden = repackage_hidden(hidden)

                loss = criterion(outputs.contiguous().view(-1, 3),
                                 y.contiguous().view(-1))
                loss.backward()
                optimizer.step()

                # loss count
                total_images += batch_size
                total_loss += _scalar_value(loss)

            # Guarded average: every window seen so far may have been skipped.
            running_loss = _average_loss(total_loss, total_images)
            sys.stderr.write(TextColor.BLUE + "EPOCH: " + str(epoch) +
                             " Batches done: " + str(i + 1))
            sys.stderr.write(" Loss: " + str(running_loss) +
                             "\n" + TextColor.END)
            print(
                str(epoch) + "\t" + str(i + 1) + "\t" +
                str(running_loss))

        # After each epoch do validation
        validate(validation_file, batch_size, gpu_mode, model, seq_len)
        sys.stderr.write(TextColor.YELLOW + 'Could be: ' +
                         str(total_could_be) + ' Chosen: ' +
                         str(total_images) + "\n" + TextColor.END)
        sys.stderr.write(TextColor.YELLOW + 'EPOCH: ' + str(epoch))
        sys.stderr.write(' Loss: ' +
                         str(_average_loss(total_loss, total_images)) + "\n" +
                         TextColor.END)
        torch.save(model, file_name + '_checkpoint_' + str(epoch) + '.pkl')
        torch.save(
            model.state_dict(),
            file_name + '_checkpoint_' + str(epoch) + '-params' + '.pkl')

    sys.stderr.write(TextColor.PURPLE + 'Finished training\n' + TextColor.END)
    final_model_path = file_name + '_final.pkl'
    torch.save(model, final_model_path)

    # Report the path that was actually written (the old message omitted the
    # "_final" suffix).
    sys.stderr.write(TextColor.PURPLE + 'Model saved as:' + final_model_path +
                     '\n' + TextColor.END)
    final_params_path = file_name + '_final_params' + '.pkl'
    torch.save(model.state_dict(), final_params_path)

    sys.stderr.write(TextColor.PURPLE + 'Model parameters saved as:' +
                     final_params_path + '\n' + TextColor.END)
示例#5
0
def training_process(device, nb_class_labels, model_path, result_dir, patience,
                     epochs, do_pre_train, tr_feat_path, tr_labels_path,
                     val_feat_path, val_labels_path, tr_batch_size,
                     val_batch_size, adapt_patience, adapt_epochs, d_lr,
                     tgt_lr, update_cnt, factor):
    """Run the full AUDASC training pipeline.

    The pipeline has two phases: an optional pre-training of the
    non-adapted model and label classifier, followed by the adversarial
    domain adaptation of a target model initialised from the non-adapted
    weights.

    :param device: The device to run on.
    :type device: str
    :param nb_class_labels: Number of labels for label classification.
    :type nb_class_labels: int
    :param model_path: Directory holding a previously saved model; used
                       only when ``do_pre_train`` is false.
    :type model_path: str
    :param result_dir: Directory where results are stored; created when
                       missing and also used as the weight directory after
                       pre-training.
    :type result_dir: str
    :param patience: Patience for the pre-training step.
    :type patience: int
    :param epochs: Epochs for the pre-training step.
    :type epochs: int
    :param do_pre_train: Whether to run the pre-training step.
    :type do_pre_train: bool
    :param tr_feat_path: Path of the pickled training features.
    :type tr_feat_path: str
    :param tr_labels_path: Path of the pickled training labels.
    :type tr_labels_path: str
    :param val_feat_path: Path of the pickled validation features.
    :type val_feat_path: str
    :param val_labels_path: Path of the pickled validation labels.
    :type val_labels_path: str
    :param tr_batch_size: Batch size used for pre-training.
    :type tr_batch_size: int
    :param val_batch_size: Batch size used for validation.
    :type val_batch_size: int
    :param adapt_patience: Patience for the domain adaptation step.
    :type adapt_patience: int
    :param adapt_epochs: Epochs for the domain adaptation step.
    :type adapt_epochs: int
    :param d_lr: Learning rate of the discriminator optimizer.
    :type d_lr: float
    :param tgt_lr: Learning rate of the adapted (target) model optimizer.
    :type tgt_lr: float
    :param update_cnt: Update controller for the adversarial loss.
    :type update_cnt: int
    :param factor: Coefficient multiplied with the classification loss.
    :type factor: int
    :raises ValueError: If the directory with the pre-trained weights does
                        not exist.
    """

    def _load_on_device(pickle_path):
        # Read one pickled feature/label file and move it to ``device``.
        return device_exchange(file_io.load_pickled_features(pickle_path),
                               device=device)

    tr_feat = _load_on_device(tr_feat_path)
    tr_labels = _load_on_device(tr_labels_path)
    val_feat = _load_on_device(val_feat_path)
    val_labels = _load_on_device(val_labels_path)

    loss_func = functional.cross_entropy

    non_adapted_cnn = Model().to(device)
    label_classifier = LabelClassifier(nb_class_labels).to(device)

    if not path.exists(result_dir):
        makedirs(result_dir)

    if not do_pre_train:
        printing.info_msg('Loading a pre-trained non-adapted model')
        state_dict_path = model_path
    else:
        state_dict_path = result_dir

        printing.info_msg('Pre-training step')

        # One optimizer drives both the feature extractor and the classifier.
        source_parameters = [
            *non_adapted_cnn.parameters(),
            *label_classifier.parameters(),
        ]
        optimizer_source = torch.optim.Adam(source_parameters, lr=1e-4)

        # Pre-training only uses the 'A' entry of each loaded mapping.
        pre_training.pre_training(model=non_adapted_cnn,
                                  label_classifier=label_classifier,
                                  optimizer=optimizer_source,
                                  tr_batch_size=tr_batch_size,
                                  val_batch_size=val_batch_size,
                                  tr_feat=tr_feat['A'],
                                  tr_labels=tr_labels['A'],
                                  val_feat=val_feat['A'],
                                  val_labels=val_labels['A'],
                                  epochs=epochs,
                                  criterion=loss_func,
                                  patience=patience,
                                  result_dir=state_dict_path)

        del optimizer_source

    if not path.exists(state_dict_path):
        raise ValueError(
            'The path for loading the pre trained model does not exist!')

    # Both networks are (re)loaded from disk, also right after pre-training.
    saved_weights = (
        (non_adapted_cnn, 'non_adapted_cnn.pytorch'),
        (label_classifier, 'label_classifier.pytorch'),
    )
    for network, weights_file in saved_weights:
        network.load_state_dict(
            torch.load(path.join(state_dict_path, weights_file)))

    printing.info_msg('Training the Adversarial Adaptation Model')

    # The target network starts as an exact copy of the non-adapted one.
    target_cnn = Model().to(device)
    target_cnn.load_state_dict(non_adapted_cnn.state_dict())
    discriminator = Discriminator(2).to(device)

    target_model_opt = torch.optim.Adam(target_cnn.parameters(), lr=tgt_lr)
    discriminator_opt = torch.optim.Adam(discriminator.parameters(), lr=d_lr)

    domain_adaptation.domain_adaptation(
        non_adapted_cnn, target_cnn, label_classifier, discriminator,
        target_model_opt, discriminator_opt, loss_func, loss_func, loss_func,
        tr_feat, tr_labels, val_feat, val_labels, adapt_epochs, update_cnt,
        result_dir, adapt_patience, device, factor)
示例#6
0
class BERTable():
    """Bundle a ``Vocab`` and a ``Model`` for pretraining on a table.

    The constructor groups column indices by declared type, builds the
    vocabulary from the data frame and instantiates the model; ``pretrain``
    optimises the model and ``save`` persists model weights and vocabulary.
    """

    def __init__(self,
                 df,
                 column_type,
                 embedding_dim=5,
                 n_layers=5,
                 dim_feedforward=100,
                 n_head=5,
                 dropout=0.15,
                 ns_exponent=0.75,
                 share_category=False,
                 use_pos=False,
                 device='cpu'):
        """Build the vocabulary and the model for ``df``.

        :param df: Table passed to ``Vocab``; vector columns are read as
            ``df[col]`` with ``col`` being the column's position.
        :param column_type: One entry per column, each one of
            ``'numerical'``, ``'categorical'`` or ``'vector'`` (any other
            value raises ``KeyError``).
        :param embedding_dim: Forwarded to ``Model`` and the dataloader.
        :param n_layers: Forwarded to ``Model``.
        :param dim_feedforward: Forwarded to ``Model``.
        :param n_head: Forwarded to ``Model``.
        :param dropout: Forwarded to ``Model``.
        :param ns_exponent: Forwarded to ``Vocab``.
        :param share_category: Forwarded to ``Vocab``.
        :param use_pos: Forwarded to ``Model`` and the dataloader.
        :param device: Device the model is moved to during ``pretrain``.
        """

        self.logger = create_logger(name="BERTable")

        # Map each data type to the positions of the columns of that type.
        self.col_type = {'numerical': [], 'categorical': [], 'vector': []}
        for i, data_type in enumerate(column_type):
            self.col_type[data_type].append(i)

        self.embedding_dim = embedding_dim
        self.use_pos = use_pos
        self.device = device

        self.vocab = Vocab(df, self.col_type, share_category, ns_exponent)

        vocab_size = {
            'numerical': len(self.vocab.item2idx['numerical']),
            'categorical': len(self.vocab.item2idx['categorical'])
        }

        # Second dimension of every vector column.
        vector_dims = [np.shape(df[col])[1] for col in self.col_type['vector']]
        tab_len = len(column_type)
        self.model = Model(vocab_size, self.col_type, use_pos, vector_dims,
                           embedding_dim, dim_feedforward, tab_len, n_layers,
                           n_head, dropout)

    def pretrain(self,
                 df,
                 max_epochs=3,
                 lr=1e-4,
                 lr_weight=None,
                 loss_clip=None,
                 n_sample=4,
                 mask_rate=0.15,
                 replace_rate=0.8,
                 batch_size=32,
                 shuffle=True,
                 num_workers=1):
        """Pretrain the model on ``df`` with Adam.

        The optimised loss is the sum, over data types that have at least
        one column, of ``loss / n_columns * lr_weight[type]``.

        :param df: Table converted to indices with ``self.vocab.convert``.
        :param max_epochs: Number of passes; a new dataloader is created
            for each one.
        :param lr: Learning rate (converted with ``float``).
        :param lr_weight: Per-type loss weights; defaults to 0.33 for each
            of ``'numerical'``, ``'categorical'`` and ``'vector'``.
        :param loss_clip: Stored on the model as ``loss_clip``; defaults to
            ``[0, 100]``.
        :param n_sample: Forwarded to ``create_dataloader``.
        :param mask_rate: Forwarded to ``create_dataloader``.
        :param replace_rate: Forwarded to ``create_dataloader``.
        :param batch_size: Forwarded to ``create_dataloader``.
        :param shuffle: Forwarded to ``create_dataloader``.
        :param num_workers: Forwarded to ``vocab.convert`` and
            ``create_dataloader``.
        """

        # Build the defaults per call: a mutable default would be shared
        # between calls, and ``loss_clip`` is stored on the model.
        if lr_weight is None:
            lr_weight = {
                'numerical': 0.33,
                'categorical': 0.33,
                'vector': 0.33
            }
        if loss_clip is None:
            loss_clip = [0, 100]

        self.model.loss_clip = loss_clip
        self.logger.info("[-] Converting to indices")
        data = self.vocab.convert(df, num_workers)

        self.model.to(self.device)
        self.model.train()
        optimizer = torch.optim.Adam(self.model.parameters(), lr=float(lr))

        self.logger.info("[-] Start Pretraining")

        process_bar = tqdm(range(max_epochs),
                           desc="[Progress]",
                           total=max_epochs,
                           leave=True,
                           position=0)

        for epoch in process_bar:

            generator = create_dataloader(data,
                                          self.col_type,
                                          self.vocab,
                                          self.embedding_dim,
                                          self.use_pos,
                                          batch_size,
                                          num_workers,
                                          mask_rate=mask_rate,
                                          replace_rate=replace_rate,
                                          n_sample=n_sample,
                                          shuffle=shuffle)

            # A bar with no progress display, used only to show the metrics.
            metric_bar = tqdm([0],
                              desc="[Metric]",
                              bar_format="{desc} {postfix}",
                              leave=False,
                              position=2)

            epoch_bar = tqdm(generator,
                             desc="[Epoch]",
                             leave=False,
                             position=1)

            loss_history = {'numerical': [], 'categorical': [], 'vector': []}

            # Initialised up front so the epoch log below also works when
            # the dataloader yields no batches.
            display = ''

            for batch_data in epoch_bar:

                batch_data = transfer(batch_data, self.device)
                _, losses = self.model.forward(batch_data, mode='train')

                loss = sum([
                    losses[data_type] / len(self.col_type[data_type]) *
                    lr_weight[data_type] for data_type in self.col_type
                    if len(self.col_type[data_type]) > 0
                ])

                optimizer.zero_grad()
                loss.backward()
                optimizer.step()

                # Show the running mean of every loss over this epoch.
                display = ''
                for types in losses:
                    loss_history[types].append(losses[types].item())
                    display += f'{types}: {np.mean(loss_history[types]):5.2f} '
                metric_bar.set_postfix_str(display)

            process_bar.write(f'[Log] Epoch {epoch:0>2d}| ' + display)
            epoch_bar.close()
            metric_bar.close()

        process_bar.close()

        self.model.cpu()

    # def transform(self, df, batch_size=32, num_workers=1):
    #     self.logger.info("[-] Converting to indices")
    #     data = self.vocab.convert(df, num_workers)

    #     generator = create_dataloader(
    #         data, self.col_type, self.vocab,
    #         self.embedding_dim, self.use_pos,
    #         batch_size, num_workers, mode='test')

    #     self.logger.info("[-] Start Transforming")

    #     process_bar = tqdm(
    #         generator,
    #         desc=f"[Process]",
    #         leave=False,
    #         position=0)

    #     self.model.to(self.device)
    #     self.model.eval()

    #     df_t = []
    #     for batch_data in process_bar:
    #         batch_data = transfer(batch_data, self.device)
    #         feature = self.model.forward(batch_data, mode='test')
    #         df_t += list(feature.cpu().detach().numpy())

    #     process_bar.close()
    #     self.model.cpu()

    #     return df_t

    def save(self, model_path='model.ckpt', vocab_path='vocab.pkl'):
        """Write the model ``state_dict`` and the pickled vocabulary to disk.

        :param model_path: Destination of the model weights.
        :param vocab_path: Destination of the pickled ``Vocab`` object.
        """
        torch.save(self.model.state_dict(), model_path)
        with open(vocab_path, 'wb') as file:
            pkl.dump(self.vocab, file)
示例#7
0
class Train:
    """Step-based trainer that owns the model, optimizer and checkpointing."""

    def __init__(self, model_name, corpus_dataset):
        """Create the model and an Adam optimizer from ``TrainConfig``.

        :param model_name: Name stored in every checkpoint.
        :param corpus_dataset: Object providing ``get_data_loader(batch_size)``
            and a ``vocabulary`` attribute.
        """
        self._config = TrainConfig()
        self._model_name = model_name
        self._data_loader = corpus_dataset.get_data_loader(
            self._config.batch_size)
        self._vocabulary = corpus_dataset.vocabulary
        self._model = Model(vocabulary=corpus_dataset.vocabulary,
                            training=True)
        # TODO: Support for other optimizers
        self._optimizer = optim.Adam(self._model.parameters(),
                                     lr=self._config.learning_rate)
        # -1 marks "never trained"; ``train`` resets it to 0 on first use.
        self._global_step = -1

        self._train_logger = logging.getLogger('Train')
        logging.basicConfig(level=logging.INFO)

    def train_step(self, input_seqs, input_lengths, target_seqs, masks):
        """Run one forward/backward pass and one optimizer step.

        The model is called with the batch plus the current global step and
        returns the loss to back-propagate and a loss value for logging.
        Gradients are clipped when the config enables it.
        """
        self._optimizer.zero_grad()
        step_loss, print_loss, _ = self._model(input_seqs, input_lengths,
                                               target_seqs, masks,
                                               self._global_step)

        self._train_logger.info('Step {}:  Training loss: {}'.format(
            self._global_step, print_loss))

        step_loss.backward()

        if self._config.use_gradient_clipping:
            _ = nn.utils.clip_grad_norm_(self._model.parameters(),
                                         self._config.gradient_clipping_value)

        self._optimizer.step()

    def train(self,
              num_steps,
              save_num_steps,
              save_folder='./data/models/train_dev'):
        """Train until the global step reaches ``num_steps``.

        A checkpoint is written every ``save_num_steps`` steps and once more
        at the end unless one was just written. Returns immediately when the
        global step is already at or past ``num_steps``, and stops with a
        warning when the data loader yields no batches.

        :param num_steps: Global step at which training stops.
        :param save_num_steps: Checkpoint interval in steps.
        :param save_folder: Directory receiving the checkpoints.
        """

        if self._global_step < 0:
            self._global_step = 0
        elif self._global_step >= num_steps:
            logging.info(
                'Global step past number of steps requested. No training needed. Global Step = {}. '
                'Num training steps = {}'.format(self._global_step, num_steps))
            return

        stop_training = False

        while not stop_training:
            saw_batch = False
            for input_seqs, input_lengths, target_seqs, masks in self._data_loader:
                saw_batch = True
                self.train_step(input_seqs, input_lengths, target_seqs, masks)
                self._global_step += 1

                if self._global_step % save_num_steps == 0:
                    self.save_checkpoint(save_folder)
                    just_saved = True
                else:
                    just_saved = False

                if self._global_step >= num_steps:
                    stop_training = True
                    logging.info('Finished training at step {}'.format(
                        self._global_step))
                    if not just_saved:
                        self.save_checkpoint(save_folder)
                    break

            # An empty data loader can never advance the global step; stop
            # instead of spinning forever.
            if not saw_batch:
                logging.warning(
                    'Data loader yielded no batches. Stopping training at '
                    'step {}'.format(self._global_step))
                break

    def save_checkpoint(self, save_folder):
        """Write model, optimizer, vocabulary and step to ``save_folder``.

        The file is named ``checkpoint-<global_step>.tar``; the folder is
        created when missing.
        """
        makedirs(save_folder, exist_ok=True)
        save_path = path.join(save_folder,
                              'checkpoint-{}.tar'.format(self._global_step))
        logging.info('Saving checkpoint at step {}'.format(self._global_step))
        torch.save(
            {
                'name': self._model_name,
                'global_step': self._global_step,
                'model': self._model.state_dict(),
                'optimizer': self._optimizer.state_dict(),
                'vocabulary': self._vocabulary.__dict__,
            }, save_path)
        logging.info('Checkpoint saved at {}'.format(save_path))

    @staticmethod
    def load_from_checkpoint(checkpoint_path, corpus_dataset):
        """Rebuild a ``Train`` object from a file written by ``save_checkpoint``.

        Restores the vocabulary attributes, global step, model weights and,
        when present in the file, the optimizer state.

        :param checkpoint_path: Path of the checkpoint file.
        :param corpus_dataset: Dataset used to construct the new trainer.
        :return: The restored ``Train`` instance.
        """
        checkpoint = torch.load(checkpoint_path)
        train_obj = Train(checkpoint['name'], corpus_dataset)
        train_obj._vocabulary.__dict__ = checkpoint['vocabulary']
        train_obj._global_step = checkpoint['global_step']
        train_obj._model.load_state_dict(checkpoint['model'])
        # ``save_checkpoint`` stores the optimizer state, so restore it too;
        # otherwise a resumed run starts with fresh Adam statistics.
        if 'optimizer' in checkpoint:
            train_obj._optimizer.load_state_dict(checkpoint['optimizer'])
        train_obj._train_logger.info(
            'Restored from checkpoint {}'.format(checkpoint_path))
        return train_obj
示例#8
0
文件: train.py 项目: ChenX17/aligntts
def _checkpoint_step(checkpoint_file):
    """Return the integer step from a ``checkpoint_<step>`` file name, or -1."""
    suffix = os.path.basename(checkpoint_file).rsplit('_', 1)[-1]
    return int(suffix) if suffix.isdigit() else -1


def _load_checkpoint(checkpoint_file):
    """Load a checkpoint once and return ``(state_dict, checkpoint)``.

    The first 7 characters of every state-dict key are dropped (presumably
    the "module." prefix added by nn.DataParallel — TODO confirm) so the
    weights can be loaded into an unwrapped model.
    """
    checkpoint = torch.load(checkpoint_file)
    state_dict = {k[7:]: v for k, v in checkpoint['state_dict'].items()}
    return state_dict, checkpoint


def main(args):
    """Train one stage of the model, optionally resuming from a checkpoint.

    For ``args.stage != 0`` the weights come from the final checkpoint of
    the previous stage, falling back to the highest-numbered checkpoint of
    that stage. For stage 0, ``args.pre_trained_model`` (when non-empty)
    supplies both weights and the starting iteration. ``iteration`` counts
    micro-batches; the optimizer steps once every ``hparams.accumulation``
    micro-batches.

    :param args: Parsed command-line arguments; reads ``stage``,
        ``pre_trained_model``, ``log_viterbi`` and ``cpu_viterbi``.
    :raises FileNotFoundError: If no usable checkpoint of the previous stage
        exists, or ``args.pre_trained_model`` is not a file.
    """
    train_loader, val_loader, collate_fn = prepare_dataloaders(
        hparams, stage=args.stage)
    initial_iteration = None
    if args.stage != 0:
        checkpoint_path = f"training_log/aligntts/stage{args.stage-1}/checkpoint_{hparams.train_steps[args.stage-1]}"

        if not os.path.isfile(checkpoint_path):
            print(f'{checkpoint_path} does not exist')
            candidates = glob(
                f"training_log/aligntts/stage{args.stage-1}/checkpoint_*")
            if not candidates:
                raise FileNotFoundError(
                    f'No checkpoint found for stage {args.stage-1}')
            # Pick the latest checkpoint by its numeric step: a plain string
            # sort would rank checkpoint_9000 after checkpoint_10000.
            checkpoint_path = max(
                candidates, key=lambda p: (_checkpoint_step(p), p))
            print(f'Loading {checkpoint_path} instead')

        state_dict, _ = _load_checkpoint(checkpoint_path)

        model = Model(hparams).cuda()
        model.load_state_dict(state_dict)
        model = nn.DataParallel(model).cuda()
    elif args.pre_trained_model != '':
        if not os.path.isfile(args.pre_trained_model):
            # Fail with a clear message instead of printing and then
            # crashing inside torch.load.
            raise FileNotFoundError(
                f'{args.pre_trained_model} does not exist')

        # Read the file once for both the weights and the iteration.
        state_dict, checkpoint = _load_checkpoint(args.pre_trained_model)
        # NOTE(review): save_checkpoint below is given
        # ``iteration // hparams.accumulation``. If the stored 'iteration'
        # is that value, resuming under-counts micro-batches whenever
        # accumulation > 1 — confirm against save_checkpoint.
        initial_iteration = checkpoint['iteration']
        model = Model(hparams).cuda()
        model.load_state_dict(state_dict)
        model = nn.DataParallel(model).cuda()
    else:
        model = nn.DataParallel(Model(hparams)).cuda()

    criterion = MDNLoss()
    writer = get_writer(hparams.output_directory,
                        f'{hparams.log_directory}/stage{args.stage}')
    optimizer = torch.optim.Adam(model.parameters(),
                                 lr=hparams.lr,
                                 betas=(0.9, 0.98),
                                 eps=1e-09)
    iteration, loss = 0, 0
    if initial_iteration is not None:
        iteration = initial_iteration
    model.train()

    total_iterations = hparams.train_steps[args.stage] * hparams.accumulation

    print(f'Stage{args.stage} Start!!! ({str(datetime.now())})')
    # ``<`` / ``>=`` rather than ``==``: a run resumed at or past the target
    # would otherwise never meet the stop condition.
    while iteration < total_iterations:
        for i, batch in enumerate(train_loader):
            if args.stage == 0:
                # Stage-0 batches carry no alignment tensor.
                text_padded, mel_padded, text_lengths, mel_lengths = [
                    reorder_batch(x, hparams.n_gpus).cuda() for x in batch
                ]
                align_padded = None
            else:
                text_padded, mel_padded, align_padded, text_lengths, mel_lengths = [
                    reorder_batch(x, hparams.n_gpus).cuda() for x in batch
                ]

            sub_loss = model(text_padded,
                             mel_padded,
                             align_padded,
                             text_lengths,
                             mel_lengths,
                             criterion,
                             stage=args.stage,
                             log_viterbi=args.log_viterbi,
                             cpu_viterbi=args.cpu_viterbi)
            # Scale so the accumulated gradient matches one large batch.
            sub_loss = sub_loss.mean() / hparams.accumulation
            sub_loss.backward()
            loss = loss + sub_loss.item()
            iteration += 1
            if iteration % 100 == 0:
                print(
                    f'[{str(datetime.now())}] Stage {args.stage} Iter {iteration:<6d} Loss {loss:<8.6f}'
                )

            if iteration % hparams.accumulation == 0:
                lr_scheduling(optimizer, iteration // hparams.accumulation)
                nn.utils.clip_grad_norm_(model.parameters(),
                                         hparams.grad_clip_thresh)
                optimizer.step()
                model.zero_grad()
                writer.add_scalar('Train loss', loss,
                                  iteration // hparams.accumulation)
                writer.add_scalar('Learning rate', get_lr(optimizer),
                                  iteration // hparams.accumulation)
                loss = 0

            if iteration % (hparams.iters_per_validation *
                            hparams.accumulation) == 0:
                validate(model, criterion, val_loader, iteration, writer,
                         args.stage)

            if iteration % (hparams.iters_per_checkpoint *
                            hparams.accumulation) == 0:
                save_checkpoint(
                    model,
                    optimizer,
                    hparams.lr,
                    iteration // hparams.accumulation,
                    filepath=
                    f'{hparams.output_directory}/{hparams.log_directory}/stage{args.stage}'
                )

            if iteration >= total_iterations:
                break

    print(f'Stage{args.stage} End!!! ({str(datetime.now())})')