Exemplo n.º 1
0
    def train(engine, mini_batch):
        for language_model, model, optimizer in zip(engine.language_models,
                                                    engine.models,
                                                    engine.optimizers):
            language_model.eval()
            model.train()
            if engine.state.iteration % engine.config.iteration_per_update == 1 or \
                engine.config.iteration_per_update == 1:
                if engine.state.iteration > 1:
                    optimizer.zero_grad()

        device = next(engine.models[0].parameters()).device
        mini_batch.src = (mini_batch.src[0].to(device),
                          mini_batch.src[1].to(device))
        mini_batch.tgt = (mini_batch.tgt[0].to(device),
                          mini_batch.tgt[1].to(device))

        with autocast(not engine.config.off_autocast):
            # X2Y
            x, y = (mini_batch.src[0][:, 1:-1],
                    mini_batch.src[1] - 2), mini_batch.tgt[0][:, :-1]
            x_hat_lm, y_hat_lm = None, None
            # |x| = (batch_size, n)
            # |y| = (batch_size, m)
            y_hat = engine.models[X2Y](x, y)
            # |y_hat| = (batch_size, m, y_vocab_size)

            if engine.state.epoch > engine.config.dsl_n_warmup_epochs:
                with torch.no_grad():
                    y_hat_lm = engine.language_models[X2Y](y)
                    # |y_hat_lm| = |y_hat|

            #Y2X
            # Since encoder in seq2seq takes packed_sequence instance,
            # we need to re-sort if we use reversed src and tgt.
            x, y, restore_indice = DualSupervisedTrainingEngine._reorder(
                mini_batch.src[0][:, :-1],
                mini_batch.tgt[0][:, 1:-1],
                mini_batch.tgt[1] - 2,
            )
            # |x| = (batch_size, n)
            # |y| = (batch_size, m)
            x_hat = DualSupervisedTrainingEngine._restore_order(
                engine.models[Y2X](y, x),
                restore_indice=restore_indice,
            )
            # |x_hat| = (batch_size, n, x_vocab_size)

            if engine.state.epoch > engine.config.dsl_n_warmup_epochs:
                with torch.no_grad():
                    x_hat_lm = DualSupervisedTrainingEngine._restore_order(
                        engine.language_models[Y2X](x),
                        restore_indice=restore_indice,
                    )
                    # |x_hat_lm| = |x_hat|

            x, y = mini_batch.src[0][:, 1:], mini_batch.tgt[0][:, 1:]
            loss_x2y, loss_y2x, dual_loss = DualSupervisedTrainingEngine._get_loss(
                x,
                y,
                x_hat,
                y_hat,
                engine.crits,
                x_hat_lm,
                y_hat_lm,
                # According to the paper, DSL should be warm-started.
                # Thus, we turn-off the regularization at the beginning.
                lagrange=engine.config.dsl_lambda if
                engine.state.epoch > engine.config.dsl_n_warmup_epochs else .0)

            backward_targets = [
                loss_x2y.div(y.size(0)).div(
                    engine.config.iteration_per_update),
                loss_y2x.div(x.size(0)).div(
                    engine.config.iteration_per_update),
            ]

        for scaler, backward_target in zip(engine.scalers, backward_targets):
            if engine.config.gpu_id >= 0 and not engine.config.off_autocast:
                scaler.scale(backward_target).backward()
            else:
                backward_target.backward()

        x_word_count = int(mini_batch.src[1].sum())
        y_word_count = int(mini_batch.tgt[1].sum())
        p_norm = float(
            get_parameter_norm(
                list(engine.models[X2Y].parameters()) +
                list(engine.models[Y2X].parameters())))
        g_norm = float(
            get_grad_norm(
                list(engine.models[X2Y].parameters()) +
                list(engine.models[Y2X].parameters())))

        if engine.state.iteration % engine.config.iteration_per_update == 0 and \
            engine.state.iteration > 0:
            for model, optimizer, scaler in zip(engine.models,
                                                engine.optimizers,
                                                engine.scalers):
                torch_utils.clip_grad_norm_(
                    model.parameters(),
                    engine.config.max_grad_norm,
                )
                # Take a step of gradient descent.
                if engine.config.gpu_id >= 0 and not engine.config.off_autocast:
                    # Use scaler instead of engine.optimizer.step()
                    scaler.step(optimizer)
                    scaler.update()
                else:
                    optimizer.step()

        return {
            'x2y':
            float(loss_x2y / y_word_count),
            'y2x':
            float(loss_y2x / x_word_count),
            'reg':
            float(dual_loss / x.size(0)),
            '|param|':
            p_norm if not np.isnan(p_norm) and not np.isinf(p_norm) else 0.,
            '|g_param|':
            g_norm if not np.isnan(g_norm) and not np.isinf(g_norm) else 0.,
        }
Exemplo n.º 2
0
    def train(engine, mini_batch):
        # You have to reset the gradients of all model parameters
        # before to take another step in gradient descent.
        engine.model.train()
        if engine.state.iteration % engine.config.iteration_per_update == 1:
            engine.optimizer.zero_grad()

        device = next(engine.model.parameters()).device
        mini_batch.src = (mini_batch.src[0].to(device), mini_batch.src[1])
        mini_batch.tgt = (mini_batch.tgt[0].to(device), mini_batch.tgt[1])

        # Raw target variable has both BOS and EOS token.
        # The output of sequence-to-sequence does not have BOS token.
        # Thus, remove BOS token for reference.
        x, y = mini_batch.src, mini_batch.tgt[0][:, 1:]
        # |x| = (batch_size, length)
        # |y| = (batch_size, length)

        with autocast():
            # Take feed-forward
            # Similar as before, the input of decoder does not have EOS token.
            # Thus, remove EOS token for decoder input.
            y_hat = engine.model(x, mini_batch.tgt[0][:, :-1])
            # |y_hat| = (batch_size, length, output_size)

            loss = engine.crit(y_hat.contiguous().view(-1, y_hat.size(-1)),
                               y.contiguous().view(-1))
            backward_target = loss.div(y.size(0)).div(
                engine.config.iteration_per_update)

        engine.scaler.scale(backward_target).backward()

        word_count = int(mini_batch.tgt[1].sum())
        p_norm = float(get_parameter_norm(engine.model.parameters()))
        g_norm = float(get_grad_norm(engine.model.parameters()))

        if engine.state.iteration % engine.config.iteration_per_update == 0:
            # In orther to avoid gradient exploding, we apply gradient clipping.
            torch_utils.clip_grad_norm_(
                engine.model.parameters(),
                engine.config.max_grad_norm,
            )
            # Take a step of gradient descent.
            # Use scaler instead of engine.optimizer.step()
            engine.scaler.step(engine.optimizer)
            engine.scaler.update()

            if engine.config.use_noam_decay and engine.lr_scheduler is not None:
                engine.lr_scheduler.step()

        loss = float(loss / word_count)
        ppl = np.exp(loss)

        return {
            'loss':
            loss,
            'ppl':
            ppl,
            '|param|':
            p_norm if not np.isnan(p_norm) and not np.isinf(p_norm) else 0.,
            '|g_param|':
            g_norm if not np.isnan(g_norm) and not np.isinf(g_norm) else 0.,
        }
Exemplo n.º 3
0
    def train(engine, mini_batch):
        # You have to reset the gradients of all model parameters
        # before to take another step in gradient descent.
        engine.model.train()
        engine.optimizer.zero_grad()

        device = next(engine.model.parameters()).device
        mini_batch.src = (mini_batch.src[0].to(device), mini_batch.src[1])
        mini_batch.tgt = (mini_batch.tgt[0].to(device), mini_batch.tgt[1])

        # if 'is_src_target' is true, the trainer would train language model for source language.
        # For dsl case, both x and y has BOS and EOS tokens.
        # Thus, we need to remove BOS and EOS before the training.
        x = mini_batch.src[
            0][:, :-1] if engine.is_src_target else mini_batch.tgt[0][:, :-1]
        y = mini_batch.src[
            0][:, 1:] if engine.is_src_target else mini_batch.tgt[0][:, 1:]
        # |x| = |y| = (batch_size, length)

        with autocast(not engine.config.off_autocast):
            y_hat = engine.model(x)
            # |y_hat| = (batch_size, length, output_size)

            loss = engine.crit(
                y_hat.contiguous().view(-1, y_hat.size(-1)),
                y.contiguous().view(-1),
            ).sum()
            backward_target = loss.div(y.size(0))

        if engine.config.gpu_id >= 0 and not engine.config.off_autocast:
            engine.scaler.scale(backward_target).backward()
        else:
            backward_target.backward()

        word_count = int(
            mini_batch.src[1].sum()) if engine.is_src_target else int(
                mini_batch.tgt[1].sum())
        p_norm = float(get_parameter_norm(engine.model.parameters()))
        g_norm = float(get_grad_norm(engine.model.parameters()))

        # In orther to avoid gradient exploding, we apply gradient clipping.
        torch_utils.clip_grad_norm_(
            engine.model.parameters(),
            engine.config.max_grad_norm,
        )
        # Take a step of gradient descent.
        if engine.config.gpu_id >= 0 and not engine.config.off_autocast:
            # Use scaler instead of engine.optimizer.step() if using GPU.
            engine.scaler.step(engine.optimizer)
            engine.scaler.update()
        else:
            engine.optimizer.step()

        loss = float(loss / word_count)
        ppl = np.exp(loss)

        return {
            'loss':
            loss,
            'ppl':
            ppl,
            '|param|':
            p_norm if not np.isnan(p_norm) and not np.isinf(p_norm) else 0.,
            '|g_param|':
            g_norm if not np.isnan(g_norm) and not np.isinf(g_norm) else 0.,
        }
Exemplo n.º 4
0
    def train(engine, mini_batch):
        # You have to reset the gradients of all model parameters
        # before to take another step in gradient descent.
        engine.model.train()
        if engine.state.iteration % engine.config.iteration_per_update == 1 or \
            engine.config.iteration_per_update == 1:
            engine.optimizer.zero_grad()

        device = next(engine.model.parameters()).device
        mini_batch.src = (mini_batch.src[0].to(device), mini_batch.src[1])
        mini_batch.tgt = (mini_batch.tgt[0].to(device), mini_batch.tgt[1])

        # Raw target variable has both BOS and EOS token.
        # The output of sequence-to-sequence does not have BOS token.
        # Thus, remove BOS token for reference.
        x, y = mini_batch.src, mini_batch.tgt[0][:, 1:]
        # |x| = (batch_size, length)
        # |y| = (batch_size, length)

        # Take sampling process because set False for is_greedy.
        y_hat, indice = engine.model.search(
            x, is_greedy=False, max_length=engine.config.max_length)

        with torch.no_grad():
            # Based on the result of sampling, get reward.
            actor_reward = MinimumRiskTrainingEngine._get_reward(
                indice, y, n_gram=engine.config.rl_n_gram)
            # |y_hat| = (batch_size, length, output_size)
            # |indice| = (batch_size, length)
            # |actor_reward| = (batch_size)

            # Take samples as many as n_samples, and get average rewards for them.
            # I figured out that n_samples = 1 would be enough.
            baseline = []

            for _ in range(engine.config.rl_n_samples):
                _, sampled_indice = engine.model.search(
                    x,
                    is_greedy=False,
                    max_length=engine.config.max_length,
                )
                baseline += [
                    MinimumRiskTrainingEngine._get_reward(
                        sampled_indice,
                        y,
                        n_gram=engine.config.rl_n_gram,
                    )
                ]

            baseline = torch.stack(baseline).mean(dim=0)
            # |baseline| = (n_samples, batch_size) --> (batch_size)

            # Now, we have relatively expected cumulative reward.
            # Which score can be drawn from actor_reward subtracted by baseline.
            risk = (-actor_reward) - (-baseline)
            # |risk| = (batch_size)

        # calculate gradients with back-propagation
        loss = MinimumRiskTrainingEngine._get_loss(y_hat, indice, risk=risk)
        backward_target = loss.div(y.size(0)).div(
            engine.config.iteration_per_update)
        backward_target.backward()

        p_norm = float(get_parameter_norm(engine.model.parameters()))
        g_norm = float(get_grad_norm(engine.model.parameters()))

        if engine.state.iteration % engine.config.iteration_per_update == 0:
            # In orther to avoid gradient exploding, we apply gradient clipping.
            torch_utils.clip_grad_norm_(
                engine.model.parameters(),
                engine.config.max_grad_norm,
            )
            # Take a step of gradient descent.
            engine.optimizer.step()

        return {
            'actor':
            float(actor_reward.mean()),
            'baseline':
            float(baseline.mean()),
            'risk':
            float(risk.mean()),
            '|param|':
            p_norm if not np.isnan(p_norm) and not np.isinf(p_norm) else 0.,
            '|g_param|':
            g_norm if not np.isnan(g_norm) and not np.isinf(g_norm) else 0.,
        }
Exemplo n.º 5
0
    def train(engine, mini_batch):
        # You have to reset the gradients of all model parameters
        # before to take another step in gradient descent.
        engine.model.train()
        if engine.state.iteration % engine.config.iteration_per_update == 1 or \
            engine.config.iteration_per_update == 1:
            if engine.state.iteration > 1:
                engine.optimizer.zero_grad()

        device = next(engine.model.parameters()).device
        mini_batch.src = (mini_batch.src[0].to(device), mini_batch.src[1])
        mini_batch.tgt = (mini_batch.tgt[0].to(device), mini_batch.tgt[1])

        # Raw target variable has both BOS and EOS token. 
        # The output of sequence-to-sequence does not have BOS token. 
        # Thus, remove BOS token for reference.
        x, y = mini_batch.src, mini_batch.tgt[0][:, 1:]
        # |x| = (batch_size, length)
        # |y| = (batch_size, length)

        # autocast로 공간효율적으로 학습 실행
        with autocast(not engine.config.off_autocast):
            # with autocast(not engine.config.off_autocast):
            # y_hat = (batch_size, length_m, output_size)
            # 입력 tgt의 경우, 맨뒤에 EOS를 토큰을 제거
            y_hat = engine.model(x, mini_batch.tgt[0][:, :-1])
            # |y_hat| = (batch_size, length, output_size)
            
            '''
            loss값 연산을 위해 다음과 같이 텐서 모양 정리
            모든 문장의 각 단어를 순서대로 배치했다고 보면됨
            변경 전(3D):
                y_hat = (batch_size, length_m, output_size)
                y = (batch_size, length_m)
            변경 후(2D):
                y_hat = (batch_size * length_m, output_size)
                y = (batch_size * length_m)
            '''
            loss = engine.crit(
                y_hat.contiguous().view(-1, y_hat.size(-1)),
                y.contiguous().view(-1)
            )
            '''
            div(y.size(0)): loss를 구한후, batch_size만큼 나눠준 후
            div(engine.config.iteration_per_update): 
            Gradient Accumulation을 위해 미리 나눠줌
            즉, backward_target이 진짜 적용시킬 loss 값이라 보면 됨
            '''
            backward_target = loss.div(y.size(0)).div(engine.config.iteration_per_update)

        if engine.config.gpu_id >= 0 and not engine.config.off_autocast:
            engine.scaler.scale(backward_target).backward()
        else:
            backward_target.backward()

        word_count = int(mini_batch.tgt[1].sum())
        p_norm = float(get_parameter_norm(engine.model.parameters()))
        g_norm = float(get_grad_norm(engine.model.parameters()))

        if engine.state.iteration % engine.config.iteration_per_update == 0 and \
            engine.state.iteration > 0:
            '''
            Gradient Clipping
            시퀸스의 time_step이 길수록, gradient가 매우 커질수도 있음
            g_norm이 너무 커서 많이 움직이는 걸 막기 위해 사용
            - 단, Adam을 쓰면 큰 필요는 없다고 함 ㅇㅇ
            '''
            torch_utils.clip_grad_norm_(
                engine.model.parameters(),
                engine.config.max_grad_norm,
            )
            # Take a step of gradient descent.
            if engine.config.gpu_id >= 0 and not engine.config.off_autocast:
                # GPU를 사용할 경우, 기존 optim.step() 대신에 scaler로 step 수행
                engine.scaler.step(engine.optimizer)
                engine.scaler.update()
            else:
                engine.optimizer.step()

        loss = float(loss / word_count)
        ppl = np.exp(loss)

        return {
            'loss': loss,
            'ppl': ppl,
            '|param|': p_norm if not np.isnan(p_norm) and not np.isinf(p_norm) else 0.,
            '|g_param|': g_norm if not np.isnan(g_norm) and not np.isinf(g_norm) else 0.,
        }