Пример #1
0
        def model_build_fun(model, loss_scale):
            """Add a single LSTM and a scaled toy loss to ``model``.

            ``self`` comes from the enclosing scope and supplies the sequence
            length (``T``), per-device batch size and layer dimensions.

            Args:
                model: model helper whose nets receive the operators.
                loss_scale: factor passed to the final ``Scale`` operator.

            Returns:
                A one-element list holding the scaled loss blob.
            """
            # Every sequence in the batch gets the same length, self.T.
            lengths = np.array(
                [self.T] * self.batch_per_device, dtype=np.int32)
            workspace.FeedBlob(
                core.ScopedBlobReference("seq_lengths"), lengths)

            # Hidden and cell initial states are both zero-filled blobs of the
            # same shape, created by the init net.
            for state_name in ("hidden_init", "cell_init"):
                model.param_init_net.ConstantFill(
                    [],
                    state_name,
                    value=0.0,
                    shape=[1, self.batch_per_device, self.hidden_dim],
                )

            # Only the first LSTM output is consumed below.
            output, _, _, _ = recurrent.LSTM(
                model=model,
                input_blob="data",
                seq_lengths="seq_lengths",
                initial_states=("hidden_init", "cell_init"),
                dim_in=self.input_dim,
                dim_out=self.hidden_dim,
                scope="partest",
            )

            # A deliberately trivial loss: AveragedLoss over (output - target).
            dist = model.Sub([output, "target"], "dist")
            unscaled_loss = model.AveragedLoss(dist, "loss")
            scaled_loss = model.Scale(
                unscaled_loss, "loss_scaled", scale=loss_scale)
            return [scaled_loss]
Пример #2
0
def create_model(args, queue):
    """Build the LSTM benchmark model and zero-initialise its states.

    Args:
        args: options object; ``input_dim``, ``hidden_dim`` and
            ``batch_size`` are read from it.
        queue: blob queue from which the input sequence is dequeued.

    Returns:
        The constructed ``CNNModelHelper``.
    """
    model = cnn.CNNModelHelper(name="LSTM_bench")
    # 'target' is registered as an external input but not consumed here.
    seq_lengths, hidden_init, cell_init, target = \
        model.net.AddExternalInputs(
            'seq_lengths',
            'hidden_init',
            'cell_init',
            'target',
        )
    input_blob = model.DequeueBlobs(queue, "input_data")
    all_hidden, last_hidden, _, last_state = recurrent.LSTM(
        model=model,
        input_blob=input_blob,
        seq_lengths=seq_lengths,
        initial_states=(hidden_init, cell_init),
        dim_in=args.input_dim,
        dim_out=args.hidden_dim,
        scope="lstm1",
    )

    model.AddGradientOperators([all_hidden])

    # Carry states over to the next run: the final hidden state seeds
    # hidden_init and the final cell state seeds cell_init.  (Previously the
    # hidden state was copied into both blobs, clobbering the cell state.)
    model.net.Copy(last_hidden, hidden_init)
    model.net.Copy(last_state, cell_init)

    # Both initial states start out as zeros of the same shape.
    for state_blob in (hidden_init, cell_init):
        workspace.FeedBlob(
            state_blob,
            np.zeros(
                [1, args.batch_size, args.hidden_dim], dtype=np.float32))
    return model
Пример #3
0
 def create_lstm(
         model, input_blob, seq_lengths,
         init, dim_in, dim_out, scope):
     """Add an LSTM to ``model`` under the fixed scope "external/recurrent".

     NOTE(review): the ``scope`` argument is accepted but not used; the
     scope string is hard-coded.  ``outputs_with_grads`` is read from the
     enclosing scope.  Nothing is returned.
     """
     recurrent.LSTM(
         model=model,
         input_blob=input_blob,
         seq_lengths=seq_lengths,
         initial_states=init,
         dim_in=dim_in,
         dim_out=dim_out,
         scope="external/recurrent",
         outputs_with_grads=outputs_with_grads,
     )
Пример #4
0
def create_model(args, queue, label_queue, input_shape):
    """Build the LSTM benchmark model with a softmax loss.

    Args:
        args: options object; ``implementation`` ("own" or "cudnn"),
            ``input_dim``, ``hidden_dim``, ``batch_size`` and (for "own")
            ``memory_optimization`` are read from it.
        queue: blob queue from which the input sequence is dequeued.
        label_queue: blob queue from which the labels are dequeued.
        input_shape: shape of the placeholder input created for the cuDNN
            implementation.

    Returns:
        A ``(model, output)`` tuple: the model helper and the LSTM output
        blob.

    Raises:
        AssertionError: if ``args.implementation`` is not recognised.
    """
    model = cnn.CNNModelHelper(name="LSTM_bench")
    # 'target' is registered as an external input but not consumed here.
    seq_lengths, hidden_init, cell_init, target = \
        model.net.AddExternalInputs(
            'seq_lengths',
            'hidden_init',
            'cell_init',
            'target',
        )
    input_blob = model.DequeueBlobs(queue, "input_data")
    labels = model.DequeueBlobs(label_queue, "label")

    if args.implementation == "own":
        output, last_hidden, _, last_state = recurrent.LSTM(
            model=model,
            input_blob=input_blob,
            seq_lengths=seq_lengths,
            initial_states=(hidden_init, cell_init),
            dim_in=args.input_dim,
            dim_out=args.hidden_dim,
            scope="lstm1",
            memory_optimization=args.memory_optimization,
        )
    elif args.implementation == "cudnn":
        # We need to feed a placeholder input so that RecurrentInitOp
        # can infer the dimensions.
        model.param_init_net.ConstantFill([], input_blob, shape=input_shape)
        # The third return value is taken to be the final cell state.
        output, last_hidden, last_state = recurrent.cudnn_LSTM(
            model=model,
            input_blob=input_blob,
            initial_states=(hidden_init, cell_init),
            dim_in=args.input_dim,
            dim_out=args.hidden_dim,
            scope="cudnnlstm",
        )
    else:
        # Raised explicitly (rather than `assert False`) so the check still
        # fires when Python runs with -O.
        raise AssertionError("Unknown implementation")

    weights = model.UniformFill(labels, "weights")
    softmax, loss = model.SoftmaxWithLoss(
        [model.Flatten(output), labels, weights],
        ['softmax', 'loss'],
    )

    model.AddGradientOperators([loss])

    # Carry states over to the next run: the final hidden state seeds
    # hidden_init and the final cell state seeds cell_init.  (Previously the
    # hidden state was copied into both blobs, clobbering the cell state.)
    model.net.Copy(last_hidden, hidden_init)
    model.net.Copy(last_state, cell_init)

    # Both initial states start out as zeros of the same shape.
    for state_blob in (hidden_init, cell_init):
        workspace.FeedBlob(
            state_blob,
            np.zeros(
                [1, args.batch_size, args.hidden_dim], dtype=np.float32))
    return model, output
Пример #5
0
def rnn_unidirectional_encoder(model, embedded_inputs, input_lengths,
                               initial_hidden_state, initial_cell_state,
                               embedding_size, encoder_num_units,
                               use_attention):
    """Run a single forward LSTM over the embedded inputs.

    Returns a ``(outputs, final_hidden_state, final_cell_state)`` tuple.
    """
    # With attention, gradients are requested for output 0 only; without it,
    # for outputs 1 and 3 (the ones returned as the final states).
    if use_attention:
        grad_outputs = [0]
    else:
        grad_outputs = [1, 3]

    lstm_results = recurrent.LSTM(
        model=model,
        input_blob=embedded_inputs,
        seq_lengths=input_lengths,
        initial_states=(initial_hidden_state, initial_cell_state),
        dim_in=embedding_size,
        dim_out=encoder_num_units,
        scope='encoder',
        outputs_with_grads=grad_outputs,
    )
    outputs, final_hidden_state, _, final_cell_state = lstm_results
    return outputs, final_hidden_state, final_cell_state
Пример #6
0
def rnn_bidirectional_encoder(
    model,
    embedded_inputs,
    input_lengths,
    initial_hidden_state,
    initial_cell_state,
    embedding_size,
    encoder_num_units,
    use_attention
):
    """Encode the inputs with a forward and a backward LSTM.

    The backward LSTM runs on the inputs reversed with ``ReversePackedSegs``
    and its outputs are reversed back before being concatenated with the
    forward results along axis 2.

    Returns a ``(outputs, final_hidden_state, final_cell_state)`` tuple of
    concatenated blobs.
    """

    def _run_lstm(input_blob, scope):
        # One LSTM direction; both directions share every setting but the
        # input blob and the scope name.
        seq_out, last_hidden, _, last_cell = recurrent.LSTM(
            model=model,
            input_blob=input_blob,
            seq_lengths=input_lengths,
            initial_states=(initial_hidden_state, initial_cell_state),
            dim_in=embedding_size,
            dim_out=encoder_num_units,
            scope=scope,
            outputs_with_grads=([0] if use_attention else [1, 3]),
        )
        return seq_out, last_hidden, last_cell

    def _reverse(blob, name):
        # Reverse each sequence according to its own length.
        return model.net.ReversePackedSegs([blob, input_lengths], [name])

    def _concat(fw_blob, bw_blob, name):
        # Concat also emits a dims blob, which is not needed here.
        merged, _ = model.net.Concat(
            [fw_blob, bw_blob],
            [name, name + '_dim'],
            axis=2,
        )
        return merged

    # Forward direction.
    outputs_fw, hidden_fw, cell_fw = _run_lstm(
        embedded_inputs, 'forward_encoder')

    # Backward direction: reverse inputs, run, then reverse outputs back.
    reversed_embedded_inputs = _reverse(
        embedded_inputs, 'reversed_embedded_inputs')
    outputs_bw, hidden_bw, cell_bw = _run_lstm(
        reversed_embedded_inputs, 'backward_encoder')
    outputs_bw = _reverse(outputs_bw, 'outputs_bw')

    # Join forward and backward results.
    outputs = _concat(outputs_fw, outputs_bw, 'outputs')
    final_hidden_state = _concat(hidden_fw, hidden_bw, 'final_hidden_state')
    final_cell_state = _concat(cell_fw, cell_bw, 'final_cell_state')
    return outputs, final_hidden_state, final_cell_state
Пример #7
0
    def testEqualToCudnn(self):
        """Check that the own LSTM matches the cuDNN LSTM after training.

        Both models are built on the same input, the cuDNN model's initial
        parameters are copied into the own model, each net is run three
        times with a plain WeightedSum parameter update, and the outputs,
        last hidden states and losses are compared with ``np.allclose``.
        """
        with core.DeviceScope(core.DeviceOption(caffe2_pb2.CUDA)):
            T = 8
            batch_size = 4
            input_dim = 8
            hidden_dim = 31

            workspace.FeedBlob("seq_lengths",
                               np.array([T] * batch_size, dtype=np.int32))
            workspace.FeedBlob(
                "target",
                np.zeros([T, batch_size, hidden_dim], dtype=np.float32))
            workspace.FeedBlob(
                "hidden_init",
                np.zeros([1, batch_size, hidden_dim], dtype=np.float32))
            workspace.FeedBlob(
                "cell_init",
                np.zeros([1, batch_size, hidden_dim], dtype=np.float32))

            own_model = cnn.CNNModelHelper(name="own_lstm")

            input_shape = [T, batch_size, input_dim]
            cudnn_model = cnn.CNNModelHelper(name="cudnn_lstm")
            input_blob = cudnn_model.param_init_net.UniformFill(
                [], "input", shape=input_shape)
            workspace.FeedBlob(
                "CUDNN/hidden_init_cudnn",
                np.zeros([1, batch_size, hidden_dim], dtype=np.float32))
            workspace.FeedBlob(
                "CUDNN/cell_init_cudnn",
                np.zeros([1, batch_size, hidden_dim], dtype=np.float32))

            # The cell state must come from its own blob; previously the
            # hidden-state blob was passed twice and "cell_init_cudnn" was
            # fed but never used.
            (
                cudnn_output,
                cudnn_last_hidden,
                _,
                param_extract,
            ) = recurrent.cudnn_LSTM(
                model=cudnn_model,
                input_blob=input_blob,
                initial_states=("hidden_init_cudnn", "cell_init_cudnn"),
                dim_in=input_dim,
                dim_out=hidden_dim,
                scope="CUDNN",
                return_params=True,
            )
            cudnn_loss = cudnn_model.AveragedLoss(
                cudnn_model.SquaredL2Distance([cudnn_output, "target"],
                                              "CUDNN/dist"), "CUDNN/loss")

            own_output, own_last_hidden, _, _, own_params = recurrent.LSTM(
                model=own_model,
                input_blob=input_blob,
                seq_lengths="seq_lengths",
                initial_states=("hidden_init", "cell_init"),
                dim_in=input_dim,
                dim_out=hidden_dim,
                scope="OWN",
                return_params=True,
            )
            own_loss = own_model.AveragedLoss(
                own_model.SquaredL2Distance([own_output, "target"],
                                            "OWN/dist"), "OWN/loss")

            # Add gradients
            cudnn_model.AddGradientOperators([cudnn_loss])
            own_model.AddGradientOperators([own_loss])

            # Add parameter updates.  LR and ONE are created by the cuDNN
            # model's init net and shared by both models' update ops.
            LR = cudnn_model.param_init_net.ConstantFill([],
                                                         shape=[1],
                                                         value=0.01)
            ONE = cudnn_model.param_init_net.ConstantFill([],
                                                          shape=[1],
                                                          value=1.0)
            for param in cudnn_model.GetParams():
                cudnn_model.WeightedSum(
                    [param, ONE, cudnn_model.param_to_grad[param], LR], param)
            for param in own_model.GetParams():
                own_model.WeightedSum(
                    [param, ONE, own_model.param_to_grad[param], LR], param)

            workspace.RunNetOnce(cudnn_model.param_init_net)
            workspace.CreateNet(cudnn_model.net)

            ##
            ##  CUDNN LSTM MODEL EXECUTION
            ##
            # Get initial values from CuDNN LSTM so we can feed them
            # to our own.
            (param_extract_net, param_extract_mapping) = param_extract
            workspace.RunNetOnce(param_extract_net)
            cudnn_lstm_params = {}
            for input_type, pars in param_extract_mapping.items():
                cudnn_lstm_params[input_type] = {}
                for k, v in pars.items():
                    cudnn_lstm_params[input_type][k] = workspace.FetchBlob(
                        v[0])

            # Run the model 3 times, so that some parameter updates are done
            workspace.RunNet(cudnn_model.net.Proto().name, 3)

            ##
            ## OWN LSTM MODEL EXECUTION
            ##
            # Map the cuDNN parameters to our own
            workspace.RunNetOnce(own_model.param_init_net)
            recurrent.InitFromLSTMParams(own_params, cudnn_lstm_params)

            # Run the model 3 times, so that some parameter updates are done
            workspace.CreateNet(own_model.net)
            workspace.RunNet(own_model.net.Proto().name, 3)

            ##
            ## COMPARE RESULTS
            ##
            # Then compare that final results after 3 runs are equal
            own_output_data = workspace.FetchBlob(own_output)
            own_last_hidden = workspace.FetchBlob(own_last_hidden)
            own_loss = workspace.FetchBlob(own_loss)

            cudnn_output_data = workspace.FetchBlob(cudnn_output)
            cudnn_last_hidden = workspace.FetchBlob(cudnn_last_hidden)
            cudnn_loss = workspace.FetchBlob(cudnn_loss)

            self.assertTrue(np.allclose(own_output_data, cudnn_output_data))
            self.assertTrue(np.allclose(own_last_hidden, cudnn_last_hidden))
            self.assertTrue(np.allclose(own_loss, cudnn_loss))
0
    def _build_model(
        self,
        init_params,
    ):
        """Build the seq2seq training graph and store it on ``self``.

        Creates the external inputs, the embedding encoder, an LSTM decoder
        (with or without attention, per ``self.model_params['attention']``),
        the softmax loss, the gradient operators and a clipped SGD-style
        update for every parameter that has a gradient.

        Sets ``self.encoder_inputs``, ``self.encoder_lengths``,
        ``self.decoder_inputs``, ``self.decoder_lengths``, ``self.targets``,
        ``self.target_weights``, ``self.learning_rate``, ``self.global_step``,
        ``self.start_time``, ``self.total_loss_scalar``, ``self.forward_net``
        and ``self.model``.

        Args:
            init_params: forwarded to ``seq2seq_util.ModelHelper``.

        Raises:
            AssertionError: for an unsupported attention type, more than one
                GPU, or more than one encoder/decoder layer.
        """
        model = seq2seq_util.ModelHelper(init_params=init_params, )

        self.encoder_inputs = model.net.AddExternalInput('encoder_inputs')
        self.encoder_lengths = model.net.AddExternalInput('encoder_lengths')
        self.decoder_inputs = model.net.AddExternalInput('decoder_inputs')
        self.decoder_lengths = model.net.AddExternalInput('decoder_lengths')
        self.targets = model.net.AddExternalInput('targets')
        self.target_weights = model.net.AddExternalInput('target_weights')

        optimizer_params = self.model_params['optimizer_params']
        attention_type = self.model_params['attention']
        assert attention_type in ['none', 'regular']

        self.learning_rate = model.AddParam(
            name='learning_rate',
            init_value=float(optimizer_params['learning_rate']),
            trainable=False,
        )
        self.global_step = model.AddParam(
            name='global_step',
            init_value=0,
            trainable=False,
        )
        self.start_time = model.AddParam(
            name='start_time',
            init_value=time.time(),
            trainable=False,
        )

        # Only single-device, single-layer models are supported here.
        assert self.num_gpus < 2
        assert len(self.encoder_params['encoder_layer_configs']) == 1
        assert len(self.model_params['decoder_layer_configs']) == 1

        encoder_num_units = (
            self.encoder_params['encoder_layer_configs'][0]['num_units'])
        decoder_num_units = (
            self.model_params['decoder_layer_configs'][0]['num_units'])

        (
            encoder_outputs,
            final_encoder_hidden_state,
            final_encoder_cell_state,
        ) = self._embedding_encoder(
            model=model,
            encoder_type=self.encoder_type,
            encoder_params=self.encoder_params,
            inputs=self.encoder_inputs,
            input_lengths=self.encoder_lengths,
            vocab_size=self.source_vocab_size,
            embedding_size=self.model_params['encoder_embedding_size'],
            use_attention=(attention_type != 'none'),
        )

        # For bidirectional RNN, the num of units doubles after encoding
        if (self.encoder_type == 'rnn'
                and self.encoder_params['use_bidirectional_encoder']):
            encoder_num_units *= 2

        if attention_type == 'none':
            # Without attention the decoder is seeded from the encoder's
            # final states, projected to the decoder size.
            decoder_initial_hidden_state = model.FC(
                final_encoder_hidden_state,
                'decoder_initial_hidden_state',
                encoder_num_units,
                decoder_num_units,
                axis=2,
            )
            decoder_initial_cell_state = model.FC(
                final_encoder_cell_state,
                'decoder_initial_cell_state',
                encoder_num_units,
                decoder_num_units,
                axis=2,
            )
        else:
            # With attention the decoder starts from zero-filled states.
            decoder_initial_hidden_state = model.param_init_net.ConstantFill(
                [],
                'decoder_initial_hidden_state',
                shape=[decoder_num_units],
                value=0.0,
            )
            decoder_initial_cell_state = model.param_init_net.ConstantFill(
                [],
                'decoder_initial_cell_state',
                shape=[decoder_num_units],
                value=0.0,
            )
            initial_attention_weighted_encoder_context = (
                model.param_init_net.ConstantFill(
                    [],
                    'initial_attention_weighted_encoder_context',
                    shape=[encoder_num_units],
                    value=0.0,
                ))

        sqrt3 = math.sqrt(3)
        decoder_embeddings = model.AddParam(
            name='decoder_embeddings',
            init=('UniformFill',
                  dict(
                      shape=[
                          self.target_vocab_size,
                          self.model_params['decoder_embedding_size'],
                      ],
                      min=-sqrt3,
                      max=sqrt3,
                  )),
        )

        embedded_decoder_inputs = model.net.Gather(
            [decoder_embeddings, self.decoder_inputs],
            ['embedded_decoder_inputs'],
        )
        # seq_len x batch_size x decoder_embedding_size
        with core.NameScope('', reset=True):
            if attention_type == 'none':
                decoder_outputs, _, _, _ = recurrent.LSTM(
                    model=model,
                    input_blob=embedded_decoder_inputs,
                    seq_lengths=self.decoder_lengths,
                    initial_states=(
                        decoder_initial_hidden_state,
                        decoder_initial_cell_state,
                    ),
                    dim_in=self.model_params['decoder_embedding_size'],
                    dim_out=decoder_num_units,
                    scope='decoder',
                    outputs_with_grads=[0],
                )
                decoder_output_size = decoder_num_units
            else:
                (decoder_outputs, _, _, _, attention_weighted_encoder_contexts,
                 _) = recurrent.LSTMWithAttention(
                     model=model,
                     decoder_inputs=embedded_decoder_inputs,
                     decoder_input_lengths=self.decoder_lengths,
                     initial_decoder_hidden_state=decoder_initial_hidden_state,
                     initial_decoder_cell_state=decoder_initial_cell_state,
                     initial_attention_weighted_encoder_context=(
                         initial_attention_weighted_encoder_context),
                     encoder_output_dim=encoder_num_units,
                     encoder_outputs=encoder_outputs,
                     decoder_input_dim=self.
                     model_params['decoder_embedding_size'],
                     decoder_state_dim=decoder_num_units,
                     scope='decoder',
                     outputs_with_grads=[0, 4],
                 )
                # Append the attention context to each decoder state.
                decoder_outputs, _ = model.net.Concat(
                    [decoder_outputs, attention_weighted_encoder_contexts],
                    [
                        'states_and_context_combination',
                        '_states_and_context_combination_concat_dims',
                    ],
                    axis=2,
                )
                decoder_output_size = decoder_num_units + encoder_num_units

        # we do softmax over the whole sequence
        # (max_length in the batch * batch_size) x decoder embedding size
        # -1 because we don't know max_length yet
        decoder_outputs_flattened, _ = model.net.Reshape(
            [decoder_outputs],
            [
                'decoder_outputs_flattened',
                'decoder_outputs_and_contexts_combination_old_shape',
            ],
            shape=[-1, decoder_output_size],
        )
        output_logits = self.output_projection(
            model=model,
            decoder_outputs=decoder_outputs_flattened,
            decoder_output_size=decoder_output_size,
            target_vocab_size=self.target_vocab_size,
            decoder_softmax_size=self.model_params['decoder_softmax_size'],
        )
        targets, _ = model.net.Reshape(
            [self.targets],
            ['targets', 'targets_old_shape'],
            shape=[-1],
        )
        target_weights, _ = model.net.Reshape(
            [self.target_weights],
            ['target_weights', 'target_weights_old_shape'],
            shape=[-1],
        )

        output_probs, loss_per_word = model.net.SoftmaxWithLoss(
            [output_logits, targets, target_weights],
            ['OutputProbs', 'loss_per_word'],
        )

        num_words = model.net.ReduceFrontSum(
            target_weights,
            'num_words',
        )
        self.total_loss_scalar = model.net.Mul(
            [loss_per_word, num_words],
            'total_loss_scalar',
        )
        # The forward-only net is cloned here, before any gradient or update
        # operators are added to model.net.
        self.forward_net = model.net.Clone(name=model.net.Name() +
                                           '_forward_only', )
        # print loss only in the forward net which evaluates loss after every
        # epoch
        self.forward_net.Print([self.total_loss_scalar], [])

        # Note: average over batch.
        # It is tricky because of two problems:
        # 1. ReduceFrontSum from 1-D tensor returns 0-D tensor
        # 2. If you want to multiply 0-D by 1-D tensor
        # (by scalar batch_size_inverse_tensor),
        # you need to use broadcasting. But gradient propagation
        # is broken for op with broadcasting.
        # total_loss_scalar, _ = model.net.Reshape(
        #     [total_loss_scalar],
        #     [total_loss_scalar, 'total_loss_scalar_old_shape'],
        #     shape=[1],
        # )
        batch_size_inverse_tensor = (model.param_init_net.ConstantFill(
            [],
            'batch_size_tensor',
            shape=[],
            value=1.0 / self.batch_size,
        ))
        total_loss_scalar_average = model.net.Mul(
            [self.total_loss_scalar, batch_size_inverse_tensor],
            ['total_loss_scalar_average'],
        )

        model.AddGradientOperators([
            total_loss_scalar_average,
        ])
        ONE = model.param_init_net.ConstantFill(
            [],
            'ONE',
            shape=[1],
            value=1.0,
        )
        logger.info('All trainable variables: ')

        # Gradients are clipped element-wise to the symmetric range
        # [-max_grad_value, max_grad_value].  A lower bound of 0.0 would zero
        # out every negative gradient component.
        max_grad_value = float(self.model_params['max_grad_value'])

        for param in model.params:
            # Skip parameters without a gradient; the membership test has to
            # come before the lookup to avoid a KeyError.
            if param not in model.param_to_grad:
                continue
            param_grad = model.param_to_grad[param]
            if isinstance(param_grad, core.GradientSlice):
                # Sparse gradient: clip the values in place and apply them
                # only at the touched indices.
                param_grad_values = param_grad.values
                param_grad_values = model.net.Clip(
                    [param_grad_values],
                    [param_grad_values],
                    min=-max_grad_value,
                    max=max_grad_value,
                )
                model.net.ScatterWeightedSum(
                    [
                        param,
                        ONE,
                        param_grad.indices,
                        param_grad_values,
                        model.net.Negative(
                            [self.learning_rate],
                            'negative_learning_rate',
                        ),
                    ],
                    param,
                )
            else:
                # Dense gradient: param <- param - learning_rate * grad.
                param_grad = model.net.Clip(
                    [param_grad],
                    [param_grad],
                    min=-max_grad_value,
                    max=max_grad_value,
                )
                model.net.WeightedSum(
                    [
                        param,
                        ONE,
                        param_grad,
                        model.net.Negative(
                            [self.learning_rate],
                            'negative_learning_rate',
                        ),
                    ],
                    param,
                )
        self.model = model
Пример #9
0
    def test_lstm(self, t, n, d):
        """Check the recurrent LSTM op against a NumPy reference.

        Builds an LSTM on external inputs, takes the last operator of the
        net, and runs reference and gradient checks on it with random data
        of shape ``(t, n, 4 * d)`` for the input and ``(1, n, d)`` for the
        initial states.
        """
        model = ModelHelperBase(name='external')

        input_blob, seq_lengths, hidden_init, cell_init = (
            model.net.AddExternalInputs(
                'input_blob', 'seq_lengths', 'hidden_init', 'cell_init'))

        recurrent.LSTM(
            model,
            input_blob,
            seq_lengths,
            (hidden_init, cell_init),
            d,
            d,
            scope="external/recurrent",
        )

        # The operator under test is the last one appended to the net.
        op = model.net._net.op[-1]

        def extract_param_name(model, param_substr):
            # Exactly one model parameter must contain the substring.
            matches = [
                str(p) for p in model.params if param_substr in str(p)
            ]
            assert len(matches) == 1
            return matches[0]

        gates = {}
        for gate in ["gates_t_b", "gates_t_w"]:
            gates[gate] = extract_param_name(model, gate)
        workspace.RunNetOnce(model.param_init_net)

        def reference(input, hidden_input, cell_input, gates_w, gates_b,
                      seq_lengths):
            # Step-by-step NumPy LSTM; index 0 of each buffer holds the
            # initial state, indices 1..T the per-step results.
            T, N, G = input.shape[0], input.shape[1], input.shape[2]
            D = hidden_input.shape[2]
            hidden = np.zeros(shape=(T + 1, N, D))
            cell = np.zeros(shape=(T + 1, N, D))
            assert hidden.shape[0] == T + 1 and hidden.shape[1] == N
            assert cell.shape[0] == T + 1 and cell.shape[1] == N
            cell[0, :, :] = cell_input
            hidden[0, :, :] = hidden_input
            for step in range(T):
                timestep = np.asarray([step]).astype(np.int32)
                input_t = input[step].reshape(1, N, G)
                hidden_t_prev = hidden[step].reshape(1, N, D)
                cell_t_prev = cell[step].reshape(1, N, D)
                step_gates = np.dot(hidden_t_prev, gates_w.T) + gates_b
                step_gates = step_gates + input_t
                hidden_t, cell_t = lstm_unit(
                    cell_t_prev, step_gates, seq_lengths, timestep)
                hidden[step + 1] = hidden_t
                cell[step + 1] = cell_t
            last_hidden = hidden[-1].reshape(1, N, D)
            last_cell = cell[-1].reshape(1, N, D)
            return (hidden[1:], last_hidden, cell[1:], last_cell)

        input_blob = op.input[0]

        # Random test data; the order of the draws is kept fixed.
        workspace.FeedBlob(
            str(input_blob),
            np.random.randn(t, n, d * 4).astype(np.float32))
        workspace.FeedBlob(
            "hidden_init", np.random.randn(1, n, d).astype(np.float32))
        workspace.FeedBlob(
            "cell_init", np.random.randn(1, n, d).astype(np.float32))
        workspace.FeedBlob(
            "seq_lengths",
            np.random.randint(0, t, size=(n, )).astype(np.int32))

        op_input_names = [
            input_blob, "hidden_init", "cell_init", gates["gates_t_w"],
            gates["gates_t_b"], "seq_lengths"
        ]

        def fetch_op_inputs():
            # Fetched afresh for every check, as separate arrays.
            return [workspace.FetchBlob(name) for name in op_input_names]

        self.assertReferenceChecks(
            hu.cpu_do,
            op,
            fetch_op_inputs(),
            reference,
        )

        # Checking for input, gates_t_w and gates_t_b gradients
        for param in [0, 3, 4]:
            self.assertGradientChecks(
                hu.cpu_do,
                op,
                fetch_op_inputs(),
                param,
                [0],
                threshold=0.01,
            )
Пример #10
0
    def model_build_fun(self, model, forward_only=False, loss_scale=None):
        """Add the seq2seq encoder, decoder and loss operators to ``model``.

        Args:
            model: model helper whose nets receive the operators.
            forward_only: forwarded to ``self._build_embedding_encoder``.
            loss_scale: accepted but not used in this body.

        Returns:
            A one-element list holding the batch-size-scaled total loss blob.

        Raises:
            AssertionError: for an unsupported attention type or more than
                one decoder layer.
        """
        # External inputs are named under the current workspace name scope.
        encoder_inputs = model.net.AddExternalInput(
            workspace.GetNameScope() + 'encoder_inputs', )
        encoder_lengths = model.net.AddExternalInput(
            workspace.GetNameScope() + 'encoder_lengths', )
        decoder_inputs = model.net.AddExternalInput(
            workspace.GetNameScope() + 'decoder_inputs', )
        decoder_lengths = model.net.AddExternalInput(
            workspace.GetNameScope() + 'decoder_lengths', )
        targets = model.net.AddExternalInput(
            workspace.GetNameScope() + 'targets', )
        target_weights = model.net.AddExternalInput(
            workspace.GetNameScope() + 'target_weights', )
        attention_type = self.model_params['attention']
        assert attention_type in ['none', 'regular']

        # NOTE: weighted_encoder_outputs is unpacked but not used below.
        (
            encoder_outputs,
            weighted_encoder_outputs,
            final_encoder_hidden_state,
            final_encoder_cell_state,
            encoder_output_dim,
        ) = self._build_embedding_encoder(
            model=model,
            inputs=encoder_inputs,
            input_lengths=encoder_lengths,
            vocab_size=self.source_vocab_size,
            embeddings=self.encoder_embeddings,
            embedding_size=self.model_params['encoder_embedding_size'],
            use_attention=(attention_type != 'none'),
            num_gpus=self.num_gpus,
            forward_only=forward_only,
        )

        # Only a single decoder layer is supported.
        assert len(self.model_params['decoder_layer_configs']) == 1
        decoder_num_units = (
            self.model_params['decoder_layer_configs'][0]['num_units'])

        if attention_type == 'none':
            # Without attention the decoder is seeded from the encoder's
            # final states, projected to the decoder size.
            decoder_initial_hidden_state = model.FC(
                final_encoder_hidden_state,
                'decoder_initial_hidden_state',
                encoder_output_dim,
                decoder_num_units,
                axis=2,
            )
            decoder_initial_cell_state = model.FC(
                final_encoder_cell_state,
                'decoder_initial_cell_state',
                encoder_output_dim,
                decoder_num_units,
                axis=2,
            )
        else:
            # With attention the decoder starts from zero-filled states.
            decoder_initial_hidden_state = model.param_init_net.ConstantFill(
                [],
                'decoder_initial_hidden_state',
                shape=[decoder_num_units],
                value=0.0,
            )
            decoder_initial_cell_state = model.param_init_net.ConstantFill(
                [],
                'decoder_initial_cell_state',
                shape=[decoder_num_units],
                value=0.0,
            )
            initial_attention_weighted_encoder_context = (
                model.param_init_net.ConstantFill(
                    [],
                    'initial_attention_weighted_encoder_context',
                    shape=[encoder_output_dim],
                    value=0.0,
                ))

        if self.num_gpus == 0:
            embedded_decoder_inputs = model.net.Gather(
                [self.decoder_embeddings, decoder_inputs],
                ['embedded_decoder_inputs'],
            )
        else:
            # With GPUs, the Gather is placed under a CPU device scope and
            # its result is then copied to the GPU.
            with core.DeviceScope(core.DeviceOption(caffe2_pb2.CPU)):
                embedded_decoder_inputs_cpu = model.net.Gather(
                    [self.decoder_embeddings, decoder_inputs],
                    ['embedded_decoder_inputs_cpu'],
                )
            embedded_decoder_inputs = model.CopyCPUToGPU(
                embedded_decoder_inputs_cpu,
                'embedded_decoder_inputs',
            )

        # seq_len x batch_size x decoder_embedding_size
        if attention_type == 'none':
            decoder_outputs, _, _, _ = recurrent.LSTM(
                model=model,
                input_blob=embedded_decoder_inputs,
                seq_lengths=decoder_lengths,
                initial_states=(
                    decoder_initial_hidden_state,
                    decoder_initial_cell_state,
                ),
                dim_in=self.model_params['decoder_embedding_size'],
                dim_out=decoder_num_units,
                scope='decoder',
                outputs_with_grads=[0],
            )
            decoder_output_size = decoder_num_units
        else:
            (decoder_outputs, _, _, _, attention_weighted_encoder_contexts,
             _) = recurrent.LSTMWithAttention(
                 model=model,
                 decoder_inputs=embedded_decoder_inputs,
                 decoder_input_lengths=decoder_lengths,
                 initial_decoder_hidden_state=decoder_initial_hidden_state,
                 initial_decoder_cell_state=decoder_initial_cell_state,
                 initial_attention_weighted_encoder_context=(
                     initial_attention_weighted_encoder_context),
                 encoder_output_dim=encoder_output_dim,
                 encoder_outputs=encoder_outputs,
                 decoder_input_dim=self.model_params['decoder_embedding_size'],
                 decoder_state_dim=decoder_num_units,
                 scope='decoder',
                 outputs_with_grads=[0, 4],
             )
            # Append the attention context to each decoder state.
            decoder_outputs, _ = model.net.Concat(
                [decoder_outputs, attention_weighted_encoder_contexts],
                [
                    'states_and_context_combination',
                    '_states_and_context_combination_concat_dims',
                ],
                axis=2,
            )
            decoder_output_size = decoder_num_units + encoder_output_dim

        # we do softmax over the whole sequence
        # (max_length in the batch * batch_size) x decoder embedding size
        # -1 because we don't know max_length yet
        decoder_outputs_flattened, _ = model.net.Reshape(
            [decoder_outputs],
            [
                'decoder_outputs_flattened',
                'decoder_outputs_and_contexts_combination_old_shape',
            ],
            shape=[-1, decoder_output_size],
        )
        output_logits = self.output_projection(
            model=model,
            decoder_outputs=decoder_outputs_flattened,
            decoder_output_size=decoder_output_size,
            target_vocab_size=self.target_vocab_size,
            decoder_softmax_size=self.model_params['decoder_softmax_size'],
        )
        # Flatten targets and their weights to 1-D.
        targets, _ = model.net.Reshape(
            [targets],
            ['targets', 'targets_old_shape'],
            shape=[-1],
        )
        target_weights, _ = model.net.Reshape(
            [target_weights],
            ['target_weights', 'target_weights_old_shape'],
            shape=[-1],
        )
        # The CUDNN softmax engine is requested only when GPUs are in use.
        output_probs = model.net.Softmax(
            [output_logits],
            ['output_probs'],
            engine=('CUDNN' if self.num_gpus > 0 else None),
        )
        # Loss: weighted label cross-entropy, summed over all elements and
        # scaled by 1 / batch_size.
        label_cross_entropy = model.net.LabelCrossEntropy(
            [output_probs, targets],
            ['label_cross_entropy'],
        )
        weighted_label_cross_entropy = model.net.Mul(
            [label_cross_entropy, target_weights],
            'weighted_label_cross_entropy',
        )
        total_loss_scalar = model.net.SumElements(
            [weighted_label_cross_entropy],
            'total_loss_scalar',
        )
        total_loss_scalar_weighted = model.net.Scale(
            [total_loss_scalar],
            'total_loss_scalar_weighted',
            scale=1.0 / self.batch_size,
        )
        return [total_loss_scalar_weighted]