def test_crf_with_loss_op(self, num_tags, num_words):
    model = CNNModelHelper(name='external')
    embeddings_dim = 200
    embeddings = np.random.randn(num_words, embeddings_dim).astype(np.float32)
    transitions = np.random.uniform(
        low=-1, high=1, size=(num_tags + 2, num_tags + 2)
    ).astype(np.float32)
    labels = np.random.randint(num_tags, size=(num_words)).astype(np.int64)
    embeddings_blob, labels_blob, transitions_blob = (
        model.net.AddExternalInputs(
            'embeddings_blob', 'labels_blob', 'crf_transitions'))
    workspace.FeedBlob(str(embeddings_blob), embeddings)
    workspace.FeedBlob(str(labels_blob), labels)
    workspace.FeedBlob(str(transitions_blob), transitions)
    predictions_blob = model.FC(
        embeddings_blob, "fc_0", embeddings_dim, num_tags,
        ('UniformFill', {'min': -1.0}, {'max': 1.0}),
        ('UniformFill', {'min': -1.0}, {'max': 1.0}),
    )
    crf_layer = crf.CRFWithLoss(model, num_tags, transitions_blob)
    crf_loss = crf_layer.crf_loss(predictions_blob, labels_blob)
    model.net.AddGradientOperators([crf_loss])
    workspace.RunNetOnce(model.param_init_net)
    workspace.RunNetOnce(model.net)
    loss = workspace.FetchBlob(str(crf_loss))
    predictions = workspace.FetchBlob(str(predictions_blob))
    np.testing.assert_allclose(
        loss,
        self._compute_loss_manual(predictions, num_tags, labels, transitions),
        atol=0.001,
        rtol=0.001,
        err_msg='CRF LOSS is not matching the reference')
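
# The reference used above, self._compute_loss_manual, is not shown in this
# section. The following is a hypothetical sketch (not the actual helper) of
# how a linear-chain CRF negative log-likelihood can be computed in numpy:
# log-partition via the forward algorithm minus the gold-path score. It
# assumes plain [num_words, num_tags] emissions, a [num_tags, num_tags]
# transition matrix, and that scipy is available; it ignores the start/stop
# tag padding that crf.CRFWithLoss introduces (hence num_tags + 2 above).
def _crf_nll_sketch(self, emissions, labels, transitions):
    from scipy.special import logsumexp
    num_words = emissions.shape[0]
    # Score of the gold tag sequence: unary emission terms plus transitions.
    gold_score = emissions[np.arange(num_words), labels].sum()
    gold_score += sum(transitions[labels[t], labels[t + 1]]
                      for t in range(num_words - 1))
    # Log partition function via the forward algorithm.
    alpha = emissions[0]
    for t in range(1, num_words):
        alpha = emissions[t] + logsumexp(alpha[:, None] + transitions, axis=0)
    return logsumexp(alpha) - gold_score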
def MILSTM(model, input_blob, seq_lengths, initial_states, dim_in, dim_out,
           scope, outputs_with_grads=(0, ), memory_optimization=False,
           forget_bias=0.0):
    '''
    Adds MI flavor of standard LSTM recurrent network operator to a model.
    See https://arxiv.org/pdf/1606.06630.pdf

    model: CNNModelHelper object new operators would be added to

    input_blob: the input sequence in a format T x N x D
    where T is sequence size, N - batch size and D - input dimension

    seq_lengths: blob containing sequence lengths which would be passed to
    LSTMUnit operator

    initial_states: a tuple of (hidden_input_blob, cell_input_blob)
    which are going to be inputs to the cell net on the first iteration

    dim_in: input dimension

    dim_out: output dimension

    outputs_with_grads: position indices of output blobs which will receive
    external error gradient during backpropagation

    memory_optimization: if enabled, the LSTM step is recomputed on backward
    step so that we don't need to store forward activations for each timestep.
    Saves memory at the cost of computation.
    '''
    def s(name):
        # We have to manually scope due to our internal/external blob
        # relationships.
        return "{}/{}".format(str(scope), str(name))

    """ initial bulk fully-connected """
    input_blob = model.FC(input_blob, s('i2h'),
                          dim_in=dim_in, dim_out=4 * dim_out, axis=2)

    """ the step net """
    step_model = CNNModelHelper(name='milstm_cell', param_model=model)
    input_t, timestep, cell_t_prev, hidden_t_prev = (
        step_model.net.AddScopedExternalInputs(
            'input_t', 'timestep', 'cell_t_prev', 'hidden_t_prev'))

    # hU^T
    # Shape: [1, batch_size, 4 * hidden_size]
    prev_t = step_model.FC(hidden_t_prev, s('prev_t'),
                           dim_in=dim_out, dim_out=4 * dim_out, axis=2)

    # defining MI parameters
    alpha = step_model.param_init_net.ConstantFill(
        [], [s('alpha')], shape=[4 * dim_out], value=1.0)
    beta1 = step_model.param_init_net.ConstantFill(
        [], [s('beta1')], shape=[4 * dim_out], value=1.0)
    beta2 = step_model.param_init_net.ConstantFill(
        [], [s('beta2')], shape=[4 * dim_out], value=1.0)
    b = step_model.param_init_net.ConstantFill(
        [], [s('b')], shape=[4 * dim_out], value=0.0)
    model.params.extend([alpha, beta1, beta2, b])

    # alpha * (xW^T * hU^T)
    # Shape: [1, batch_size, 4 * hidden_size]
    alpha_tdash = step_model.net.Mul([prev_t, input_t], s('alpha_tdash'))
    # Shape: [batch_size, 4 * hidden_size]
    alpha_tdash_rs, _ = step_model.net.Reshape(
        alpha_tdash,
        [s('alpha_tdash_rs'), s('alpha_tdash_old_shape')],
        shape=[-1, 4 * dim_out],
    )
    alpha_t = step_model.net.Mul([alpha_tdash_rs, alpha], s('alpha_t'),
                                 broadcast=1, use_grad_hack=1)

    # beta1 * hU^T
    # Shape: [batch_size, 4 * hidden_size]
    prev_t_rs, _ = step_model.net.Reshape(
        prev_t,
        [s('prev_t_rs'), s('prev_t_old_shape')],
        shape=[-1, 4 * dim_out],
    )
    beta1_t = step_model.net.Mul([prev_t_rs, beta1], s('beta1_t'),
                                 broadcast=1, use_grad_hack=1)

    # beta2 * xW^T
    # Shape: [batch_size, 4 * hidden_size]
    input_t_rs, _ = step_model.net.Reshape(
        input_t,
        [s('input_t_rs'), s('input_t_old_shape')],
        shape=[-1, 4 * dim_out],
    )
    beta2_t = step_model.net.Mul([input_t_rs, beta2], s('beta2_t'),
                                 broadcast=1, use_grad_hack=1)

    # Add 'em all up
    gates_tdash = step_model.net.Sum([alpha_t, beta1_t, beta2_t],
                                     s('gates_tdash'))
    gates_t = step_model.net.Add([gates_tdash, b], s('gates_t'),
                                 broadcast=1, use_grad_hack=1)
    # Shape: [1, batch_size, 4 * hidden_size]
    gates_t_rs, _ = step_model.net.Reshape(
        gates_t,
        [s('gates_t_rs'), s('gates_t_old_shape')],
        shape=[1, -1, 4 * dim_out],
    )

    hidden_t, cell_t = step_model.net.LSTMUnit(
        [hidden_t_prev, cell_t_prev, gates_t_rs, seq_lengths, timestep],
        [s('hidden_t'), s('cell_t')],
        forget_bias=forget_bias,
    )
    step_model.net.AddExternalOutputs(cell_t, hidden_t)

    """ recurrent network """
    (hidden_input_blob, cell_input_blob) = initial_states
    output, last_output, all_states, last_state = recurrent_net(
        net=model.net,
        cell_net=step_model.net,
        inputs=[(input_t, input_blob)],
        initial_cell_inputs=[
            (hidden_t_prev, hidden_input_blob),
            (cell_t_prev, cell_input_blob),
        ],
        links={
            hidden_t_prev: hidden_t,
            cell_t_prev: cell_t,
        },
        timestep=timestep,
        scope=scope,
        outputs_with_grads=outputs_with_grads,
        recompute_blobs_on_backward=[gates_t] if memory_optimization else None,
    )
    return output, last_output, all_states, last_state
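
# A hypothetical usage sketch of MILSTM (blob names, dimensions and the
# surrounding setup below are assumptions, not taken from this file). As the
# step net above shows, the gates are formed multiplicatively as
# alpha * (xW^T * hU^T) + beta1 * hU^T + beta2 * xW^T + b before the LSTMUnit;
# the caller only has to provide a [T, N, dim_in] sequence, per-example
# lengths, and the initial hidden/cell states.
example_model = CNNModelHelper(name='milstm_example')
seq, lengths, h0, c0 = example_model.net.AddExternalInputs(
    'seq', 'lengths', 'h0', 'c0')
output, last_output, all_states, last_state = MILSTM(
    example_model,
    input_blob=seq,           # [T, N, 128]
    seq_lengths=lengths,      # [N]
    initial_states=(h0, c0),  # each [1, N, 256]
    dim_in=128,
    dim_out=256,
    scope='milstm_layer',
)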
def LSTMWithAttention(
    model,
    decoder_inputs,
    decoder_input_lengths,
    initial_decoder_hidden_state,
    initial_decoder_cell_state,
    initial_attention_weighted_encoder_context,
    encoder_output_dim,
    encoder_outputs,
    decoder_input_dim,
    decoder_state_dim,
    scope,
    attention_type=AttentionType.Regular,
    outputs_with_grads=(0, 4),
    weighted_encoder_outputs=None,
    lstm_memory_optimization=False,
    attention_memory_optimization=False,
    forget_bias=0.0,
):
    '''
    Adds an LSTM with attention mechanism to a model.

    The implementation is based on https://arxiv.org/abs/1409.0473, with
    a small difference in the order in which we compute the new attention
    context and the new hidden state, similarly to
    https://arxiv.org/abs/1508.04025.

    The model uses encoder-decoder naming conventions, where the decoder is
    the sequence the op is iterating over, while computing the attention
    context over the encoder.

    model: CNNModelHelper object new operators would be added to

    decoder_inputs: the input sequence in a format T x N x D
    where T is sequence size, N - batch size and D - input dimension

    decoder_input_lengths: blob containing sequence lengths
    which would be passed to LSTMUnit operator

    initial_decoder_hidden_state: initial hidden state of LSTM

    initial_decoder_cell_state: initial cell state of LSTM

    initial_attention_weighted_encoder_context: initial attention context

    encoder_output_dim: dimension of encoder outputs

    encoder_outputs: the sequence, on which we compute the attention context
    at every iteration

    decoder_input_dim: input dimension (last dimension on decoder_inputs)

    decoder_state_dim: size of hidden states of LSTM

    attention_type: One of: AttentionType.Regular, AttentionType.Recurrent.
    Determines which type of attention mechanism to use.

    outputs_with_grads: position indices of output blobs which will receive
    external error gradient during backpropagation

    weighted_encoder_outputs: encoder outputs to be used to compute attention
    weights. In the basic case it's just a linear transformation of encoder
    outputs (that's the default, when weighted_encoder_outputs is None).
    However, it can be something more complicated - like a separate encoder
    network (for example, in case of a convolutional encoder)

    lstm_memory_optimization: recompute LSTM activations on backward pass, so
    we don't need to store their values in forward passes

    attention_memory_optimization: recompute attention for backward pass
    '''
    def s(name):
        # We have to manually scope due to our internal/external blob
        # relationships.
        return "{}/{}".format(str(scope), str(name))

    decoder_inputs = model.FC(
        decoder_inputs,
        s('i2h'),
        dim_in=decoder_input_dim,
        dim_out=4 * decoder_state_dim,
        axis=2,
    )
    # [batch_size, encoder_output_dim, encoder_length]
    encoder_outputs_transposed = model.Transpose(
        encoder_outputs,
        s('encoder_outputs_transposed'),
        axes=[1, 2, 0],
    )
    if weighted_encoder_outputs is None:
        weighted_encoder_outputs = model.FC(
            encoder_outputs,
            s('weighted_encoder_outputs'),
            dim_in=encoder_output_dim,
            dim_out=encoder_output_dim,
            axis=2,
        )
    step_model = CNNModelHelper(
        name='lstm_with_attention_cell',
        param_model=model,
    )
    (
        input_t,
        timestep,
        cell_t_prev,
        hidden_t_prev,
        attention_weighted_encoder_context_t_prev,
    ) = (
        step_model.net.AddScopedExternalInputs(
            'input_t',
            'timestep',
            'cell_t_prev',
            'hidden_t_prev',
            'attention_weighted_encoder_context_t_prev',
        )
    )
    step_model.net.AddExternalInputs(encoder_outputs_transposed,
                                     weighted_encoder_outputs)

    gates_concatenated_input_t, _ = step_model.net.Concat(
        [hidden_t_prev, attention_weighted_encoder_context_t_prev],
        [
            s('gates_concatenated_input_t'),
            s('_gates_concatenated_input_t_concat_dims'),
        ],
        axis=2,
    )
    gates_t = step_model.FC(
        gates_concatenated_input_t,
        s('gates_t'),
        dim_in=decoder_state_dim + encoder_output_dim,
        dim_out=4 * decoder_state_dim,
        axis=2,
    )
    step_model.net.Sum([gates_t, input_t], gates_t)

    hidden_t_intermediate, cell_t = step_model.net.LSTMUnit(
        [hidden_t_prev, cell_t_prev, gates_t, decoder_input_lengths, timestep],
        ['hidden_t_intermediate', s('cell_t')],
        forget_bias=forget_bias,
    )
    if attention_type == AttentionType.Recurrent:
        attention_weighted_encoder_context_t, _, attention_blobs = \
            apply_recurrent_attention(
                model=step_model,
                encoder_output_dim=encoder_output_dim,
                encoder_outputs_transposed=encoder_outputs_transposed,
                weighted_encoder_outputs=weighted_encoder_outputs,
                decoder_hidden_state_t=hidden_t_intermediate,
                decoder_hidden_state_dim=decoder_state_dim,
                scope=scope,
                attention_weighted_encoder_context_t_prev=(
                    attention_weighted_encoder_context_t_prev
                ),
            )
    else:
        attention_weighted_encoder_context_t, _, attention_blobs = \
            apply_regular_attention(
                model=step_model,
                encoder_output_dim=encoder_output_dim,
                encoder_outputs_transposed=encoder_outputs_transposed,
                weighted_encoder_outputs=weighted_encoder_outputs,
                decoder_hidden_state_t=hidden_t_intermediate,
                decoder_hidden_state_dim=decoder_state_dim,
                scope=scope,
            )
    hidden_t = step_model.Copy(hidden_t_intermediate, s('hidden_t'))
    step_model.net.AddExternalOutputs(
        cell_t,
        hidden_t,
        attention_weighted_encoder_context_t,
    )
    recompute_blobs = []
    if attention_memory_optimization:
        recompute_blobs.extend(attention_blobs)
    if lstm_memory_optimization:
        recompute_blobs.extend([gates_t])

    return recurrent_net(
        net=model.net,
        cell_net=step_model.net,
        inputs=[
            (input_t, decoder_inputs),
        ],
        initial_cell_inputs=[
            (hidden_t_prev, initial_decoder_hidden_state),
            (cell_t_prev, initial_decoder_cell_state),
            (
                attention_weighted_encoder_context_t_prev,
                initial_attention_weighted_encoder_context,
            ),
        ],
        links={
            hidden_t_prev: hidden_t,
            cell_t_prev: cell_t,
            attention_weighted_encoder_context_t_prev: (
                attention_weighted_encoder_context_t
            ),
        },
        timestep=timestep,
        scope=scope,
        outputs_with_grads=outputs_with_grads,
        recompute_blobs_on_backward=recompute_blobs,
    )
def LSTM(model, input_blob, seq_lengths, initial_states, dim_in, dim_out,
         scope, outputs_with_grads=(0, ), return_params=False,
         memory_optimization=False, forget_bias=0.0):
    '''
    Adds a standard LSTM recurrent network operator to a model.

    model: CNNModelHelper object new operators would be added to

    input_blob: the input sequence in a format T x N x D
    where T is sequence size, N - batch size and D - input dimension

    seq_lengths: blob containing sequence lengths which would be passed to
    LSTMUnit operator

    initial_states: a tuple of (hidden_input_blob, cell_input_blob)
    which are going to be inputs to the cell net on the first iteration

    dim_in: input dimension

    dim_out: output dimension

    outputs_with_grads: position indices of output blobs which will receive
    external error gradient during backpropagation

    return_params: if True, will return a dictionary of parameters of the LSTM

    memory_optimization: if enabled, the LSTM step is recomputed on backward
    step so that we don't need to store forward activations for each timestep.
    Saves memory at the cost of computation.
    '''
    def s(name):
        # We have to manually scope due to our internal/external blob
        # relationships.
        return "{}/{}".format(str(scope), str(name))

    """ initial bulk fully-connected """
    input_blob = model.FC(input_blob, s('i2h'),
                          dim_in=dim_in, dim_out=4 * dim_out, axis=2)

    """ the step net """
    step_model = CNNModelHelper(name='lstm_cell', param_model=model)
    input_t, timestep, cell_t_prev, hidden_t_prev = (
        step_model.net.AddScopedExternalInputs(
            'input_t', 'timestep', 'cell_t_prev', 'hidden_t_prev'))
    gates_t = step_model.FC(hidden_t_prev, s('gates_t'),
                            dim_in=dim_out, dim_out=4 * dim_out, axis=2)
    step_model.net.Sum([gates_t, input_t], gates_t)
    hidden_t, cell_t = step_model.net.LSTMUnit(
        [hidden_t_prev, cell_t_prev, gates_t, seq_lengths, timestep],
        [s('hidden_t'), s('cell_t')],
        forget_bias=forget_bias,
    )
    step_model.net.AddExternalOutputs(cell_t, hidden_t)

    """ recurrent network """
    (hidden_input_blob, cell_input_blob) = initial_states
    output, last_output, all_states, last_state = recurrent_net(
        net=model.net,
        cell_net=step_model.net,
        inputs=[(input_t, input_blob)],
        initial_cell_inputs=[
            (hidden_t_prev, hidden_input_blob),
            (cell_t_prev, cell_input_blob),
        ],
        links={
            hidden_t_prev: hidden_t,
            cell_t_prev: cell_t,
        },
        timestep=timestep,
        scope=scope,
        outputs_with_grads=outputs_with_grads,
        recompute_blobs_on_backward=[gates_t] if memory_optimization else None,
    )
    if return_params:
        params = {
            'input': {'weights': input_blob + "_w", 'biases': input_blob + '_b'},
            'recurrent': {'weights': gates_t + "_w", 'biases': gates_t + '_b'},
        }
        return output, last_output, all_states, last_state, params
    else:
        return output, last_output, all_states, last_state
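
# A hypothetical sketch of using return_params (blob names and dimensions are
# assumptions, not taken from this file). The returned dict points at the
# '_w'/'_b' blobs of the input ('i2h') and recurrent ('gates_t') FCs, which
# can be fetched or fed through the workspace once the init net has run.
lstm_model = CNNModelHelper(name='lstm_example')
seq, lengths, h0, c0 = lstm_model.net.AddExternalInputs(
    'seq', 'lengths', 'h0', 'c0')
output, last_output, all_states, last_state, params = LSTM(
    lstm_model, seq, lengths, (h0, c0),
    dim_in=64, dim_out=128, scope='lstm_layer',
    return_params=True,
)
workspace.RunNetOnce(lstm_model.param_init_net)
input_weights = workspace.FetchBlob(str(params['input']['weights']))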
def LSTM(model, input_blob, seq_lengths, initial_states, dim_in, dim_out,
         scope, outputs_with_grads=(0, )):
    '''
    Adds a standard LSTM recurrent network operator to a model.

    model: CNNModelHelper object new operators would be added to

    input_blob: the input sequence in a format T x N x D
    where T is sequence size, N - batch size and D - input dimension

    seq_lengths: blob containing sequence lengths which would be passed to
    LSTMUnit operator

    initial_states: a tuple of (hidden_input_blob, cell_input_blob)
    which are going to be inputs to the cell net on the first iteration

    dim_in: input dimension

    dim_out: output dimension

    outputs_with_grads: position indices of output blobs which will receive
    external error gradient during backpropagation
    '''
    def s(name):
        # We have to manually scope due to our internal/external blob
        # relationships.
        return "{}/{}".format(str(scope), str(name))

    """ initial bulk fully-connected """
    input_blob = model.FC(input_blob, s('i2h'),
                          dim_in=dim_in, dim_out=4 * dim_out, axis=2)

    """ the step net """
    step_model = CNNModelHelper(name='lstm_cell', param_model=model)
    input_t, timestep, cell_t_prev, hidden_t_prev = (
        step_model.net.AddScopedExternalInputs(
            'input_t', 'timestep', 'cell_t_prev', 'hidden_t_prev'))
    gates_t = step_model.FC(hidden_t_prev, s('gates_t'),
                            dim_in=dim_out, dim_out=4 * dim_out, axis=2)
    step_model.net.Sum([gates_t, input_t], gates_t)
    hidden_t, cell_t = step_model.net.LSTMUnit(
        [hidden_t_prev, cell_t_prev, gates_t, seq_lengths, timestep],
        [s('hidden_t'), s('cell_t')],
    )
    step_model.net.AddExternalOutputs(cell_t, hidden_t)

    """ recurrent network """
    (hidden_input_blob, cell_input_blob) = initial_states
    output, last_output, all_states, last_state = recurrent_net(
        net=model.net,
        cell_net=step_model.net,
        inputs=[(input_t, input_blob)],
        initial_cell_inputs=[
            (hidden_t_prev, hidden_input_blob),
            (cell_t_prev, cell_input_blob),
        ],
        links={
            hidden_t_prev: hidden_t,
            cell_t_prev: cell_t,
        },
        timestep=timestep,
        scope=scope,
        outputs_with_grads=outputs_with_grads,
    )
    return output, last_output, all_states, last_state