def _test_activation_coverage(act_type):
    """
    Run a single coverage update for an activation-based coverage type and check it.

    The coverage symbol is built from a CoverageConfig whose type is `act_type`, bound on
    CPU, fed random inputs and evaluated once. The output must have the shape of the
    previous coverage, and for every sentence the number of positions with at least one
    coverage unit close to Activation(0) must equal that sentence's source length.

    :param act_type: Used both as the coverage type and as the `act_type` of
                     `mx.nd.Activation` when computing the reference value.
    """
    config_coverage = sockeye.coverage.CoverageConfig(type=act_type,
                                                      max_fertility=2,
                                                      num_hidden=2,
                                                      layer_normalization=False)
    encoder_num_hidden, decoder_num_hidden, source_seq_len, batch_size = 5, 5, 10, 4

    # Symbolic inputs; the expected shapes are listed in `input_shapes` below.
    source_sym = mx.sym.Variable("source")
    source_length_sym = mx.sym.Variable("source_length")
    prev_hidden_sym = mx.sym.Variable("prev_hidden")
    prev_coverage_sym = mx.sym.Variable("prev_coverage")
    attention_scores_sym = mx.sym.Variable("attention_scores")

    input_shapes = {
        "source": (batch_size, source_seq_len, encoder_num_hidden),
        "source_length": (batch_size,),
        "prev_hidden": (batch_size, decoder_num_hidden),
        "attention_scores": (batch_size, source_seq_len),
        "prev_coverage": (batch_size, source_seq_len, config_coverage.num_hidden),
    }

    # Random inputs, drawn in a fixed order.
    source_data = gaussian_vector(shape=input_shapes["source"])
    source_length_data = integer_vector(shape=input_shapes["source_length"], max_value=source_seq_len)
    prev_hidden_data = gaussian_vector(shape=input_shapes["prev_hidden"])
    prev_coverage_data = gaussian_vector(shape=input_shapes["prev_coverage"])
    raw_scores = uniform_vector(shape=input_shapes["attention_scores"])
    # Divides by the sum over the whole matrix (not per row), so all scores together sum to 1.
    attention_scores_data = raw_scores / np.sum(raw_scores)

    coverage = sockeye.coverage.get_coverage(config_coverage)
    coverage_func = coverage.on(source_sym, source_length_sym, source_seq_len)
    updated_coverage = coverage_func(prev_hidden_sym, attention_scores_sym, prev_coverage_sym)

    executor = updated_coverage.simple_bind(ctx=mx.cpu(), **input_shapes)
    bound_inputs = [
        ("source", source_data),
        ("source_length", source_length_data),
        ("prev_hidden", prev_hidden_data),
        ("prev_coverage", prev_coverage_data),
        ("attention_scores", attention_scores_data),
    ]
    for arg_name, arg_value in bound_inputs:
        executor.arg_dict[arg_name][:] = arg_value

    new_coverage = executor.forward()[0].asnumpy()
    assert new_coverage.shape == input_shapes["prev_coverage"]

    # Reference value: the activation applied to a zero input. It depends on the activation type.
    modulated = mx.nd.Activation(mx.nd.zeros((1, 1)), act_type=act_type).asnumpy()

    # Per position: is at least one coverage unit close to the reference value?
    close_to_reference = np.isclose(new_coverage, modulated, atol=1.e-6)
    position_has_match = np.sum(close_to_reference, axis=2) != 0
    matches_per_sentence = np.sum(position_has_match, axis=1)
    assert (matches_per_sentence == source_length_data).all()
def test_coverage_attention(attention_coverage_type,
                            attention_coverage_num_hidden,
                            batch_size=3,
                            encoder_num_hidden=2,
                            decoder_num_hidden=2):
    """
    Run one step of coverage attention and check output shapes and attention probabilities.

    :param attention_coverage_type: Coverage type passed to CoverageConfig.
    :param attention_coverage_num_hidden: Number of coverage hidden units; also the expected
                                          last dimension of the dynamic source output.
    :param batch_size: Number of sentences in the batch.
    :param encoder_num_hidden: Last dimension of the source input and of the context output.
    :param decoder_num_hidden: Last dimension of the decoder state input.
    """
    # source: (batch_size, seq_len, encoder_num_hidden)
    source_sym = mx.sym.Variable("source")
    # source_length: (batch_size,)
    source_length_sym = mx.sym.Variable("source_length")
    source_seq_len = 10

    config_coverage = sockeye.coverage.CoverageConfig(type=attention_coverage_type,
                                                      num_hidden=attention_coverage_num_hidden,
                                                      layer_normalization=False)
    config_attention = sockeye.rnn_attention.AttentionConfig(type="coverage",
                                                             num_hidden=5,
                                                             input_previous_word=False,
                                                             source_num_hidden=encoder_num_hidden,
                                                             query_num_hidden=decoder_num_hidden,
                                                             layer_normalization=False,
                                                             config_coverage=config_coverage)
    attention = sockeye.rnn_attention.get_attention(config_attention, max_seq_len=source_seq_len)

    initial_state = attention.get_initial_state(source_length_sym, source_seq_len)
    attention_func = attention.on(source_sym, source_length_sym, source_seq_len)
    word_vec_prev_sym = mx.sym.Variable("word_vec_prev")
    decoder_state_sym = mx.sym.Variable("decoder_state")
    attention_input = attention.make_input(0, word_vec_prev_sym, decoder_state_sym)
    stepped_state = attention_func(attention_input, initial_state)
    outputs_sym = mx.sym.Group([stepped_state.context, stepped_state.probs, stepped_state.dynamic_source])

    input_shapes = {
        "source": (batch_size, source_seq_len, encoder_num_hidden),
        "source_length": (batch_size,),
        "decoder_state": (batch_size, decoder_num_hidden),
    }
    executor = outputs_sym.simple_bind(ctx=mx.cpu(), **input_shapes)

    source_length_vector = integer_vector(shape=input_shapes["source_length"], max_value=source_seq_len)
    executor.arg_dict["source"][:] = gaussian_vector(shape=input_shapes["source"])
    executor.arg_dict["source_length"][:] = source_length_vector
    executor.arg_dict["decoder_state"][:] = gaussian_vector(shape=input_shapes["decoder_state"])

    forward_outputs = executor.forward()
    context_result, attention_prob_result, dynamic_source_result = [
        output.asnumpy() for output in forward_outputs[:3]
    ]

    # One expected probability per sentence (1 / length), as a column so it broadcasts over positions.
    expected_probs = (1. / source_length_vector).reshape((batch_size, 1))

    assert context_result.shape == (batch_size, encoder_num_hidden)
    assert attention_prob_result.shape == (batch_size, source_seq_len)
    assert dynamic_source_result.shape == (batch_size, source_seq_len, attention_coverage_num_hidden)

    # The number of positions whose probability is close to 1 / length must equal the length.
    close_to_expected = np.isclose(attention_prob_result, expected_probs)
    assert (np.sum(close_to_expected, axis=1) == source_length_vector).all()
def _test_gru_coverage():
    """
    Run a single coverage update for the "gru" coverage type and check the result.

    The coverage symbol is bound on CPU, fed random inputs and evaluated once. The output
    must have the shape of the previous coverage, and for every sentence the number of
    positions with at least one coverage unit different from 1 must equal that sentence's
    source length.
    """
    config_coverage = sockeye.coverage.CoverageConfig(type="gru",
                                                      num_hidden=2,
                                                      layer_normalization=False)
    encoder_num_hidden, decoder_num_hidden, source_seq_len, batch_size = 5, 5, 10, 4

    # Symbolic inputs; the expected shapes are listed in `input_shapes` below.
    source_sym = mx.sym.Variable("source")
    source_length_sym = mx.sym.Variable("source_length")
    prev_hidden_sym = mx.sym.Variable("prev_hidden")
    prev_coverage_sym = mx.sym.Variable("prev_coverage")
    attention_scores_sym = mx.sym.Variable("attention_scores")

    input_shapes = {
        "source": (batch_size, source_seq_len, encoder_num_hidden),
        "source_length": (batch_size,),
        "prev_hidden": (batch_size, decoder_num_hidden),
        "attention_scores": (batch_size, source_seq_len),
        "prev_coverage": (batch_size, source_seq_len, config_coverage.num_hidden),
    }

    # Random inputs, drawn in a fixed order.
    source_data = gaussian_vector(shape=input_shapes["source"])
    source_length_data = integer_vector(shape=input_shapes["source_length"], max_value=source_seq_len)
    prev_hidden_data = gaussian_vector(shape=input_shapes["prev_hidden"])
    prev_coverage_data = gaussian_vector(shape=input_shapes["prev_coverage"])
    raw_scores = uniform_vector(shape=input_shapes["attention_scores"])
    # Divides by the sum over the whole matrix (not per row), so all scores together sum to 1.
    attention_scores_data = raw_scores / np.sum(raw_scores)

    coverage = sockeye.coverage.get_coverage(config_coverage)
    coverage_func = coverage.on(source_sym, source_length_sym, source_seq_len)
    updated_coverage = coverage_func(prev_hidden_sym, attention_scores_sym, prev_coverage_sym)

    executor = updated_coverage.simple_bind(ctx=mx.cpu(), **input_shapes)
    bound_inputs = [
        ("source", source_data),
        ("source_length", source_length_data),
        ("prev_hidden", prev_hidden_data),
        ("prev_coverage", prev_coverage_data),
        ("attention_scores", attention_scores_data),
    ]
    for arg_name, arg_value in bound_inputs:
        executor.arg_dict[arg_name][:] = arg_value

    new_coverage = executor.forward()[0].asnumpy()
    assert new_coverage.shape == input_shapes["prev_coverage"]

    # Per position: does at least one coverage unit differ from 1?
    position_differs_from_one = np.sum(new_coverage != 1, axis=2) != 0
    differing_per_sentence = np.sum(position_differs_from_one, axis=1)
    assert (differing_per_sentence == source_length_data).all()
def _test_activation_coverage(act_type):
    """
    Check one update step of an activation-based coverage model.

    A coverage symbol of type `act_type` is bound on CPU with random inputs and run forward
    once. The test asserts that the new coverage keeps the previous coverage's shape and
    that, per sentence, the count of positions having at least one unit close to
    Activation(0) equals the sentence's source length.

    :param act_type: Coverage type for CoverageConfig, also the `act_type` given to
                     `mx.nd.Activation` for the reference value.
    """
    config_coverage = sockeye.coverage.CoverageConfig(type=act_type, num_hidden=2, layer_normalization=False)
    encoder_num_hidden, decoder_num_hidden, source_seq_len, batch_size = 5, 5, 10, 4

    # One symbol per input, created in a fixed order.
    input_names = ("source", "source_length", "prev_hidden", "prev_coverage", "attention_scores")
    symbols = {}
    for input_name in input_names:
        symbols[input_name] = mx.sym.Variable(input_name)

    shapes = dict(source=(batch_size, source_seq_len, encoder_num_hidden),
                  source_length=(batch_size,),
                  prev_hidden=(batch_size, decoder_num_hidden),
                  attention_scores=(batch_size, source_seq_len),
                  prev_coverage=(batch_size, source_seq_len, config_coverage.num_hidden))

    # Random data, generated in a fixed order.
    data = {}
    data["source"] = gaussian_vector(shape=shapes["source"])
    data["source_length"] = integer_vector(shape=shapes["source_length"], max_value=source_seq_len)
    data["prev_hidden"] = gaussian_vector(shape=shapes["prev_hidden"])
    data["prev_coverage"] = gaussian_vector(shape=shapes["prev_coverage"])
    unnormalized_scores = uniform_vector(shape=shapes["attention_scores"])
    # Normalized by the total over all entries, not per sentence.
    data["attention_scores"] = unnormalized_scores / np.sum(unnormalized_scores)

    coverage = sockeye.coverage.get_coverage(config_coverage)
    coverage_func = coverage.on(symbols["source"], symbols["source_length"], source_seq_len)
    updated_coverage = coverage_func(symbols["prev_hidden"],
                                     symbols["attention_scores"],
                                     symbols["prev_coverage"])

    executor = updated_coverage.simple_bind(ctx=mx.cpu(), **shapes)
    for input_name in input_names:
        executor.arg_dict[input_name][:] = data[input_name]

    result = executor.forward()
    new_coverage = result[0].asnumpy()
    assert new_coverage.shape == shapes["prev_coverage"]

    # The activation of a zero input serves as the reference; its value varies with the activation type.
    zero_input = mx.nd.zeros((1, 1))
    modulated = mx.nd.Activation(zero_input, act_type=act_type).asnumpy()

    unit_matches = np.isclose(new_coverage, modulated, atol=1.e-6)
    positions_with_match = np.sum(unit_matches, axis=2) != 0
    assert (np.sum(positions_with_match, axis=1) == data["source_length"]).all()
def _test_gru_coverage():
    """
    Check one update step of the "gru" coverage model.

    The coverage symbol is bound on CPU with random inputs and run forward once. The test
    asserts that the new coverage keeps the previous coverage's shape and that, per
    sentence, the count of positions having at least one unit not equal to 1 equals the
    sentence's source length.
    """
    config_coverage = sockeye.coverage.CoverageConfig(type="gru", num_hidden=2, layer_normalization=False)
    encoder_num_hidden, decoder_num_hidden, source_seq_len, batch_size = 5, 5, 10, 4

    # One symbol per input, created in a fixed order.
    input_names = ("source", "source_length", "prev_hidden", "prev_coverage", "attention_scores")
    symbols = {}
    for input_name in input_names:
        symbols[input_name] = mx.sym.Variable(input_name)

    shapes = dict(source=(batch_size, source_seq_len, encoder_num_hidden),
                  source_length=(batch_size,),
                  prev_hidden=(batch_size, decoder_num_hidden),
                  attention_scores=(batch_size, source_seq_len),
                  prev_coverage=(batch_size, source_seq_len, config_coverage.num_hidden))

    # Random data, generated in a fixed order.
    data = {}
    data["source"] = gaussian_vector(shape=shapes["source"])
    data["source_length"] = integer_vector(shape=shapes["source_length"], max_value=source_seq_len)
    data["prev_hidden"] = gaussian_vector(shape=shapes["prev_hidden"])
    data["prev_coverage"] = gaussian_vector(shape=shapes["prev_coverage"])
    unnormalized_scores = uniform_vector(shape=shapes["attention_scores"])
    # Normalized by the total over all entries, not per sentence.
    data["attention_scores"] = unnormalized_scores / np.sum(unnormalized_scores)

    coverage = sockeye.coverage.get_coverage(config_coverage)
    coverage_func = coverage.on(symbols["source"], symbols["source_length"], source_seq_len)
    updated_coverage = coverage_func(symbols["prev_hidden"],
                                     symbols["attention_scores"],
                                     symbols["prev_coverage"])

    executor = updated_coverage.simple_bind(ctx=mx.cpu(), **shapes)
    for input_name in input_names:
        executor.arg_dict[input_name][:] = data[input_name]

    result = executor.forward()
    new_coverage = result[0].asnumpy()
    assert new_coverage.shape == shapes["prev_coverage"]

    units_not_one = new_coverage != 1
    positions_not_one = np.sum(units_not_one, axis=2) != 0
    assert (np.sum(positions_not_one, axis=1) == data["source_length"]).all()
def test_step(cell_type, context_gating, num_embed=2, encoder_num_hidden=5, decoder_num_hidden=5):
    """
    Run a single recurrent decoder step with coverage attention and check output shapes.

    :param cell_type: RNN cell type; must equal C.GRU_TYPE or C.LSTM_TYPE.
    :param context_gating: Passed through to RecurrentDecoderConfig.
    :param num_embed: Last dimension of the previous word vector input.
    :param encoder_num_hidden: Last dimension of the source input.
    :param decoder_num_hidden: Size of the decoder hidden state and of each layer state.
    :raises ValueError: If `cell_type` is neither C.GRU_TYPE nor C.LSTM_TYPE.
    """
    batch_size, source_seq_len = 10, 7

    # (batch_size, source_seq_len, encoder_num_hidden)
    source = mx.sym.Variable("source")
    source_shape = (batch_size, source_seq_len, encoder_num_hidden)
    # (batch_size,)
    source_length = mx.sym.Variable("source_length")
    source_length_shape = (batch_size,)
    # (batch_size, num_embed)
    word_vec_prev = mx.sym.Variable("word_vec_prev")
    word_vec_prev_shape = (batch_size, num_embed)
    # (batch_size, decoder_num_hidden)
    hidden_prev = mx.sym.Variable("hidden_prev")
    hidden_prev_shape = (batch_size, decoder_num_hidden)
    # Each layer state: (batch_size, decoder_num_hidden)
    states_shape = (batch_size, decoder_num_hidden)

    config_coverage = sockeye.coverage.CoverageConfig(type="tanh", num_hidden=2, layer_normalization=False)
    # The source is bound with encoder_num_hidden as its last dimension, so the attention's
    # source_num_hidden uses that value (identical to the previous setting under the defaults).
    config_attention = sockeye.rnn_attention.AttentionConfig(type="coverage",
                                                             num_hidden=2,
                                                             input_previous_word=False,
                                                             source_num_hidden=encoder_num_hidden,
                                                             query_num_hidden=decoder_num_hidden,
                                                             layer_normalization=False,
                                                             config_coverage=config_coverage)
    attention = sockeye.rnn_attention.get_attention(config_attention, max_seq_len=source_seq_len)
    attention_state = attention.get_initial_state(source_length, source_seq_len)
    attention_func = attention.on(source, source_length, source_seq_len)

    config_rnn = sockeye.rnn.RNNConfig(cell_type=cell_type,
                                       num_hidden=decoder_num_hidden,
                                       num_layers=1,
                                       dropout_inputs=0.,
                                       dropout_states=0.,
                                       residual=False,
                                       forget_bias=0.)
    config_decoder = sockeye.decoder.RecurrentDecoderConfig(max_seq_len_source=source_seq_len,
                                                            rnn_config=config_rnn,
                                                            attention_config=config_attention,
                                                            context_gating=context_gating)
    decoder = sockeye.decoder.RecurrentDecoder(config=config_decoder)

    # The LSTM case needs twice as many state symbols per layer as the GRU case.
    if cell_type == C.GRU_TYPE:
        states_per_layer = 1
    elif cell_type == C.LSTM_TYPE:
        states_per_layer = 2
    else:
        raise ValueError("Unsupported cell type: %s" % cell_type)
    layer_states = [gaussian_vector(shape=states_shape, return_symbol=True)
                    for _ in range(config_rnn.num_layers * states_per_layer)]

    state, attention_state = decoder._step(word_vec_prev=word_vec_prev,
                                           state=sockeye.decoder.RecurrentDecoderState(hidden_prev, layer_states),
                                           attention_func=attention_func,
                                           attention_state=attention_state)
    sym = mx.sym.Group([state.hidden, attention_state.probs, attention_state.dynamic_source])

    executor = sym.simple_bind(ctx=mx.cpu(),
                               source=source_shape,
                               source_length=source_length_shape,
                               word_vec_prev=word_vec_prev_shape,
                               hidden_prev=hidden_prev_shape)
    executor.arg_dict["source"][:] = gaussian_vector(source_shape)
    executor.arg_dict["source_length"][:] = integer_vector(source_length_shape, source_seq_len)
    executor.arg_dict["word_vec_prev"][:] = gaussian_vector(word_vec_prev_shape)
    executor.arg_dict["hidden_prev"][:] = gaussian_vector(hidden_prev_shape)
    # NOTE(review): plain item assignment (no [:] copy) of the symbol list; kept as in the
    # original, but whether it influences the forward pass should be confirmed.
    executor.arg_dict["states"] = layer_states

    hidden_result, attention_probs_result, attention_dynamic_source_result = executor.forward()

    assert hidden_result.shape == hidden_prev_shape
    assert attention_probs_result.shape == (batch_size, source_seq_len)
    assert attention_dynamic_source_result.shape == (batch_size, source_seq_len, config_coverage.num_hidden)
def test_step(cell_type, context_gating, num_embed=2, encoder_num_hidden=5, decoder_num_hidden=5):
    """
    Execute one step of a recurrent decoder that uses coverage attention and verify shapes.

    :param cell_type: RNN cell type; must equal C.GRU_TYPE or C.LSTM_TYPE.
    :param context_gating: Passed through to RecurrentDecoderConfig.
    :param num_embed: Last dimension of the previous word vector input.
    :param encoder_num_hidden: Last dimension of the source input.
    :param decoder_num_hidden: Size of the decoder hidden state and of each layer state.
    :raises ValueError: If `cell_type` is neither C.GRU_TYPE nor C.LSTM_TYPE.
    """
    batch_size, source_seq_len = 10, 7

    # (batch_size, source_seq_len, encoder_num_hidden)
    source = mx.sym.Variable("source")
    source_shape = (batch_size, source_seq_len, encoder_num_hidden)
    # (batch_size,)
    source_length = mx.sym.Variable("source_length")
    source_length_shape = (batch_size,)
    # (batch_size, num_embed)
    word_vec_prev = mx.sym.Variable("word_vec_prev")
    word_vec_prev_shape = (batch_size, num_embed)
    # (batch_size, decoder_num_hidden)
    hidden_prev = mx.sym.Variable("hidden_prev")
    hidden_prev_shape = (batch_size, decoder_num_hidden)
    # Each layer state: (batch_size, decoder_num_hidden)
    states_shape = (batch_size, decoder_num_hidden)

    config_coverage = sockeye.coverage.CoverageConfig(
        type="tanh", num_hidden=2, layer_normalization=False)
    # source_num_hidden follows the last dimension the source is actually bound with
    # (encoder_num_hidden); under the defaults this equals the former decoder_num_hidden.
    config_attention = sockeye.rnn_attention.AttentionConfig(
        type="coverage",
        num_hidden=2,
        input_previous_word=False,
        source_num_hidden=encoder_num_hidden,
        query_num_hidden=decoder_num_hidden,
        layer_normalization=False,
        config_coverage=config_coverage)
    attention = sockeye.rnn_attention.get_attention(config_attention, max_seq_len=source_seq_len)
    attention_state = attention.get_initial_state(source_length, source_seq_len)
    attention_func = attention.on(source, source_length, source_seq_len)

    config_rnn = sockeye.rnn.RNNConfig(
        cell_type=cell_type,
        num_hidden=decoder_num_hidden,
        num_layers=1,
        dropout_inputs=0.,
        dropout_states=0.,
        residual=False,
        forget_bias=0.)
    config_decoder = sockeye.decoder.RecurrentDecoderConfig(
        max_seq_len_source=source_seq_len,
        rnn_config=config_rnn,
        attention_config=config_attention,
        context_gating=context_gating)
    decoder = sockeye.decoder.RecurrentDecoder(config=config_decoder)

    # Number of state symbols to create: doubled for the LSTM case.
    if cell_type == C.GRU_TYPE:
        num_states = config_rnn.num_layers
    elif cell_type == C.LSTM_TYPE:
        num_states = config_rnn.num_layers * 2
    else:
        raise ValueError("Unsupported cell type: %s" % cell_type)
    layer_states = [
        gaussian_vector(shape=states_shape, return_symbol=True) for _ in range(num_states)
    ]

    decoder_state = sockeye.decoder.RecurrentDecoderState(hidden_prev, layer_states)
    state, attention_state = decoder._step(
        word_vec_prev=word_vec_prev,
        state=decoder_state,
        attention_func=attention_func,
        attention_state=attention_state)
    sym = mx.sym.Group([state.hidden, attention_state.probs, attention_state.dynamic_source])

    executor = sym.simple_bind(
        ctx=mx.cpu(),
        source=source_shape,
        source_length=source_length_shape,
        word_vec_prev=word_vec_prev_shape,
        hidden_prev=hidden_prev_shape)
    executor.arg_dict["source"][:] = gaussian_vector(source_shape)
    executor.arg_dict["source_length"][:] = integer_vector(source_length_shape, source_seq_len)
    executor.arg_dict["word_vec_prev"][:] = gaussian_vector(word_vec_prev_shape)
    executor.arg_dict["hidden_prev"][:] = gaussian_vector(hidden_prev_shape)
    # NOTE(review): this stores the list of state symbols via plain item assignment rather
    # than copying into a bound array; kept unchanged, effect on forward() to be confirmed.
    executor.arg_dict["states"] = layer_states

    hidden_result, attention_probs_result, attention_dynamic_source_result = executor.forward()

    assert hidden_result.shape == hidden_prev_shape
    assert attention_probs_result.shape == (batch_size, source_seq_len)
    assert attention_dynamic_source_result.shape == (
        batch_size, source_seq_len, config_coverage.num_hidden)
def test_coverage_attention(attention_coverage_type,
                            attention_coverage_num_hidden,
                            batch_size=3,
                            encoder_num_hidden=2,
                            decoder_num_hidden=2):
    """
    Evaluate a single coverage-attention step and verify shapes and probabilities.

    :param attention_coverage_type: Coverage type passed to CoverageConfig.
    :param attention_coverage_num_hidden: Number of coverage hidden units; also the expected
                                          last dimension of the dynamic source output.
    :param batch_size: Number of sentences in the batch.
    :param encoder_num_hidden: Last dimension of the source input and of the context output.
    :param decoder_num_hidden: Last dimension of the decoder state input.
    """
    # source: (batch_size, seq_len, encoder_num_hidden)
    source = mx.sym.Variable("source")
    # source_length: (batch_size,)
    source_length = mx.sym.Variable("source_length")
    source_seq_len = 10

    coverage_cfg = sockeye.coverage.CoverageConfig(
        type=attention_coverage_type,
        num_hidden=attention_coverage_num_hidden,
        layer_normalization=False)
    attention_cfg = sockeye.rnn_attention.AttentionConfig(
        type="coverage",
        num_hidden=5,
        input_previous_word=False,
        source_num_hidden=encoder_num_hidden,
        query_num_hidden=decoder_num_hidden,
        layer_normalization=False,
        config_coverage=coverage_cfg)
    attention = sockeye.rnn_attention.get_attention(attention_cfg, max_seq_len=source_seq_len)

    state_before = attention.get_initial_state(source_length, source_seq_len)
    step = attention.on(source, source_length, source_seq_len)
    step_input = attention.make_input(0,
                                      mx.sym.Variable("word_vec_prev"),
                                      mx.sym.Variable("decoder_state"))
    state_after = step(step_input, state_before)
    grouped = mx.sym.Group([state_after.context, state_after.probs, state_after.dynamic_source])

    source_shape = (batch_size, source_seq_len, encoder_num_hidden)
    source_length_shape = (batch_size,)
    decoder_state_shape = (batch_size, decoder_num_hidden)

    executor = grouped.simple_bind(ctx=mx.cpu(),
                                   source=source_shape,
                                   source_length=source_length_shape,
                                   decoder_state=decoder_state_shape)

    source_length_vector = integer_vector(shape=source_length_shape, max_value=source_seq_len)
    feed = executor.arg_dict
    feed["source"][:] = gaussian_vector(shape=source_shape)
    feed["source_length"][:] = source_length_vector
    feed["decoder_state"][:] = gaussian_vector(shape=decoder_state_shape)

    exec_output = executor.forward()
    context_np = exec_output[0].asnumpy()
    probs_np = exec_output[1].asnumpy()
    dynamic_source_np = exec_output[2].asnumpy()

    # Column of 1 / length per sentence; broadcasts against the (batch_size, seq_len) probabilities.
    expected_probs = (1. / source_length_vector).reshape((batch_size, 1))

    assert context_np.shape == (batch_size, encoder_num_hidden)
    assert probs_np.shape == (batch_size, source_seq_len)
    assert dynamic_source_np.shape == (batch_size, source_seq_len, attention_coverage_num_hidden)

    # Per sentence, the count of positions whose probability is close to 1 / length must be the length.
    hits_per_sentence = np.sum(np.isclose(probs_np, expected_probs), axis=1)
    assert (hits_per_sentence == source_length_vector).all()