def process_history(hist, inp):
    # Split the one-hot history into the first wg_dim entries and the rest.
    wk = C.slice(hist, 0, 0, myConfig['wg_dim'])
    wn = hist[myConfig['wg_dim']:]
    hist_processed = embed_layer(wk, wn)
    out_logits = s2smodel(hist_processed, inp)
    # Flatten the one-hot argmax of the logits for feeding back as history.
    hamax = C.reshape(C.hardmax(out_logits), (-1,))
    return hamax
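# For reference, C.hardmax turns a logits vector into a one-hot vector marking
# the argmax, which is what process_history feeds back during greedy decoding.
# A minimal standalone sketch (the logits values are illustrative, not from the
# snippet above):
import cntk as C
import numpy as np

logits = np.array([0.2, 1.5, 0.3], dtype=np.float32)
print(C.reshape(C.hardmax(logits), (-1,)).eval())  # -> [0. 1. 0.]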
def model(self):
    token_axis = C.Axis.new_unique_dynamic_axis('token_axis')
    b = C.Axis.default_batch_axis()
    token = C.input_variable(self.word_dim, dynamic_axes=[b, token_axis], name='token')
    # 8 emotion classes
    emotion = C.input_variable(self.num_emotions, dynamic_axes=[b], name='emotion')

    processed_token = self.input_layer(token)
    att = self.attention_layer(processed_token, processed_token, 'attention')

    # Take the last attention state and project it onto the emotion classes.
    test = C.sequence.last(att)
    test = C.layers.Stabilizer()(test)
    test_w = C.parameter(shape=(2*self.hidden_dim, self.num_emotions), init=C.glorot_uniform())
    test_v = C.parameter(shape=(self.num_emotions,), init=C.glorot_uniform())
    out = C.softmax(C.times(test, test_w) + test_v)

    loss = C.binary_cross_entropy(out, emotion)
    f1 = C.losses.fmeasure(C.hardmax(out), emotion)
    return out, loss, f1
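# A hedged usage sketch of the triple returned by model(): the `classifier`
# object and all hyperparameters below are illustrative assumptions, not part
# of the original class.
import cntk as C

out, loss, f1 = classifier.model()
lr = C.learning_rate_schedule(0.001, C.UnitType.minibatch)
learner = C.adam(out.parameters, lr=lr, momentum=C.momentum_schedule(0.9))
trainer = C.Trainer(out, (loss, f1), [learner])  # optimize the loss, track F1 as the metric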
def test_Hardmax(tmpdir):
    data = np.asarray([1., 1., 2., 3.], dtype=np.float32)
    model = C.hardmax(data)
    verify_no_input(model, tmpdir, 'Hardmax_0')
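# As a quick sanity check of the operator under test: the maximum entry wins,
# so [1., 1., 2., 3.] maps to the one-hot [0., 0., 0., 1.].
import cntk as C
import numpy as np

assert np.array_equal(
    C.hardmax(np.asarray([1., 1., 2., 3.], dtype=np.float32)).eval(),
    np.array([0., 0., 0., 1.], dtype=np.float32))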
def output_layer(self, embed, attention_context, model_context, aw, q_processed, c_processed, cw):
    cw_ph = C.placeholder()
    att_context = C.placeholder(shape=(8*self.hidden_dim,))
    query_processed = C.placeholder(shape=(2*self.hidden_dim,))
    context_processed = C.placeholder(shape=(2*self.hidden_dim,))
    mod_context = C.placeholder(shape=(2*self.hidden_dim,))
    a_onehot = C.placeholder(shape=(self.vocab_size+1,))

    # Predict the span start, then condition the end prediction on the
    # modeling-layer state at the (hardmax-selected) start position.
    start_logits = C.layers.Dense(1, name='out_start')(
        C.dropout(C.splice(mod_context, att_context), self.dropout))
    start_hardmax = seq_hardmax(start_logits)
    att_mod_ctx = C.sequence.last(C.sequence.gather(mod_context, start_hardmax))
    att_mod_ctx_expanded = C.sequence.broadcast_as(att_mod_ctx, att_context)
    end_input = C.splice(att_context, mod_context, att_mod_ctx_expanded,
                         mod_context * att_mod_ctx_expanded)
    m2 = OptimizedRnnStack(self.hidden_dim, bidirectional=True,
                           use_cudnn=self.use_cudnn, name='output_rnn')(end_input)
    end_logits = C.layers.Dense(1, name='out_end')(
        C.dropout(C.splice(m2, att_context), self.dropout))

    start_flag = C.hardmax(start_logits)
    end_flag = C.hardmax(end_logits)

    def create_model():
        # Encoder: (input*) --> (h0, c0)
        # Create multiple layers of LSTMs by passing the output of the i-th layer
        # to the (i+1)-th layer as its input
        with C.layers.default_options(enable_self_stabilization=True, go_backwards=False):
            LastRecurrence = C.layers.Recurrence
            encode = C.layers.Sequential([
                C.layers.Stabilizer(),
                OptimizedRnnStack(self.hidden_dim, return_full_state=True),
            ])
            encode_c = C.layers.Sequential([
                C.layers.Stabilizer(),
                OptimizedRnnStack(self.hidden_dim, return_full_state=True),
            ])

        # Decoder: (history*, input*) --> unnormalized_word_logp*
        # where history is one of these, delayed by 1 step and <s> prepended:
        #  - training: labels
        #  - testing: its own output hardmax(z) (greedy decoder)
        with C.layers.default_options(enable_self_stabilization=True):
            # sub-layers
            stab_in = C.layers.Stabilizer()
            rec_blocks = [C.layers.LSTM(self.hidden_dim) for _ in range(self.num_layers)]
            stab_out = C.layers.Stabilizer()
            proj_out = C.layers.Dense(self.vocab_size+1, name='out_proj')
            # attention model :: (h_enc*, h_dec) -> (h_dec augmented)
            attention_model = C.layers.AttentionModel(self.attention_dim, name='attention_model')
            hstate_dense = C.layers.Dense(self.hidden_dim, activation=C.tanh, input_rank=1)
            cstate_dense = C.layers.Dense(self.hidden_dim, activation=C.tanh, input_rank=1)
            W_dense = C.layers.Dense(2*self.hidden_dim, input_rank=1)
            U_dense = C.layers.Dense(2*self.hidden_dim, input_rank=1)
            V_dense = C.layers.Dense(2*self.hidden_dim, input_rank=1)
            maxout = C.layers.MaxPooling((2,), strides=2)

            # layer function
            @C.Function
            def decode(history, q, c, start_logits, end_logits):
                q = encode(q)
                c = encode_c(C.splice(c, start_logits, end_logits, axis=0))
                r = history
                r = stab_in(r)

                # initialize the decoder state from the last encoder states
                # of both the query and the context
                q_last_h = C.sequence.last(q.outputs[0])
                q_last_c = C.sequence.last(q.outputs[1])
                c_last_h = C.sequence.last(c.outputs[0])
                c_last_c = C.sequence.last(c.outputs[1])
                initial_hstate = hstate_dense(C.splice(q_last_h, c_last_h))
                initial_cstate = cstate_dense(C.splice(q_last_c, c_last_c))

                rec_block = rec_blocks[0]   # LSTM(hidden_dim) :: (dh, dc, x) -> (h, c)

                @C.Function
                def find_embed(x):
                    gx, ngx = C.slice(x, 0, 0, self.wg_dim), C.slice(x, 0, self.wg_dim, self.vocab_size)
                    return embed(gx, ngx)

                @C.Function
                def lstm_with_attention(dh, dc, r, x):
                    history_embed = find_embed(x)
                    h_att = attention_model(c.outputs[0], dh)
                    q_att = attention_model(q.outputs[0], dh)
                    att = C.splice(h_att, q_att)
                    x = C.splice(x, att)
                    x, dc = rec_block(dh, dc, x).outputs
                    # 0*r is a hack: CNTK errors out when r is not used
                    r = U_dense(att) + W_dense(history_embed) + V_dense(x) + 0*r
                    # bug: fails when the W_dense term is added first
                    # r = W_dense(embed(gx, ngx)) + U_dense(att) + V_dense(x) + 0*r
                    return x, dc, r

                _, _, r = C.layers.RecurrenceFrom(lstm_with_attention, return_full_state=True)(
                    initial_hstate, initial_cstate, C.Constant(np.zeros(2*self.hidden_dim)), r).outputs

                r = maxout(r)
                r = stab_out(r)
                r = proj_out(r)
                # r = C.softmax(r)
                r = C.layers.Label('out_proj_out')(r)
                return r

        return decode

    def create_model_train(s2smodel):
        # model used in training (history is known from labels)
        # note: the labels must NOT contain the initial <s>
        @C.Function
        def model_train(labels, q, c, start_logits, end_logits):  # (input*, labels*) --> (word_logp*)
            # The input to the decoder always starts with the special label sequence start token.
            # Then, use the previous value of the label sequence (for training) or the output (for execution).
            past_labels = C.layers.Delay(initial_state=self.sentence_start)(labels)
            return s2smodel(past_labels, q, c, start_logits, end_logits)
        return model_train

    def create_model_greedy(s2smodel):
        # model used in (greedy) decoding, i.e. inference (history is the decoder's own output)
        @C.Function
        def model_greedy(q, c, start_logits, end_logits):  # (input*) --> (word_sequence*)
            # Decoding is an unfold() operation starting from sentence_start.
            # We must transform s2smodel (history*, input* -> word_logp*) into a
            # generator (history* -> output*) which holds 'input' in its closure.
            unfold = C.layers.UnfoldFrom(
                lambda history: s2smodel(history, q, c, start_logits, end_logits) >> C.hardmax,
                # stop once sentence_end_index was the max-scoring output
                until_predicate=lambda w: w[..., self.sentence_end_index],
                length_increase=self.sentence_max_length)
            return unfold(initial_state=self.sentence_start, dynamic_axes_like=c)
        return model_greedy

    s2smodel = create_model()
    model_train = create_model_train(s2smodel)(
        a_onehot, query_processed, context_processed, start_logits, end_logits)
    model_greed = create_model_greedy(s2smodel)(
        query_processed, context_processed, start_logits, end_logits)
    model_greedy = C.argmax(model_greed, 0)
    context = C.argmax(cw_ph, 0)

    return C.as_block(
        C.combine((model_train, model_greedy, start_logits, end_logits, context)),
        [(att_context, attention_context), (mod_context, model_context), (a_onehot, aw),
         (query_processed, q_processed), (context_processed, c_processed), (cw_ph, cw)],
        'attention_layer', 'attention_layer')
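# The greedy decoder above hinges on C.layers.UnfoldFrom feeding the hardmax of
# each step back in as history. A minimal self-contained sketch of that pattern
# (the 3x3 transition matrix and the input sequence are illustrative, not taken
# from the model above):
import cntk as C
import numpy as np

W = C.Constant(np.array([[0., 1., 0.],
                         [0., 0., 1.],
                         [1., 0., 0.]], dtype=np.float32))
# history* -> output*: each step emits the one-hot argmax of a linear map
step = lambda history: C.hardmax(C.times(history, W))
unfold = C.layers.UnfoldFrom(step)

x = C.sequence.input_variable(3)  # only supplies the dynamic (length) axis
out = unfold(initial_state=C.Constant(np.array([1., 0., 0.], dtype=np.float32)),
             dynamic_axes_like=x)
# Unfolding for 4 steps cycles through the three one-hot states:
print(out.eval({x: [np.zeros((4, 3), dtype=np.float32)]}))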
def test_Hardmax(tmpdir, dtype):
    with C.default_options(dtype=dtype):
        data = np.asarray([1., 1., 2., 3.], dtype=dtype)
        model = C.hardmax(data)
        verify_no_input(model, tmpdir, 'Hardmax_0')
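# The dtype argument is injected by the test harness; one plausible way to
# drive it (an assumption, the real suite may define this fixture in its
# conftest.py) is a parametrized pytest fixture:
import numpy as np
import pytest

@pytest.fixture(params=[np.float32, np.float64])
def dtype(request):
    return request.param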
def create_network(input_vocab_dim, label_vocab_dim):
    # network complexity; initially low for faster testing
    hidden_dim = 256
    num_layers = 1

    # Source and target inputs to the model
    input_seq_axis = Axis('inputAxis')
    label_seq_axis = Axis('labelAxis')
    raw_input = sequence.input(shape=(input_vocab_dim), sequence_axis=input_seq_axis, name='raw_input')
    raw_labels = sequence.input(shape=(label_vocab_dim), sequence_axis=label_seq_axis, name='raw_labels')

    # Instantiate the sequence to sequence translation model
    input_sequence = raw_input

    # Drop the sentence start token from the label, for decoder training
    label_sequence = sequence.slice(raw_labels, 1, 0)   # <s> A B C </s> --> A B C </s>
    label_sentence_start = sequence.first(raw_labels)   # <s>
    is_first_label = sequence.is_first(label_sequence)  # 1 0 0 0 ...
    label_sentence_start_scattered = sequence.scatter(
        label_sentence_start, is_first_label)

    # Encoder
    encoder_outputH = stabilize(input_sequence)
    for i in range(0, num_layers):
        (encoder_outputH, encoder_outputC) = LSTMP_component_with_self_stabilization(
            encoder_outputH.output, hidden_dim, hidden_dim, future_value, future_value)

    thought_vectorH = sequence.first(encoder_outputH)
    thought_vectorC = sequence.first(encoder_outputC)

    thought_vector_broadcastH = sequence.broadcast_as(
        thought_vectorH, label_sequence)
    thought_vector_broadcastC = sequence.broadcast_as(
        thought_vectorC, label_sequence)

    # Decoder
    decoder_history_hook = alias(label_sequence, name='decoder_history_hook')  # copy label_sequence
    decoder_input = element_select(is_first_label, label_sentence_start_scattered,
                                   past_value(decoder_history_hook))

    decoder_outputH = stabilize(decoder_input)
    for i in range(0, num_layers):
        if (i > 0):
            recurrence_hookH = past_value
            recurrence_hookC = past_value
        else:
            # first decoder layer: initialize from the encoder's thought vector
            isFirst = sequence.is_first(label_sequence)
            recurrence_hookH = lambda operand: element_select(
                isFirst, thought_vector_broadcastH, past_value(operand))
            recurrence_hookC = lambda operand: element_select(
                isFirst, thought_vector_broadcastC, past_value(operand))

        (decoder_outputH, decoder_outputC) = LSTMP_component_with_self_stabilization(
            decoder_outputH.output, hidden_dim, hidden_dim, recurrence_hookH, recurrence_hookC)

    decoder_output = decoder_outputH

    # Softmax output layer
    z = linear_layer(stabilize(decoder_output), label_vocab_dim)

    # Criterion nodes
    ce = cross_entropy_with_softmax(z, label_sequence)
    errs = classification_error(z, label_sequence)

    # network output for decoder history
    net_output = hardmax(z)

    # make a clone of the graph where the ground truth is replaced by the network output
    ng = z.clone(CloneMethod.share, {decoder_history_hook.output: net_output.output})

    return {
        'raw_input': raw_input,
        'raw_labels': raw_labels,
        'ce': ce,
        'pe': errs,
        'ng': ng,
        'output': z
    }
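# A hypothetical driver for create_network (the vocabulary sizes, learning
# rate, and momentum below are illustrative, not from the original script):
import cntk as C

net = create_network(input_vocab_dim=69, label_vocab_dim=69)
lr = C.learning_rate_schedule(0.007, C.UnitType.sample)
learner = C.momentum_sgd(net['output'].parameters, lr, C.momentum_schedule(0.9))
trainer = C.Trainer(net['output'], (net['ce'], net['pe']), [learner])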