def bi_sru_layer(self, sru_1, index):
    f_1_f = C.sigmoid(sru_1[0 * self.param2:1 * self.param2] +
                      self.list_bias[0 + index * 4])
    r_1_f = C.sigmoid(sru_1[1 * self.param2:2 * self.param2] +
                      self.list_bias[1 + index * 4])
    c_1_f_r = (1 - f_1_f) * sru_1[2 * self.param2:3 * self.param2]

    dec_c_1_f = C.layers.ForwardDeclaration('f_' + str(index))
    var_c_1_f = C.sequence.delay(dec_c_1_f, initial_state=0, time_step=1)
    nex_c_1_f = var_c_1_f * f_1_f + c_1_f_r
    dec_c_1_f.resolve_to(nex_c_1_f)
    h_1_f = r_1_f * C.tanh(nex_c_1_f) + (
        1 - r_1_f) * sru_1[3 * self.param2:4 * self.param2]

    f_1_b = C.sigmoid(sru_1[4 * self.param2:5 * self.param2] +
                      self.list_bias[2 + index * 4])
    r_1_b = C.sigmoid(sru_1[5 * self.param2:6 * self.param2] +
                      self.list_bias[3 + index * 4])
    c_1_b_r = (1 - f_1_b) * sru_1[6 * self.param2:7 * self.param2]

    dec_c_1_b = C.layers.ForwardDeclaration('b_' + str(index))
    var_c_1_b = C.sequence.delay(dec_c_1_b, time_step=-1)
    nex_c_1_b = var_c_1_b * f_1_b + c_1_b_r
    dec_c_1_b.resolve_to(nex_c_1_b)
    h_1_b = r_1_b * C.tanh(nex_c_1_b) + (
        1 - r_1_b) * sru_1[7 * self.param2:8 * self.param2]

    x = C.splice(h_1_f, h_1_b)
    return x
def grid_lstm_func(m_t_1_k, m_tk_1, c_t_1_k, c_tk_1, x_tk):
    common_11 = C.times(m_t_1_k, W_t_im) + C.times(m_tk_1, W_k_im) + \
        C.times(c_t_1_k, W_t_ic) + C.times(c_tk_1, W_k_ic)
    i_t_tk = C.sigmoid(C.times(x_tk, W_t_ix) + common_11 + b_t_i)
    i_k_tk = C.sigmoid(C.times(x_tk, W_k_ix) + common_11 + b_k_i)

    common_12 = C.times(m_t_1_k, W_t_fm) + C.times(m_tk_1, W_k_fm) + \
        C.times(c_t_1_k, W_t_fc) + C.times(c_tk_1, W_k_fc)
    f_t_tk = C.sigmoid(C.times(x_tk, W_t_fx) + common_12 + b_t_f)
    f_k_tk = C.sigmoid(C.times(x_tk, W_k_fx) + common_12 + b_k_f)

    c_t_tk = C.element_times(f_t_tk, c_t_1_k) + C.element_times(
        i_t_tk,
        C.tanh(C.times(x_tk, W_t_cx) + C.times(m_t_1_k, W_t_cm) +
               C.times(m_tk_1, W_k_cm) + b_t_c))  # (13)
    c_k_tk = C.element_times(f_k_tk, c_tk_1) + C.element_times(
        i_k_tk,
        C.tanh(C.times(x_tk, W_k_cx) + C.times(m_t_1_k, W_t_cm) +
               C.times(m_tk_1, W_k_cm) + b_k_c))  # (14)

    common_15 = C.times(m_t_1_k, W_t_om) + C.times(m_tk_1, W_k_om) + \
        C.times(c_t_tk, W_t_oc) + C.times(c_k_tk, W_k_oc)
    o_t_tk = C.sigmoid(C.times(x_tk, W_t_ox) + common_15 + b_t_o)
    o_k_tk = C.sigmoid(C.times(x_tk, W_k_ox) + common_15 + b_k_o)

    m_t_tk = C.element_times(o_t_tk, C.tanh(c_t_tk))
    m_k_tk = C.element_times(o_k_tk, C.tanh(c_k_tk))

    return (m_t_tk, m_k_tk, c_t_tk, c_k_tk)
def attention_layer(self, context, query, layer):
    q_processed = C.placeholder(shape=(2 * self.hidden_dim,))
    p_processed = C.placeholder(shape=(2 * self.hidden_dim,))

    qvw, qvw_mask = C.sequence.unpack(q_processed, padding_value=0).outputs

    wq = C.parameter(shape=(2 * self.hidden_dim, 2 * self.hidden_dim), init=C.glorot_uniform())
    wp = C.parameter(shape=(2 * self.hidden_dim, 2 * self.hidden_dim), init=C.glorot_uniform())
    wg = C.parameter(shape=(8 * self.hidden_dim, 8 * self.hidden_dim), init=C.glorot_uniform())
    v = C.parameter(shape=(2 * self.hidden_dim, 1), init=C.glorot_uniform())

    # seq[tensor[2d]] p_len x 2d
    wpt = C.reshape(C.times(p_processed, wp), (-1, 2 * self.hidden_dim))
    # q_len x 2d
    wqt = C.reshape(C.times(qvw, wq), (-1, 2 * self.hidden_dim))
    # seq[tensor[q_len]]
    S = C.reshape(C.times(C.tanh(C.sequence.broadcast_as(wqt, p_processed) + wpt), v), (-1))

    qvw_mask_expanded = C.sequence.broadcast_as(qvw_mask, p_processed)
    # seq[tensor[q_len]]
    S = C.element_select(qvw_mask_expanded, S, C.constant(-1e+30))
    # seq[tensor[q_len]]
    A = C.softmax(S, axis=0)
    # seq[tensor[2d]]
    swap_qvw = C.swapaxes(qvw)
    cq = C.reshape(C.reduce_sum(A * C.sequence.broadcast_as(swap_qvw, A), axis=1), (-1))
    # seq[tensor[4d]]
    uc_concat = C.splice(p_processed, cq, p_processed * cq, cq * cq)
    # seq[tensor[4d]]
    gt = C.tanh(C.times(uc_concat, wg))
    # seq[tensor[4d]]
    uc_concat_star = gt * uc_concat
    # seq[tensor[4d]]
    vp = C.layers.Sequential([
        C.layers.Dropout(self.dropout),
        OptimizedRnnStack(self.hidden_dim,
                          bidirectional=True,
                          use_cudnn=self.use_cudnn,
                          name=layer + '_attention_rnn')])(uc_concat_star)

    return C.as_block(
        vp,
        [(p_processed, context), (q_processed, query)],
        'attention_layer',
        'attention_layer')
def attention(h_enc, h_dec):
    history_axis = h_dec  # we use history_axis wherever we pass this only for the sake of passing its axis
    # TODO: pull this apart so that we can compute the encoder window only once and apply it to multiple decoders

    # --- encoder state window
    (h_enc, h_enc_valid) = PastValueWindow(
        attention_span, axis=attention_axis, go_backwards=go_backwards)(h_enc).outputs
    h_enc_proj = attn_proj_enc(h_enc)
    # window must be broadcast to every decoder time step
    h_enc_proj = C.sequence.broadcast_as(h_enc_proj, history_axis)
    h_enc_valid = C.sequence.broadcast_as(h_enc_valid, history_axis)

    # --- decoder state
    # project decoder hidden state
    h_dec_proj = attn_proj_dec(h_dec)

    tanh_out = C.tanh(h_dec_proj + h_enc_proj)  # (attention_span, attention_dim)
    u = attn_proj_tanh(tanh_out)                # (attention_span, 1)
    u_masked = u + (h_enc_valid - 1) * 50       # logzero-out the unused elements for the softmax denominator  TODO: use a less arbitrary number than 50
    attention_weights = C.softmax(u_masked, axis=attention_axis)  #, name='attention_weights')
    attention_weights = Label('attention_weights')(attention_weights)

    # now take weighted sum over the encoder state vectors
    h_att = C.reduce_sum(C.element_times(h_enc_proj, attention_weights), axis=attention_axis)
    h_att = attn_final_stab(h_att)
    return h_att
def build_graph(self_attention, self_penalty, embeded_dim=60, h_dim=150, d_a=350, r=30):
    with C.layers.default_options(init=C.xavier()):
        embeded = C.layers.Embedding(embeded_dim)(x)
        embeded = C.layers.Stabilizer()(embeded)

        H = create_birnn(C.layers.GRU(h_dim), C.layers.GRU(h_dim))(embeded)
        if self_attention:
            Ws1 = C.parameter(shape=(d_a, 2 * h_dim), name="Ws1")
            Ws2 = C.parameter(shape=(r, d_a), name="Ws2")
            A = C.softmax(C.times(Ws2, C.tanh(C.times_transpose(Ws1, H))))
            H = C.times(A, H)  # the M in the paper
            if self_penalty:
                I = C.constant(np.eye(r), dtype=np.float32)
                P = C.times_transpose(A, A) - I  # r*r
                p = C.reduce_sum(C.abs(C.element_times(P, P)))  # frobenius norm **2
        y_ = C.layers.Dense(200, activation=C.ops.relu)(H)
        # y_pre = C.layers.Dense(num_labels, activation = None)(y_)

        def selfAtt(x):
            y_pre = C.layers.Dense(num_labels, activation=None)(y_)
            return y_pre

        if self_penalty:
            selfAtt.p = p
        return selfAtt
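# Hedged numeric sketch (not part of the original script): it isolates the self-penalty
# term used above, ||A A^T - I||_F^2, on a small random attention matrix so the reduction
# can be checked on its own. The size r = 3 and the sample A are illustrative assumptions.
import cntk as C
import numpy as np

r = 3
A = C.constant(np.random.rand(r, 5).astype(np.float32))
I = C.constant(np.eye(r), dtype=np.float32)
P = C.times_transpose(A, A) - I                        # r x r
penalty = C.reduce_sum(C.abs(C.element_times(P, P)))   # squared Frobenius norm, as in build_graph
print(penalty.eval())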
def attention_weight(h_enc, h_dec, inputs_dim):
    enc = C.layers.Dense(inputs_dim, name='out_start')(h_enc)
    dec = C.sequence.broadcast_as(
        C.layers.Dense(inputs_dim, name='out_start')(h_dec), enc)
    att_weight = C.layers.Dense(1, name='out_start')(C.tanh(enc + dec))
    att_weight = C.sequence.softmax(att_weight)
    return att_weight
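# Hedged usage sketch for attention_weight above (not from the original source): the
# encoder is a toy sequence input and the "decoder state" is simply the last encoder
# step, chosen only so the example is self-contained and runnable under CNTK 2.x.
import cntk as C
import numpy as np

inputs_dim = 8
h_enc = C.sequence.input_variable(inputs_dim)
h_dec = C.sequence.last(h_enc)                  # stands in for a real decoder state
w = attention_weight(h_enc, h_dec, inputs_dim)
steps = np.random.rand(5, inputs_dim).astype(np.float32)
print(w.eval({h_enc: [steps]}))                 # one weight per encoder step, summing to 1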
def simi_attention(self, input, memory):
    '''
    return:
    memory weighted vectors over input [#,c][d]
    weight
    '''
    input_ph = C.placeholder()  # [#,c][d]
    mem_ph = C.placeholder()    # [#,q][d]

    input_dense = Dense(2 * self.hidden_dim, bias=False, input_rank=1)
    mem_dense = Dense(2 * self.hidden_dim, bias=False, input_rank=1)
    bias = C.Parameter(shape=(2 * self.hidden_dim, ), init=0.0)
    weight_dense = Dense(1, bias=False, input_rank=1)

    proj_inp = input_dense(input_ph)  # [#,c][d]
    proj_mem = mem_dense(mem_ph)      # [#,q][d]
    unpack_memory, mem_mask = C.sequence.unpack(proj_mem, 0).outputs  # [#][*=q, d] [#][*=q]
    expand_mem = C.sequence.broadcast_as(unpack_memory, proj_inp)     # [#,c][*=q,d]
    expand_mask = C.sequence.broadcast_as(mem_mask, proj_inp)         # [#,c][*=q]
    matrix = C.reshape(weight_dense(C.tanh(proj_inp + expand_mem + bias)), (-1, ))  # [#,c][*=q]
    matrix = C.element_select(expand_mask, matrix, -1e30)
    logits = C.softmax(matrix, axis=0)  # [#,c][*=q]
    weight_mem = C.reduce_sum(C.reshape(logits, (-1, 1)) * expand_mem, axis=0)  # [#,c][d]
    weight_mem = C.reshape(weight_mem, (-1, ))

    return C.as_block(C.combine(weight_mem, logits),
                      [(input_ph, input), (mem_ph, memory)],
                      'simi_attention', 'simi_attention')
def new_attention(encoder_hidden_state, decoder_hidden_state):
    # encoder_hidden_state: [#, e] [h]
    # decoder_hidden_state: [#, d] [H]
    unpacked_encoder_hidden_state, valid_mask = C.sequence.unpack(encoder_hidden_state, padding_value=0).outputs
    # unpacked_encoder_hidden_state: [#] [*=e, h]
    # valid_mask: [#] [*=e]
    projected_encoder_hidden_state = C.sequence.broadcast_as(attn_proj_enc(unpacked_encoder_hidden_state), decoder_hidden_state)
    # projected_encoder_hidden_state: [#, d] [*=e, attention_dim]
    broadcast_valid_mask = C.sequence.broadcast_as(C.reshape(valid_mask, (1,), 1), decoder_hidden_state)
    # broadcast_valid_mask: [#, d] [*=e]
    projected_decoder_hidden_state = attn_proj_dec(decoder_hidden_state)
    # projected_decoder_hidden_state: [#, d] [attention_dim]
    tanh_output = C.tanh(projected_decoder_hidden_state + projected_encoder_hidden_state)
    # tanh_output: [#, d] [*=e, attention_dim]
    attention_logits = attn_proj_tanh(tanh_output)
    # attention_logits = [#, d] [*=e, 1]
    minus_inf = C.constant(-1e+30)
    masked_attention_logits = C.element_select(broadcast_valid_mask, attention_logits, minus_inf)
    # masked_attention_logits = [#, d] [*=e]
    attention_weights = C.softmax(masked_attention_logits, axis=0)
    attention_weights = Label('attention_weights')(attention_weights)
    # attention_weights = [#, d] [*=e]
    attended_encoder_hidden_state = C.reduce_sum(attention_weights * C.sequence.broadcast_as(unpacked_encoder_hidden_state, attention_weights), axis=0)
    # attended_encoder_hidden_state = [#, d] [1, h]
    output = attn_final_stab(C.reshape(attended_encoder_hidden_state, (), 0, 1))
    # output = [#, d] [h]
    return output
def test_tanh_2():
    cntk_op = C.tanh([0.])
    cntk_ret = cntk_op.eval()

    ng_op, _ = CNTKImporter().import_model(cntk_op)
    ng_ret = ng.transformers.make_transformer().computation(ng_op)()

    assert np.isclose(cntk_ret, ng_ret).all()
def rnet_output_layer(self, attention_context, query):
    att_context = C.placeholder(shape=(2 * self.hidden_dim,))
    q_processed = C.placeholder(shape=(2 * self.hidden_dim,))

    wuq = C.parameter(shape=(2 * self.hidden_dim, 2 * self.hidden_dim), init=C.glorot_uniform())
    whp = C.parameter(shape=(2 * self.hidden_dim, 2 * self.hidden_dim), init=C.glorot_uniform())
    wha = C.parameter(shape=(2 * self.hidden_dim, 2 * self.hidden_dim), init=C.glorot_uniform())
    v = C.parameter(shape=(2 * self.hidden_dim, 1), init=C.glorot_uniform())
    bias = C.parameter(shape=(2 * self.hidden_dim), init=C.glorot_uniform())

    whp_end = C.parameter(shape=(2 * self.hidden_dim, 2 * self.hidden_dim), init=C.glorot_uniform())
    wha_end = C.parameter(shape=(2 * self.hidden_dim, 2 * self.hidden_dim), init=C.glorot_uniform())
    v_end = C.parameter(shape=(2 * self.hidden_dim, 1), init=C.glorot_uniform())

    # sequence[tensor[1]] q_len x 1
    s0 = C.times(C.tanh(C.times(q_processed, wuq) + bias), v)
    a0 = C.sequence.softmax(s0)
    rQ = C.sequence.reduce_sum(a0 * q_processed)
    # sequence[tensor[1]] plen x 1
    ts = C.reshape(C.times(C.tanh(
        C.times(att_context, whp) +
        C.times(C.sequence.broadcast_as(rQ, att_context), wha)), v), (-1))
    # sequence[tensor[1]]
    ta = C.sequence.softmax(ts)
    # sequence[2d] 1 x 2d
    c0 = C.reshape(C.sequence.reduce_sum(ta * att_context), (2 * self.hidden_dim))
    # sequence[tensor[2d]]
    ha1 = C.layers.blocks.GRU(2 * self.hidden_dim)(rQ, c0)
    # sequence[tensor[1]] plen x 1
    s1 = C.reshape(C.times(C.tanh(
        C.times(att_context, whp_end) +
        C.times(C.sequence.broadcast_as(ha1, att_context), wha_end)), v_end), (-1))
    # sequence[tensor[1]] plen x 1
    a1 = C.sequence.softmax(s1)

    return C.as_block(
        C.combine([ts, s1]),
        [(att_context, attention_context), (q_processed, query)],
        'output_layer',
        'output_layer')
def test_tanh_3():
    cntk_op = C.tanh(
        [-0.9, -0.8, -0.7, -0.6, -0.5, -0.4, -0.3, -0.2, -0.1, 0.])
    cntk_ret = cntk_op.eval()

    ng_op, _ = CNTKImporter().import_model(cntk_op)
    ng_ret = ng.transformers.make_transformer().computation(ng_op)()

    assert np.isclose(cntk_ret, ng_ret).all()
def LSTMCell(x, y, dh, dc):
    '''LightLSTM Cell'''
    b = C.parameter(shape=(4 * cell_dim), init=0)
    W = C.parameter(shape=(input_dim, 4 * cell_dim), init=glorot_uniform())
    H = C.parameter(shape=(cell_dim, 4 * cell_dim), init=glorot_uniform())

    # projected contribution from input x, hidden, and bias
    proj4 = b + C.times(x, W) + C.times(dh, H)

    it_proj = C.slice(proj4, -1, 0 * cell_dim, 1 * cell_dim)
    bit_proj = C.slice(proj4, -1, 1 * cell_dim, 2 * cell_dim)
    ft_proj = C.slice(proj4, -1, 2 * cell_dim, 3 * cell_dim)
    ot_proj = C.slice(proj4, -1, 3 * cell_dim, 4 * cell_dim)

    it = C.sigmoid(it_proj)      # input gate
    bit = it * C.tanh(bit_proj)
    ft = C.sigmoid(ft_proj)      # forget gate
    bft = ft * dc
    ct = bft + bit
    ot = C.sigmoid(ot_proj)      # output gate
    ht = ot * C.tanh(ct)

    # projected contribution from input y, hidden, and bias
    proj4_2 = b + C.times(y, W) + C.times(ht, H)

    it_proj_2 = C.slice(proj4_2, -1, 0 * cell_dim, 1 * cell_dim)
    bit_proj_2 = C.slice(proj4_2, -1, 1 * cell_dim, 2 * cell_dim)
    ft_proj_2 = C.slice(proj4_2, -1, 2 * cell_dim, 3 * cell_dim)
    ot_proj_2 = C.slice(proj4_2, -1, 3 * cell_dim, 4 * cell_dim)

    it_2 = C.sigmoid(it_proj_2)      # input gate
    bit_2 = it_2 * C.tanh(bit_proj_2)
    ft_2 = C.sigmoid(ft_proj_2)      # forget gate
    bft_2 = ft_2 * ct
    ct2 = bft_2 + bit_2
    ot_2 = C.sigmoid(ot_proj_2)      # output gate
    ht2 = ot_2 * C.tanh(ct2)

    return (ht, ct, ht2, ct2)
def lstm_func(output_dim, cell_dim, x, input_dim, prev_state_h, prev_state_c):
    # input gate (t)
    it_w = C.times(C.parameter((cell_dim, input_dim)), x)
    it_b = C.parameter((cell_dim))
    it_h = C.times(C.parameter((cell_dim, output_dim)), prev_state_h)
    it_c = C.parameter((cell_dim)) * prev_state_c
    it = C.sigmoid((it_w + it_b + it_h + it_c), name='it')

    # applied to tanh of input
    bit_w = C.times(C.parameter((cell_dim, input_dim)), x)
    bit_h = C.times(C.parameter((cell_dim, output_dim)), prev_state_h)
    bit_b = C.parameter((cell_dim))
    bit = it * C.tanh(bit_w + (bit_h + bit_b))

    # forget-me-not gate (t)
    ft_w = C.times(C.parameter((cell_dim, input_dim)), x)
    ft_b = C.parameter((cell_dim))
    ft_h = C.times(C.parameter((cell_dim, output_dim)), prev_state_h)
    ft_c = C.parameter((cell_dim)) * prev_state_c
    ft = C.sigmoid((ft_w + ft_b + ft_h + ft_c), name='ft')

    # applied to cell(t-1)
    bft = ft * prev_state_c

    # c(t) = sum of both
    ct = bft + bit

    # output gate
    ot_w = C.times(C.parameter((cell_dim, input_dim)), x)
    ot_b = C.parameter((cell_dim))
    ot_h = C.times(C.parameter((cell_dim, output_dim)), prev_state_h)
    ot_c = C.parameter((cell_dim)) * prev_state_c
    ot = C.sigmoid((ot_w + ot_b + ot_h + ot_c), name='ot')

    # applied to tanh(cell(t))
    ht = ot * C.tanh(ct)

    # return cell value and hidden state
    return ct, ht
def lstm_func(output_dim, cell_dim, x, input_dim, prev_state_h, prev_state_c):
    # input gate (t)
    it_w = C.times(x, C.parameter((input_dim, cell_dim)))
    it_b = C.parameter((1, cell_dim))
    it_h = C.times(prev_state_h, C.parameter((output_dim, cell_dim)))
    it_c = C.parameter((1, cell_dim)) * prev_state_c
    it = C.sigmoid((it_w + it_b + it_h + it_c), name='it')

    # applied to tanh of input
    bit_w = C.times(x, C.parameter((input_dim, cell_dim)))
    bit_h = C.times(prev_state_h, C.parameter((output_dim, cell_dim)))
    bit_b = C.parameter((1, cell_dim))
    bit = it * C.tanh(bit_w + (bit_h + bit_b))

    # forget-me-not gate (t)
    ft_w = C.times(x, C.parameter((input_dim, cell_dim)))
    ft_b = C.parameter((1, cell_dim))
    ft_h = C.times(prev_state_h, C.parameter((output_dim, cell_dim)))
    ft_c = C.parameter((1, cell_dim)) * prev_state_c
    ft = C.sigmoid((ft_w + ft_b + ft_h + ft_c), name='ft')

    # applied to cell(t-1)
    bft = ft * prev_state_c

    # c(t) = sum of both
    ct = bft + bit

    # output gate
    ot_w = C.times(x, C.parameter((input_dim, cell_dim)))
    ot_b = C.parameter((1, cell_dim))
    ot_h = C.times(prev_state_h, C.parameter((output_dim, cell_dim)))
    ot_c = C.parameter((1, cell_dim)) * prev_state_c
    ot = C.sigmoid((ot_w + ot_b + ot_h + ot_c), name='ot')

    # applied to tanh(cell(t))
    ht = ot * C.tanh(ct)

    # return cell value and hidden state
    return ct, ht
def createNetwork(self, inputEmb, preHidden, preMem):
    WX = C.times(inputEmb, self.W) + self.Wb
    UH = C.times(preHidden, self.U) + self.Ub

    I = C.sigmoid(
        C.slice(WX, -1, 0, self.hiddenSize) +
        C.slice(UH, -1, 0, self.hiddenSize))
    O = C.sigmoid(
        C.slice(WX, -1, self.hiddenSize, self.hiddenSize * 2) +
        C.slice(UH, -1, self.hiddenSize, self.hiddenSize * 2))
    F = C.sigmoid(
        C.slice(WX, -1, self.hiddenSize * 2, self.hiddenSize * 3) +
        C.slice(UH, -1, self.hiddenSize * 2, self.hiddenSize * 3))
    N = C.tanh(
        C.slice(WX, -1, self.hiddenSize * 3, self.hiddenSize * 4) +
        C.slice(UH, -1, self.hiddenSize * 3, self.hiddenSize * 4))

    NI = C.element_times(N, I)
    FM = C.element_times(F, preMem)
    CurMem = NI + FM
    CurH = C.element_times(C.tanh(CurMem), O)

    return (CurH, CurMem)
def unit(dh, dc, x):
    ''' dh: out_dim, dc:4096, x:input_dim'''
    proj4 = b + times(x, W) + times(dh, H)

    it_proj = proj4[0:1 * stacked_dim]                  # split along stack_axis
    bit_proj = proj4[1 * stacked_dim:2 * stacked_dim]
    ft_proj = proj4[2 * stacked_dim:3 * stacked_dim]
    ot_proj = proj4[3 * stacked_dim:4 * stacked_dim]

    it = C.sigmoid(it_proj)        # input gate(t)
    # TODO: should both activations be replaced?
    bit = it * C.tanh(bit_proj)    # applied to tanh of input network
    ft = C.sigmoid(ft_proj)        # forget-me-not gate(t)
    bft = ft * dc                  # applied to cell(t-1)
    ct = bft + bit                 # c(t) is sum of both
    ot = C.sigmoid(ot_proj)        # output gate(t)
    ht = ot * C.tanh(ct)           # applied to tanh(cell(t))

    c = ct                         # cell value
    h = ht

    proj_h = C.times(h, proj_W)    # out_dim
    return (proj_h, c)
def createNetwork(self, inputEmb, preHidden, preMem=None):
    WrX = C.times(inputEmb, self.Wr) + self.Wrb
    UrH = C.times(preHidden, self.Ur)
    R = C.sigmoid(WrX + UrH)

    WzX = C.times(inputEmb, self.Wz) + self.Wzb
    UzH = C.times(preHidden, self.Uz)
    Z = C.sigmoid(WzX + UzH)

    UH = C.times(preHidden, self.U) + self.Ub
    UHR = C.element_times(UH, R)
    WX = C.times(inputEmb, self.W) + self.Wb
    HTilde = C.tanh(WX + UHR)

    CurH = C.element_times(HTilde, 1 - Z) + C.element_times(preHidden, Z)
    return (CurH, None)
def attention_pooling(inputs, inputs_mask, inputs_weights, decode, decode_weights, keys):
    """
    inputs: shape=(n, dim)
    inputs_weights: shape=(dim, dim)
    decode: shape=(1, dec_dim)
    decode_weights: shape=(dec_dim, dim)
    keys: shape=(dim, 1)
    """
    w_in = C.times(inputs, inputs_weights)    # shape=(n, dim)
    w_dec = C.times(decode, decode_weights)   # shape=(dim, 1)

    S = C.tanh(w_in + C.sequence.broadcast_as(w_dec, w_in))  # shape=(n, dim)
    S = C.element_select(inputs_mask, S, C.constant(-1e+30))
    S = C.times(S, keys)                      # shape=(n)
    S = C.ops.sequence.softmax(S, name="softmax")

    attention = C.reduce_sum(inputs * S, axis=0)
    return attention
def createNetwork(self, inputEmb, preHidden):
    WX = C.times(inputEmb, self.W) + self.Wb
    UH = C.times(preHidden, self.U) + self.Ub

    R = C.sigmoid(
        C.slice(WX, -1, 0, self.hiddenSize) +
        C.slice(UH, -1, 0, self.hiddenSize))
    Z = C.sigmoid(
        C.slice(WX, -1, self.hiddenSize, self.hiddenSize * 2) +
        C.slice(UH, -1, self.hiddenSize, self.hiddenSize * 2))

    UHR = C.element_times(
        C.slice(UH, -1, self.hiddenSize * 2, self.hiddenSize * 3), R)
    HTilde = C.tanh(
        C.slice(WX, -1, self.hiddenSize * 2, self.hiddenSize * 3) + UHR)

    CurH = C.element_times(HTilde, 1 - Z) + C.element_times(preHidden, Z)
    return CurH
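# Hedged sketch (illustrative names, not from the original class): it shows the slice
# layout the GRU step above relies on -- the last axis of WX/UH holds the reset,
# update, and candidate blocks back to back, each hiddenSize wide.
import cntk as C
import numpy as np

hiddenSize = 4
WX = C.input_variable(hiddenSize * 3)
r_block = C.slice(WX, -1, 0, hiddenSize)                   # reset-gate block
z_block = C.slice(WX, -1, hiddenSize, hiddenSize * 2)      # update-gate block
n_block = C.slice(WX, -1, hiddenSize * 2, hiddenSize * 3)  # candidate block
sample = np.arange(hiddenSize * 3, dtype=np.float32).reshape(1, -1)
print(r_block.eval({WX: sample}))   # [[0. 1. 2. 3.]]
print(z_block.eval({WX: sample}))   # [[4. 5. 6. 7.]]
print(n_block.eval({WX: sample}))   # [[ 8.  9. 10. 11.]]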
def tanh(x, name=''):
    '''
    Computes the element-wise tanh of `x`:

    The output tensor has the same shape as `x`.

    Example:
        >>> C.eval(C.tanh([[1,2],[3,4]]))
        [array([[[ 0.761594,  0.964028],
                 [ 0.995055,  0.999329]]])]

    Args:
        x: numpy array or any :class:`cntk.Function` that outputs a tensor
        name (str): the name of the node in the network

    Returns:
        :class:`cntk.Function`
    '''
    from cntk import tanh
    x = sanitize_input(x)
    return tanh(x, name).output()
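# For reference, a minimal sketch of the same computation through the CNTK 2.x API
# (the wrapper above targets the older C.eval-style interface); values match the docstring.
import cntk as C
import numpy as np

print(C.tanh(np.array([[1., 2.], [3., 4.]], dtype=np.float32)).eval())
# approximately [[0.761594, 0.964028], [0.995055, 0.999329]]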
def func(x_var):
    x = C.placeholder()
    WT = C.Parameter((dim, dim,),
                     init=transform_weight_initializer,
                     name=name + '_WT')
    bT = C.Parameter(dim,
                     init=transform_bias_initializer,
                     name=name + '_bT')
    WU = C.Parameter((dim, dim,),
                     init=update_weight_initializer,
                     name=name + '_WU')
    bU = C.parameter(dim,
                     init=update_bias_initializer,
                     name=name + '_bU')
    transform_gate = C.sigmoid(C.times(x, WT, name=name + '_T') + bT)
    update = C.tanh(C.times(x, WU, name=name + '_U') + bU)
    return C.as_block(update * transform_gate + (1 - transform_gate) * x,
                      [(x, x_var)],
                      'SingleInner',
                      'SingleInner' + name)
def createAttentionNet(self, hiddenSrc, curHiddenTrg, srcLength):
    srcHiddenSize = Config.SrcHiddenSize * 2

    hsw = C.times(hiddenSrc, self.Was)
    htw = C.times(curHiddenTrg, self.Wat)
    hst = C.reshape(
        hsw, shape=(srcLength, Config.BatchSize * Config.TrgHiddenSize)
    ) + C.reshape(htw, shape=(1, Config.BatchSize * Config.TrgHiddenSize))
    hstT = C.reshape(C.tanh(hst),
                     shape=(srcLength * Config.BatchSize, Config.TrgHiddenSize))
    attScore = C.reshape(C.times(hstT, self.Wav),
                         shape=(srcLength, Config.BatchSize))

    maskOut = (C.slice(self.maskMatrixSrc, 0, 0, srcLength) - 1) * 99999999
    nAttScore = attScore + maskOut

    attProb = C.reshape(C.softmax(nAttScore, axis=0),
                        shape=(srcLength, Config.BatchSize, 1))
    attVector = hiddenSrc * attProb
    contextVector = C.reduce_sum(
        C.reshape(attVector, shape=(srcLength, Config.BatchSize * srcHiddenSize)),
        axis=0)
    contextVector = C.reshape(contextVector,
                              shape=(1, Config.BatchSize, srcHiddenSize))

    return (contextVector, attProb)
def test_Tanh(tmpdir):
    model = C.tanh([[1,2],[3,4]])
    verify_no_input(model, tmpdir, 'Tanh_0')
def test_Tanh(tmpdir, dtype):
    with C.default_options(dtype=dtype):
        model = C.tanh(np.array([[1, 2], [3, 4]]).astype(dtype))
        verify_no_input(model, tmpdir, 'Tanh_0')
def test_Tanh(tmpdir, dtype):
    with C.default_options(dtype = dtype):
        model = C.tanh(np.array([[1,2],[3,4]]).astype(dtype))
        verify_no_input(model, tmpdir, 'Tanh_0')
#%%
def true_density(z):
    z1, z2 = z[0], z[1]
    w1 = lambda x: C.sin(2 * np.pi * x / 4)
    u = 0.5 * C.square((z2 - w1(z1)) / 0.4)
    dummy = C.ones_like(u) * 1e7
    # u = C.element_select(C.less_equal(z1,4), u, dummy)
    cond = C.less_equal(z1, 4)
    u = C.element_select(cond, u, dummy)
    # u = cond*u + (1-cond)*dummy
    return C.exp(-u)

#%%
h = lambda x: C.tanh(x)
h_prime = lambda x: 1 - C.square(C.tanh(x))

base_dist = MultivariateNormalDiag(loc=[0., 0.], scale_diag=[1., 1.])

z_0 = C.input_variable(base_dist.size(), name='sampled')
z_prev = z_0
sum_log_det_jacob = 0.

initializer = C.initializer.uniform(1)
for i in range(K):
    u = C.parameter((2), name='u', init=initializer)
    w = C.parameter((2), name='w', init=initializer)
    b = C.parameter((1), name='b', init=initializer)

    psi = h_prime(C.dot(w, z_prev) + b) * w
    det_jacob = C.abs(1 + C.dot(u, psi))
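# Hedged numeric check (not in the original notebook cell): the analytic derivative
# h_prime(x) = 1 - tanh(x)^2 used above should agree with a central finite difference.
import cntk as C
import numpy as np

xs = np.array([-1.0, 0.0, 1.0], dtype=np.float32)
analytic = (1 - C.square(C.tanh(xs))).eval()
eps = 1e-3
numeric = (C.tanh(xs + eps).eval() - C.tanh(xs - eps).eval()) / (2 * eps)
print(np.allclose(analytic, numeric, atol=1e-3))   # True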
def LSTM(shape, _inf, cell_shape=None, use_peepholes=False, init=_default_initializer,
         init_bias=0, enable_self_stabilization=False):  # (x, (h, c))
    has_projection = cell_shape is not None
    has_aux = False

    if has_aux:
        UntestedBranchError("LSTM, has_aux option")
    if enable_self_stabilization:
        UntestedBranchError("LSTM, enable_self_stabilization option")

    shape = _as_tuple(shape)
    cell_shape = _as_tuple(cell_shape) if cell_shape is not None else shape

    #stack_axis = -1
    stack_axis = 0  # BUGBUG: should be -1, i.e. the fastest-changing one, to match BS

    # determine stacking dimensions
    cell_shape_list = list(cell_shape)
    stacked_dim = cell_shape_list[0]
    cell_shape_list[stack_axis] = stacked_dim * 4
    cell_shape_stacked = tuple(cell_shape_list)  # patched dims with stack_axis duplicated 4 times

    # parameters
    b = Parameter(cell_shape_stacked, init=init_bias, name='b')                   # a bias
    W = Parameter(_inf.shape + cell_shape_stacked, init=init, name='W')           # input
    A = Parameter(_inf.shape + cell_shape_stacked, init=init, name='A') if has_aux else None  # aux input (optional)
    H = Parameter(shape + cell_shape_stacked, init=init, name='H')                # hidden-to-hidden
    Ci = Parameter(cell_shape, init=init, name='Ci') if use_peepholes else None   # cell-to-hiddden {note: applied elementwise}
    Cf = Parameter(cell_shape, init=init, name='Cf') if use_peepholes else None   # cell-to-hiddden {note: applied elementwise}
    Co = Parameter(cell_shape, init=init, name='Co') if use_peepholes else None   # cell-to-hiddden {note: applied elementwise}

    Wmr = ParameterTensor(cell_shape + shape, init=init,
                          init_value_scale=init_value_scale) if has_projection else None  # final projection

    Sdh = Stabilizer(_inf=_inf.with_shape(shape)) if enable_self_stabilization else Identity(_inf=_inf.with_shape(shape))
    Sdc = Stabilizer(_inf=_inf.with_shape(cell_shape)) if enable_self_stabilization else Identity(_inf=_inf.with_shape(cell_shape))
    Sct = Stabilizer(_inf=_inf.with_shape(cell_shape)) if enable_self_stabilization else Identity(_inf=_inf.with_shape(cell_shape))
    Sht = Stabilizer(_inf=_inf.with_shape(shape)) if enable_self_stabilization else Identity(_inf=_inf.with_shape(shape))

    def create_hc_placeholder():
        return (Placeholder(_inf=_inf.with_shape(shape), name='hPh'),
                Placeholder(_inf=_inf.with_shape(cell_shape), name='cPh'))  # (h, c)

    # parameters to model function
    x = Placeholder(_inf=_inf, name='lstm_block_arg')
    prev_state = create_hc_placeholder()

    # formula of model function
    dh, dc = prev_state

    dhs = Sdh(dh)  # previous values, stabilized
    dcs = Sdc(dc)
    # note: input does not get a stabilizer here, user is meant to do that outside

    # projected contribution from input(s), hidden, and bias
    proj4 = b + times(x, W) + times(dhs, H) + times(aux, A) if has_aux else \
            b + times(x, W) + times(dhs, H)

    it_proj = slice(proj4, stack_axis, 0 * stacked_dim, 1 * stacked_dim)  # split along stack_axis
    bit_proj = slice(proj4, stack_axis, 1 * stacked_dim, 2 * stacked_dim)
    ft_proj = slice(proj4, stack_axis, 2 * stacked_dim, 3 * stacked_dim)
    ot_proj = slice(proj4, stack_axis, 3 * stacked_dim, 4 * stacked_dim)

    # add peephole connection if requested
    def peep(x, c, C):
        return x + C * c if use_peepholes else x

    it = sigmoid(peep(it_proj, dcs, Ci))      # input gate(t)
    bit = it * tanh(bit_proj)                 # applied to tanh of input network
    ft = sigmoid(peep(ft_proj, dcs, Cf))      # forget-me-not gate(t)
    bft = ft * dc                             # applied to cell(t-1)
    ct = bft + bit                            # c(t) is sum of both
    ot = sigmoid(peep(ot_proj, Sct(ct), Co))  # output gate(t)
    ht = ot * tanh(ct)                        # applied to tanh(cell(t))

    c = ct                                    # cell value
    h = times(Sht(ht), Wmr) if has_projection else \
        ht

    _name_node(h, 'h')
    if _trace_layers:
        _log_node(h)  # this looks right
    _name_node(c, 'c')

    # TODO: figure out how to do scoping, and also rename all the apply... to expression
    apply_x_h_c = combine([h, c])
    # return to caller a helper function to create placeholders for recurrence
    apply_x_h_c.create_placeholder = create_hc_placeholder
    _name_and_extend_Function(apply_x_h_c, 'LSTM')
    return apply_x_h_c
def test_Tanh(tmpdir):
    model = C.tanh([[1, 2], [3, 4]])
    verify_no_input(model, tmpdir, 'Tanh_0')
def test_tanh():
    assert_cntk_ngraph_isclose(C.tanh([-2, -1., 0., 1., 2.]))
    assert_cntk_ngraph_isclose(C.tanh([0.]))
    assert_cntk_ngraph_isclose(
        C.tanh([-0.9, -0.8, -0.7, -0.6, -0.5, -0.4, -0.3, -0.2, -0.1, 0.]))
def attention_weight(h_enc, h_dec, inputs_dim):
    enc = C.layers.Dense(inputs_dim, name='out_start')(h_enc)
    dec = C.sequence.broadcast_as(
        C.layers.Dense(inputs_dim, name='out_start')(h_dec), enc)
    att_weight = C.layers.Dense(1, name='out_start')(C.tanh(enc + dec))
    att_weight = C.sequence.softmax(att_weight)
    return att_weight
def createDecoderInitNetwork(self, srcSentEmb):
    WIS = C.times(srcSentEmb, self.WI) + self.WIb
    return C.tanh(WIS)
def inner(a):
    return a * C.tanh(C.softplus(a))
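# Hedged note (illustrative, not from the original source): `inner` above is the
# Mish-style activation x * tanh(softplus(x)); here it is evaluated on a constant.
import cntk as C
import numpy as np

sample = np.array([-2.0, 0.0, 2.0], dtype=np.float32)
print(sample * C.tanh(C.softplus(sample)).eval())   # elementwise x * tanh(softplus(x))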
def dcgan_generator(h):
    with C.layers.default_options(init=C.normal(0.02), pad=True, bias=False,
                                  map_rank=1, use_cntk_engine=True):
        h = C.reshape(h, (-1, 1, 1))

        h = ConvolutionTranspose2D((4, 4), 1024, pad=False, strides=1,
                                   output_shape=(4, 4))(h)
        h = BatchNormalization()(h)
        h = C.relu(h)

        h = ConvolutionTranspose2D((5, 5), 512, strides=2,
                                   output_shape=(img_height // 32, img_width // 32))(h)
        h = BatchNormalization()(h)
        h = C.relu(h)

        h = ConvolutionTranspose2D((5, 5), 256, strides=2,
                                   output_shape=(img_height // 16, img_width // 16))(h)
        h = BatchNormalization()(h)
        h = C.relu(h)

        h = ConvolutionTranspose2D((5, 5), 128, strides=2,
                                   output_shape=(img_height // 8, img_width // 8))(h)
        h = BatchNormalization()(h)
        h = C.relu(h)

        h = ConvolutionTranspose2D((5, 5), 64, strides=2,
                                   output_shape=(img_height // 4, img_width // 4))(h)
        h = BatchNormalization()(h)
        h = C.relu(h)

        h = ConvolutionTranspose2D((5, 5), 32, strides=2,
                                   output_shape=(img_height // 2, img_width // 2))(h)
        h = BatchNormalization()(h)
        h = C.relu(h)

        h = ConvolutionTranspose2D((5, 5), 3, strides=2, bias=True,
                                   output_shape=(img_height, img_width))(h)
        h = C.tanh(h)

        return h
def gelu(x):
    return 0.5 * x * (
        1 + C.tanh(np.sqrt(2 / np.pi) * (x + 0.044715 * C.pow(x, 3))))
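# Hedged usage sketch for gelu above (the input variable and sample values are
# illustrative assumptions): the tanh approximation stays close to x for large
# positive inputs and close to 0 for large negative inputs.
import cntk as C
import numpy as np

x_in = C.input_variable(4)
y = gelu(x_in)
print(y.eval({x_in: np.array([[-3.0, -1.0, 1.0, 3.0]], dtype=np.float32)}))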