def create_detection_losses(cls_score, label_targets, rois, bbox_pred, bbox_targets, bbox_inside_weights):
    # classification loss
    cls_loss = cross_entropy_with_softmax(cls_score, label_targets, axis=1)

    p_cls_loss = placeholder()
    p_rois = placeholder()
    # The terms that are accounted for in the cls loss are those that correspond to an actual roi proposal --> do not count no-op (all-zero) rois
    roi_indicator = reduce_sum(p_rois, axis=1)
    cls_num_terms = reduce_sum(cntk.greater_equal(roi_indicator, 0.0))
    cls_normalization_factor = 1.0 / cls_num_terms
    normalized_cls_loss = reduce_sum(p_cls_loss) * cls_normalization_factor
    reduced_cls_loss = cntk.as_block(normalized_cls_loss,
                                     [(p_cls_loss, cls_loss), (p_rois, rois)],
                                     'Normalize', 'norm_cls_loss')

    # regression loss
    p_bbox_pred = placeholder()
    p_bbox_targets = placeholder()
    p_bbox_inside_weights = placeholder()
    bbox_loss = SmoothL1Loss(cfg["CNTK"].SIGMA_DET_L1, p_bbox_pred, p_bbox_targets, p_bbox_inside_weights, 1.0)
    # The bbox loss is normalized by the batch size
    bbox_normalization_factor = 1.0 / cfg["TRAIN"].BATCH_SIZE
    normalized_bbox_loss = reduce_sum(bbox_loss) * bbox_normalization_factor
    reduced_bbox_loss = cntk.as_block(normalized_bbox_loss,
                                      [(p_bbox_pred, bbox_pred), (p_bbox_targets, bbox_targets), (p_bbox_inside_weights, bbox_inside_weights)],
                                      'SmoothL1Loss', 'norm_bbox_loss')

    detection_losses = plus(reduced_cls_loss, reduced_bbox_loss, name="detection_losses")

    return detection_losses
def multi_headed_self_attention_layer(in_dims: int, hidden_dims: int, num_of_head: int,
                                      name='multi_headed_self_attention', as_block: bool = False,
                                      k_ph: bool = False, v_ph: bool = False, mask_opt: bool = False) -> C.Function:
    X = C.placeholder(in_dims, (C.Axis.default_batch_axis(), C.Axis.default_dynamic_axis()), name=name + '_ph')

    outputs = []
    if k_ph is False and v_ph is False:
        for i in range(num_of_head):
            layer = self_attention_layer(in_dims, hidden_dims, name=name + str(i),
                                         as_block=not as_block, mask_opt=mask_opt)
            outputs.append(layer(X))
    elif k_ph is True and v_ph is True:
        k_ = C.placeholder(in_dims, (C.Axis.default_batch_axis(), C.Axis('kv_seq')), name=name + '_k_ph')  # -3: sequence axis
        v_ = C.placeholder(in_dims, (C.Axis.default_batch_axis(), C.Axis('kv_seq')), name=name + '_v_ph')
        for i in range(num_of_head):
            layer = self_attention_layer(in_dims, in_dims, name=name + str(i),
                                         as_block=not as_block, k_ph=k_ph, v_ph=v_ph)
            outputs.append(layer(X, k_, v_))
    else:
        raise Exception(f'k_ph:{k_ph}, v_ph:{v_ph}')

    concat = C.splice(*outputs, name='concat')
    result = C.layers.Dense(in_dims, name='W_o')(concat)

    # init = C.initializer.normal(1)
    # W_O = C.parameter((in_dims, hidden_dims*num_of_head), init=init, name=name+'_Wo')
    # result = C.times_transpose(concat, W_O, name='result')

    if as_block is True:
        if k_ph is False and v_ph is False:
            result = C.as_block(result, [(X, X)], 'multi_headed_self_attention', 'multi_headed_self_attention_')
        elif k_ph is True and v_ph is True:
            result = C.as_block(result, [(X, X), (k_, k_), (v_, v_)],
                                'multi_headed_self_attention', 'multi_headed_self_attention_')
        else:
            raise Exception(f'k_ph:{k_ph} v_ph:{v_ph}')

    return result
def create_detection_losses(cls_score, label_targets, bbox_pred, rois, bbox_targets, bbox_inside_weights, cfg):
    # The losses are normalized by the batch size

    # classification loss
    p_cls_score = placeholder()
    p_label_targets = placeholder()
    cls_loss = cross_entropy_with_softmax(p_cls_score, p_label_targets, axis=1)
    cls_normalization_factor = 1.0 / cfg.NUM_ROI_PROPOSALS
    normalized_cls_loss = reduce_sum(cls_loss) * cls_normalization_factor
    reduced_cls_loss = cntk.as_block(normalized_cls_loss,
                                     [(p_cls_score, cls_score), (p_label_targets, label_targets)],
                                     'CrossEntropyWithSoftmax', 'norm_cls_loss')

    # regression loss
    p_bbox_pred = placeholder()
    p_bbox_targets = placeholder()
    p_bbox_inside_weights = placeholder()
    bbox_loss = SmoothL1Loss(cfg.SIGMA_DET_L1, p_bbox_pred, p_bbox_targets, p_bbox_inside_weights, 1.0)
    bbox_normalization_factor = 1.0 / cfg.NUM_ROI_PROPOSALS
    normalized_bbox_loss = reduce_sum(bbox_loss) * bbox_normalization_factor
    reduced_bbox_loss = cntk.as_block(normalized_bbox_loss,
                                      [(p_bbox_pred, bbox_pred), (p_bbox_targets, bbox_targets), (p_bbox_inside_weights, bbox_inside_weights)],
                                      'SmoothL1Loss', 'norm_bbox_loss')

    detection_losses = plus(reduced_cls_loss, reduced_bbox_loss, name="detection_losses")

    return detection_losses
def simi_attention(self, input, memory):
    '''
    return:
        memory weighted vectors over input [#,c][d]
        weight
    '''
    input_ph = C.placeholder()  # [#,c][d]
    mem_ph = C.placeholder()    # [#,q][d]

    input_dense = Dense(2 * self.hidden_dim, bias=False, input_rank=1)
    mem_dense = Dense(2 * self.hidden_dim, bias=False, input_rank=1)
    bias = C.Parameter(shape=(2 * self.hidden_dim, ), init=0.0)
    weight_dense = Dense(1, bias=False, input_rank=1)

    proj_inp = input_dense(input_ph)  # [#,c][d]
    proj_mem = mem_dense(mem_ph)      # [#,q][d]
    unpack_memory, mem_mask = C.sequence.unpack(proj_mem, 0).outputs  # [#][*=q, d] [#][*=q]
    expand_mem = C.sequence.broadcast_as(unpack_memory, proj_inp)     # [#,c][*=q,d]
    expand_mask = C.sequence.broadcast_as(mem_mask, proj_inp)         # [#,c][*=q]
    matrix = C.reshape(weight_dense(C.tanh(proj_inp + expand_mem + bias)), (-1, ))  # [#,c][*=q]
    matrix = C.element_select(expand_mask, matrix, -1e30)
    logits = C.softmax(matrix, axis=0)  # [#,c][*=q]
    weight_mem = C.reduce_sum(C.reshape(logits, (-1, 1)) * expand_mem, axis=0)  # [#,c][d]
    weight_mem = C.reshape(weight_mem, (-1, ))

    return C.as_block(C.combine(weight_mem, logits),
                      [(input_ph, input), (mem_ph, memory)],
                      'simi_attention', 'simi_attention')
def triangular_matrix_seq(mode: int = 1):
    X = C.placeholder(1)
    ones = C.ones_like(X[0])
    perm_1 = C.layers.Recurrence(C.plus, return_full_state=True)(ones)
    perm_2 = C.layers.Recurrence(C.plus, go_backwards=True, return_full_state=True)(ones)

    arr_1 = C.sequence.unpack(perm_1, 0, True)
    arr_2 = C.sequence.unpack(perm_2, 0, True)

    mat = C.times_transpose(arr_1, arr_2)
    mat_c = arr_1 * arr_2

    diagonal_mat = mat - mat_c

    final_mat = diagonal_mat
    if mode == 0:
        final_mat = C.equal(final_mat, 0)
    elif mode == 1:
        final_mat = C.less_equal(final_mat, 0)
    elif mode == 2:
        final_mat = C.less(final_mat, 0)
    elif mode == -1:
        final_mat = C.greater_equal(final_mat, 0)
    elif mode == -2:
        final_mat = C.greater(final_mat, 0)

    result = C.as_block(final_mat, [(X, X)], 'triangular_matrix')

    return C.stop_gradient(result)
def input_layer(self,cgw,cnw,cc,qgw,qnw,qc):
    cgw_ph = C.placeholder()
    cnw_ph = C.placeholder()
    cc_ph = C.placeholder()
    qgw_ph = C.placeholder()
    qnw_ph = C.placeholder()
    qc_ph = C.placeholder()

    input_chars = C.placeholder(shape=(1,self.word_size,self.c_dim))
    input_glove_words = C.placeholder(shape=(self.wg_dim,))
    input_nonglove_words = C.placeholder(shape=(self.wn_dim,))

    # we need to reshape because GlobalMaxPooling/reduce_max is retaining a trailing singleton dimension
    # todo GlobalPooling/reduce_max should have a keepdims default to False
    embedded = C.splice(
        C.reshape(self.charcnn(input_chars), self.convs),
        self.embed()(input_glove_words, input_nonglove_words), name='splice_embed')

    processed = C.layers.Sequential([For(range(2),
        lambda: OptimizedRnnStack(self.hidden_dim, bidirectional=True,
                                  use_cudnn=self.use_cudnn, name='input_rnn'))])(embedded)

    qce = C.one_hot(qc_ph, num_classes=self.c_dim, sparse_output=self.use_sparse)
    cce = C.one_hot(cc_ph, num_classes=self.c_dim, sparse_output=self.use_sparse)

    q_processed = processed.clone(C.CloneMethod.share,
        {input_chars:qce, input_glove_words:qgw_ph, input_nonglove_words:qnw_ph})
    c_processed = processed.clone(C.CloneMethod.share,
        {input_chars:cce, input_glove_words:cgw_ph, input_nonglove_words:cnw_ph})

    return C.as_block(
        C.combine([c_processed, q_processed]),
        [(cgw_ph, cgw),(cnw_ph, cnw),(cc_ph, cc),(qgw_ph, qgw),(qnw_ph, qnw),(qc_ph, qc)],
        'input_layer',
        'input_layer')
def output_layer(self, attention_context, modeling_context):
    att_context = C.placeholder()
    mod_context = C.placeholder()

    #output layer [#,c][1]
    start_logits = C.layers.Dense(1, name='out_start')(C.dropout(
        C.splice(mod_context, att_context), self.dropout))
    start_logits = C.sequence.softmax(start_logits)
    start_hardmax = seq_hardmax(start_logits)  # [000010000]
    att_mod_ctx = C.sequence.last(C.sequence.gather(mod_context, start_hardmax))  # [#][2*hidden_dim]
    att_mod_ctx_expanded = C.sequence.broadcast_as(att_mod_ctx, att_context)
    end_input = C.splice(att_context, mod_context, att_mod_ctx_expanded,
                         mod_context * att_mod_ctx_expanded)  # [#, c][14*hidden_dim]
    m2 = OptimizedRnnStack(self.hidden_dim, bidirectional=True,
                           use_cudnn=self.use_cudnn, name='output_rnn')(end_input)
    end_logits = C.layers.Dense(1, name='out_end')(C.dropout(
        C.splice(m2, att_context), self.dropout))
    end_logits = C.sequence.softmax(end_logits)

    return C.as_block(C.combine([start_logits, end_logits]),
                      [(att_context, attention_context), (mod_context, modeling_context)],
                      'output_layer', 'output_layer')
def test_block_with_unused_outputs():
    p1 = C.placeholder()
    p3 = C.placeholder()
    func1 = C.as_block(p1 + 1, [(p1, p3)], 'plus_func_1')
    p2 = C.placeholder()
    p4 = C.placeholder()
    func2 = C.as_block(p2 + 1, [(p2, p4)], 'plus_func_2')
    p5 = C.placeholder()
    func3 = C.as_block(C.combine([func2]), [(p4, p5)], 'empty_block')
    input_var1 = C.input_variable(shape=())
    input_var2 = C.input_variable(shape=())
    block = C.as_block(C.combine([func1, func3]), [(p3, input_var1), (p5, input_var2)], 'multi_output_block')

    eval_root = C.combine([block.outputs[0]])
    result = eval_root.eval({input_var1 : np.asarray([3], dtype=np.float32),
                             input_var2 : np.asarray([-3], dtype=np.float32)})
    assert np.array_equal(result, [ 4.])
def gated_attention_gru_layer(self, context, query):
    q_processed = C.placeholder(shape=(2*self.hidden_dim,))
    c_processed = C.placeholder(shape=(2*self.hidden_dim,))

    #gate weight
    Wg = C.parameter(shape=(4*self.hidden_dim, 4*self.hidden_dim))
    att_gru = C.layers.GRU(2*self.hidden_dim)
    attention_model = C.layers.AttentionModel(self.hidden_dim, name='attention_model')

    @C.Function
    def out_func0(att_input, enc_input):
        enc_input2 = enc_input

        @C.Function
        def gru_with_attention(dh, x):
            c_att = attention_model(att_input, x)
            x = C.splice(x, c_att)
            x = C.element_times(x, C.sigmoid(C.times(x, Wg)))
            return att_gru(dh, x)
        att_context = Recurrence(gru_with_attention)(enc_input2)
        return att_context

    att_context = out_func0(q_processed, c_processed)

    return C.as_block(
        att_context,
        [(c_processed, context), (q_processed, query)],
        'gated_attention_gru_layer',
        'gated_attention_gru_layer')
def scale_dot_product_attention_block(self, contextQ, contextV, contextK, name):
    Q = C.placeholder(shape=(2 * self.hidden_dim, ), dynamic_axes=[self.b_axis, self.q_axis])
    V = C.placeholder(shape=(2 * self.hidden_dim, ), dynamic_axes=[self.b_axis, self.q_axis])
    K = C.placeholder(shape=(2 * self.hidden_dim, ), dynamic_axes=[self.b_axis, self.q_axis])

    Ql = C.layers.Dense(100)(Q)
    Vl = C.layers.Dense(100)(V)
    Kl = C.layers.Dense(100)(K)

    kvw, kvw_mask = C.sequence.unpack(Kl, padding_value=0).outputs
    vvw, _ = C.sequence.unpack(Vl, padding_value=0).outputs
    KT = C.swapaxes(kvw)

    S = C.reshape(C.times(Ql, KT) / math.sqrt(100), -1)
    kvw_mask_expanded = C.sequence.broadcast_as(kvw_mask, Ql)
    S = C.softmax(C.element_select(kvw_mask_expanded, S, C.constant(-1e+30)))
    att = C.times(S, vvw)

    return C.as_block(att, [(Q, contextQ), (V, contextV), (K, contextK)],
                      'sdp_attention_block' + name, 'sdp_attention_block' + name)
def _func(x):
    input_ph = C.placeholder()
    ph = C.placeholder()
    onehot_value = C.one_hot(ph,262)
    x1 = C.times(onehot_value, self.char_embed) # [#,*][50,16]
    # x2 = self.convs[0](x1) # [#,*][32,50,1]
    convs_res = []
    for i in range(self.filter_num):
        conv_res = self.convs[i](x1)
        convs_res.append(C.reshape(C.reduce_max(conv_res, axis=1),(-1,)))
    token_embed = C.splice(*convs_res) # [#,*][2048]

    tmp_res = token_embed
    for i in range(self.highway_num):
        tmp_res = self.highways[i](tmp_res)
    highway_out=tmp_res # [#,*][2048]

    proj_out = self.proj(highway_out) # [#,*][512]

    if not require_train:
        res = proj_out.clone(C.CloneMethod.freeze, {ph:input_ph})
    else:
        res = proj_out.clone(C.CloneMethod.clone, {ph:input_ph})
    return C.as_block(
        res, [(input_ph, x)], 'elmo_char_encoder', 'elmo_char_encoder'
    )
def matching_attention_layer(self, attention_context):
    att_context = C.placeholder(shape=(2*self.hidden_dim,))

    #matching layer
    matching_model = C.layers.AttentionModel(attention_dim=self.hidden_dim, name='attention_model')

    #gate weight
    Wg = C.parameter(shape=(2*self.hidden_dim, 2*self.hidden_dim))

    #gru
    att_gru = C.layers.GRU(self.hidden_dim)

    @C.Function
    def out_func1(att_input, enc_input):
        enc_input2 = enc_input

        @C.Function
        def bigru_with_match(dh, x):
            c_att = matching_model(att_input, dh)
            x = C.splice(x, c_att)
            x = C.element_times(x, C.sigmoid(C.times(x, Wg)))
            return att_gru(dh, x)
        return C.splice(C.layers.Recurrence(bigru_with_match)(enc_input2),
                        C.layers.Recurrence(bigru_with_match, go_backwards=True)(enc_input2),
                        name="bigru_with_match")

    match_context = out_func1(att_context, att_context)

    return C.as_block(
        match_context,
        [(att_context, attention_context)],
        'matching_attention_layer',
        'matching_attention_layer')
def output_layer(self, attention_context, modeling_context):
    att_context = C.placeholder(shape=(8 * self.hidden_dim, ))
    mod_context = C.placeholder(shape=(2 * self.hidden_dim, ))

    #output layer
    start_logits = C.layers.Dense(1, name='out_start')(C.dropout(
        C.splice(mod_context, att_context), self.dropout))
    if self.two_step:
        start_hardmax = seq_hardmax(start_logits)
        att_mod_ctx = C.sequence.last(C.sequence.gather(mod_context, start_hardmax))
    else:
        start_prob = C.softmax(start_logits)
        att_mod_ctx = C.sequence.reduce_sum(mod_context * start_prob)
    att_mod_ctx_expanded = C.sequence.broadcast_as(att_mod_ctx, att_context)
    end_input = C.splice(att_context, mod_context, att_mod_ctx_expanded,
                         mod_context * att_mod_ctx_expanded)
    m2 = OptimizedRnnStack(self.hidden_dim, bidirectional=True,
                           use_cudnn=self.use_cudnn, name='output_rnn')(end_input)
    end_logits = C.layers.Dense(1, name='out_end')(C.dropout(
        C.splice(m2, att_context), self.dropout))

    return C.as_block(C.combine([start_logits, end_logits]),
                      [(att_context, attention_context), (mod_context, modeling_context)],
                      'output_layer', 'output_layer')
def input_layer(self, cgw, cnw, qgw, qnw):
    cgw_ph = C.placeholder()
    cnw_ph = C.placeholder()
    qgw_ph = C.placeholder()
    qnw_ph = C.placeholder()

    input_glove_words = C.placeholder(shape=(self.wg_dim, ))
    input_nonglove_words = C.placeholder(shape=(self.wn_dim, ))

    # we need to reshape because GlobalMaxPooling/reduce_max is retaining a trailing singleton dimension
    # todo GlobalPooling/reduce_max should have a keepdims default to False
    embedded = self.word_glove()(input_glove_words, input_nonglove_words)
    highway = HighwayNetwork(dim=self.word_emb_dim,
                             highway_layers=self.highway_layers)(embedded)
    highway_drop = C.layers.Dropout(self.dropout)(highway)
    processed = OptimizedRnnStack(self.hidden_dim, bidirectional=True,
                                  use_cudnn=self.use_cudnn, name='input_rnn')(highway_drop)

    q_processed = processed.clone(C.CloneMethod.share, {
        input_glove_words: qgw_ph,
        input_nonglove_words: qnw_ph
    })
    c_processed = processed.clone(C.CloneMethod.share, {
        input_glove_words: cgw_ph,
        input_nonglove_words: cnw_ph
    })

    return C.as_block(C.combine([c_processed, q_processed]),
                      [(cgw_ph, cgw), (cnw_ph, cnw), (qgw_ph, qgw), (qnw_ph, qnw)],
                      'input_layer', 'input_layer')
def BinaryConvolution(operand,
                      filter_shape,
                      num_filters=1,
                      channels = 1,
                      init=C.glorot_uniform(),
                      pad=False,
                      strides=1,
                      bias=True,
                      init_bias=0,
                      op_name='BinaryConvolution', name=''):
    """ arguments:
            operand: tensor to convolve
            filter_shape: tuple indicating filter size
            num_filters: number of filters to use
            channels: number of incoming channels
            init: type of initialization to use for weights
    """
    kernel_shape = (num_filters, channels) + filter_shape
    W = C.parameter(shape=kernel_shape, init=init, name="filter")

    binary_convolve_operand_p = C.placeholder(operand.shape, operand.dynamic_axes, name="operand")
    binary_convolve = C.convolution(CustomMultibit(W, 1), CustomMultibit(binary_convolve_operand_p, 1),
                                    auto_padding=[False, pad, pad], strides=[strides])
    r = C.as_block(binary_convolve, [(binary_convolve_operand_p, operand)], 'binary_convolve')

    bias_shape = (num_filters, 1, 1)
    b = C.parameter(shape=bias_shape, init=init_bias, name="bias")
    r = r + b

    # apply learnable param relu
    P = C.parameter(shape=r.shape, init=init, name="prelu")
    r = C.param_relu(P, r)
    return r
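# Usage sketch (not part of the original source): applying BinaryConvolution to a
# single-channel 28x28 input. CustomMultibit is assumed to be the project's own
# binarization user-function available in the surrounding module; the shapes and
# filter count below are illustrative only.
z = C.input_variable((1, 28, 28))
bconv1 = BinaryConvolution(z, filter_shape=(3, 3), num_filters=32, channels=1, pad=True)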
def modeling_layer(self, attention_context):
    att_context = C.placeholder(shape=(8 * self.hidden_dim, ))

    self._indrnn_builder._input_size = 8 * self.hidden_dim
    ind1 = [self._indrnn_builder.build(), self._indrnn_builder.build()]
    self._indrnn_builder._input_size = 2 * self.hidden_dim
    indrnns = [self._indrnn_builder.build() for _ in range(10)]
    indrnns = ind1 + indrnns

    #modeling layer 6 resnet layers
    model = C.layers.For(
        range(3),
        lambda i: C.layers.Sequential([
            #C.layers.ResNetBlock(
            C.layers.Sequential([
                C.layers.LayerNormalization() if self.use_layerbn else C.layers.identity,
                C.layers.Dropout(self.dropout),
                (C.layers.Recurrence(indrnns[4 * i]),
                 C.layers.Recurrence(indrnns[4 * i + 1], go_backwards=True)),
                C.splice,
                C.layers.LayerNormalization() if self.use_layerbn else C.layers.identity,
                C.layers.Dropout(self.dropout),
                (C.layers.Recurrence(indrnns[4 * i + 2]),
                 C.layers.Recurrence(indrnns[4 * i + 3], go_backwards=True)),
                C.splice
            ])
            #)
        ]))

    mod_context = model(att_context)

    return C.as_block(mod_context, [(att_context, attention_context)],
                      'modeling_layer', 'modeling_layer')
def input_layer(self, embed, cgw,cnw,cc,qgw,qnw,qc):
    cgw_ph = C.placeholder()
    cnw_ph = C.placeholder()
    cc_ph = C.placeholder()
    qgw_ph = C.placeholder()
    qnw_ph = C.placeholder()
    qc_ph = C.placeholder()

    input_chars = C.placeholder(shape=(1,self.word_size,self.c_dim))
    input_glove_words = C.placeholder(shape=(self.wg_dim,))
    input_nonglove_words = C.placeholder(shape=(self.wn_dim,))

    # we need to reshape because GlobalMaxPooling/reduce_max is retaining a trailing singleton dimension
    # todo GlobalPooling/reduce_max should have a keepdims default to False
    embedded = C.splice(
        C.reshape(self.charcnn(input_chars), self.convs),
        embed(input_glove_words, input_nonglove_words), name='splice_embed')
    highway = HighwayNetwork(dim=2*self.hidden_dim, highway_layers=self.highway_layers)(embedded)
    highway_drop = C.layers.Dropout(self.dropout)(highway)
    processed = OptimizedRnnStack(self.hidden_dim, bidirectional=True,
                                  use_cudnn=self.use_cudnn, name='input_rnn')(highway_drop)

    qce = C.one_hot(qc_ph, num_classes=self.c_dim, sparse_output=self.use_sparse)
    cce = C.one_hot(cc_ph, num_classes=self.c_dim, sparse_output=self.use_sparse)
    # ace = C.one_hot(ac_ph, num_classes=self.c_dim, sparse_output=self.use_sparse)

    q_processed = processed.clone(C.CloneMethod.share,
        {input_chars:qce, input_glove_words:qgw_ph, input_nonglove_words:qnw_ph})
    c_processed = processed.clone(C.CloneMethod.share,
        {input_chars:cce, input_glove_words:cgw_ph, input_nonglove_words:cnw_ph})

    return C.as_block(
        C.combine([c_processed, q_processed]),
        [(cgw_ph, cgw),(cnw_ph, cnw),(cc_ph, cc),(qgw_ph, qgw),(qnw_ph, qnw),(qc_ph, qc)],
        'input_layer',
        'input_layer')
def Block(f, op_name, name='', members={}, make_block=False):
    if make_block:
        inner_args = f.arguments
        args_map = [(arg, Placeholder(name=arg.name)) for arg in inner_args]
        f = as_block(f, args_map, op_name, name)
    for key in members:
        f.__dict__[key] = members[key]
    return f
def basic_network(cls, dims: int, op_name: str = '', instance_name: str = ''):
    ph = C.placeholder(dims, name='net_input')
    net = C.layers.Sequential([
        C.layers.Dense(8, C.tanh),
        C.layers.Dense(8, C.tanh),
        C.layers.Dense(dims, name='net_output'),
    ])(ph)
    return C.as_block(net, [(ph, ph)], op_name, instance_name)
def positional_encoding(token_dims: int, discount_factor: float = 0.99):
    X = C.placeholder(token_dims, name='positional_encoding')
    encoder = C.layers.Recurrence(C.element_times, initial_state=1,
                                  return_full_state=True)(C.ones_like(X) * discount_factor)
    return C.stop_gradient(C.as_block(encoder, [(X, X)], 'positional_encoding', 'positional_encoding_'))
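# Usage sketch (assumption, not from the original source): binding the positional-encoding
# block to a real sequence input. Each step of the output carries a geometric progression of
# discount_factor along the sequence, and the stop_gradient wrapper keeps it out of training.
feats = C.sequence.input_variable(16)
pos_signal = positional_encoding(16)(feats)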
def func(x_var):
    x = C.placeholder()
    transform_gate = C.sigmoid(C.times(x, WT, name=name + '_T') + bT)
    update = C.relu(C.times(x, WU, name=name + '_U') + bU)
    return C.as_block(
        x + transform_gate * (update - x),  # trans(x)*u(x)+(1-f(x))*x
        [(x, x_var)],
        'HighwayBlock',
        'HighwayBlock' + name)
def _func(x):
    ph = C.placeholder()
    first_out = encoder(ph)
    second_out, third_out = bilm(first_out).outputs # [#,*][1024]
    dup_first_out = C.splice(first_out, first_out) #[#,*][1024]
    s = C.softmax(scales)
    out = gamma*(s[0]*dup_first_out+s[1]*second_out+s[2]*third_out)
    return C.as_block(
        out, [(ph, x)], 'Elmo', 'Elmo'
    )
def reasoning_layer(self, inputs):
    input_ph = C.placeholder()
    rnn = create_birnn(GRU(self.hidden_dim), GRU(self.hidden_dim), 'reasoning_gru')
    block = Sequential([
        LayerNormalization(name='layerbn'),
        Dropout(self.dropout),
        rnn
    ])
    res = block(input_ph)
    return C.as_block(res, [(input_ph, inputs)], 'reasoning layer', 'reasoning layer')
def input_layer(self, cgw, cnw, cc, qgw, qnw, qc):
    cgw_ph = C.placeholder()
    cnw_ph = C.placeholder()
    cc_ph = C.placeholder()
    qgw_ph = C.placeholder()
    qnw_ph = C.placeholder()
    qc_ph = C.placeholder()

    input_chars = C.placeholder(shape=(1, self.word_size, self.c_dim))
    input_glove_words = C.placeholder(shape=(self.wg_dim, ))
    input_nonglove_words = C.placeholder(shape=(self.wn_dim, ))

    qce = C.one_hot(qc_ph, num_classes=self.c_dim, sparse_output=self.use_sparse)
    cce = C.one_hot(cc_ph, num_classes=self.c_dim, sparse_output=self.use_sparse)

    word_embed = self.word_glove()(input_glove_words, input_nonglove_words)
    char_embed = self.char_glove()(input_chars)
    embeded = C.splice(word_embed,
                       C.reshape(self.charcnn(char_embed), self.convs),
                       name='splice_embeded')

    self._indrnn_builder._input_size = self.word_emb_dim + self.convs
    ind1 = [self._indrnn_builder.build(), self._indrnn_builder.build()]
    self._indrnn_builder._input_size = 2 * self.hidden_dim
    indrnns = [self._indrnn_builder.build() for _ in range(4)]
    indrnns = ind1 + indrnns

    process = C.layers.For(
        range(3),
        lambda i: C.layers.Sequential([
            C.layers.Dropout(self.dropout),
            (C.layers.Recurrence(indrnns[2 * i]),
             C.layers.Recurrence(indrnns[2 * i + 1], go_backwards=True)),
            C.splice
        ]))

    processed = process(embeded)

    q_processed = processed.clone(
        C.CloneMethod.share, {
            input_chars: qce,
            input_glove_words: qgw_ph,
            input_nonglove_words: qnw_ph
        })
    c_processed = processed.clone(
        C.CloneMethod.share, {
            input_chars: cce,
            input_glove_words: cgw_ph,
            input_nonglove_words: cnw_ph
        })

    return C.as_block(C.combine([c_processed, q_processed]),
                      [(cgw_ph, cgw), (cnw_ph, cnw), (cc_ph, cc), (qgw_ph, qgw),
                       (qnw_ph, qnw), (qc_ph, qc)], 'input_layer', 'input_layer')
def weighted_sum(self, inputs):
    input_ph = C.placeholder()
    weight = Sequential([
        BatchNormalization(),
        Dropout(self.dropout),
        Dense(self.hidden_dim, activation=C.tanh),
        Dense(1, bias=False),
        C.sequence.softmax
    ])(input_ph) # [#,c][1]
    res = C.sequence.reduce_sum(weight * input_ph)
    return C.as_block(C.combine(res, weight), [(input_ph, inputs)], 'weighted sum', 'weighted sum')
def convert(root_func, filter, converter):
    '''
    Clones the graph underlying root_func and in the clone substitutes
    all Functions obtained by applying 'filter', with a new Function obtained by calling the specified 'converter'

    Args:
        root_func: a root function of a graph to be cloned and converted
        filter: a lambda for filtering out the Functions to be converted
        converter: a lambda for obtaining the substitute for each of the Functions to be converted
    Returns:
        Cloned and converted Function (graph)
    '''
    # recursively convert for blocks in root_func
    blocks = C.logging.graph.depth_first_search(root_func, lambda x : type(x) == C.Function and x.root_function.is_block, depth = 0)
    for i in range(len(blocks)):
        # search for blocks again in case block input/output has been modified
        blocks1 = C.logging.graph.depth_first_search(root_func, lambda x : type(x) == C.Function and x.root_function.is_block, depth = 0)
        block = blocks1[i] # assuming depth_first_search order to be stable, so use the old index on new search results
        block_root = C.as_composite(block.block_root)
        new_block_root = convert(block_root, filter, converter)
        if new_block_root != block_root:
            block_arguments_mapping = dict(block.block_arguments_mapping)
            new_block_arguments_mapping = []
            for arg, new_arg in zip(block_root.arguments, new_block_root.arguments):
                new_block_arguments_mapping += [(new_arg, block_arguments_mapping[arg])]
            new_block = C.as_block(new_block_root, new_block_arguments_mapping, block.op_name, block.name)
            if all([x not in root_func.outputs for x in block.outputs]) or all([x in block.outputs for x in root_func.outputs]):
                root_func = root_func.clone(C.CloneMethod.share, dict(zip(block.outputs, new_block.outputs)))
            else:
                new_outputs = [new_block.outputs[block.outputs.index(x)] if x in block.outputs else None for x in root_func.outputs]
                root_func_nonreplaced = C.combine([x for x in root_func.outputs if x not in block.outputs])
                root_func_nonreplaced_clone = root_func_nonreplaced.clone(C.CloneMethod.share, dict(zip(block.outputs, new_block.outputs)))
                idx = 0
                for nonreplaced_output in root_func_nonreplaced_clone.outputs:
                    while new_outputs[idx]:
                        idx += 1
                    new_outputs[idx] = nonreplaced_output
                root_func = C.combine(new_outputs)

    # replace all Function instances under root_func that pass the specified 'filter'
    functions_to_convert = C.logging.graph.depth_first_search(root_func, filter, depth = 0)
    for function_to_convert in functions_to_convert:
        converted = converter(function_to_convert)

        if not function_to_convert.output in root_func.outputs:
            root_func = root_func.clone(C.CloneMethod.share, {function_to_convert.output : converted.output})
        else:
            # if cudnn_rnn output is the root_func output, just use converted as root_func and no clone needed
            if len(root_func.outputs) > 1:
                root_func = C.combine([converted if x == function_to_convert.output else x for x in root_func.outputs])
            else:
                root_func = converted

    return root_func
def attention_layer(self, context, query, layer):
    q_processed = C.placeholder(shape=(2*self.hidden_dim,))
    p_processed = C.placeholder(shape=(2*self.hidden_dim,))

    qvw, qvw_mask = C.sequence.unpack(q_processed, padding_value=0).outputs

    wq = C.parameter(shape=(2*self.hidden_dim, 2*self.hidden_dim), init=C.glorot_uniform())
    wp = C.parameter(shape=(2*self.hidden_dim, 2*self.hidden_dim), init=C.glorot_uniform())
    wg = C.parameter(shape=(8*self.hidden_dim, 8*self.hidden_dim), init=C.glorot_uniform())
    v = C.parameter(shape=(2*self.hidden_dim, 1), init=C.glorot_uniform())

    # seq[tensor[2d]] p_len x 2d
    wpt = C.reshape(C.times(p_processed, wp), (-1, 2*self.hidden_dim))
    # q_len x 2d
    wqt = C.reshape(C.times(qvw, wq), (-1, 2*self.hidden_dim))
    # seq[tensor[q_len]]
    S = C.reshape(C.times(C.tanh(C.sequence.broadcast_as(wqt, p_processed) + wpt), v), (-1))
    qvw_mask_expanded = C.sequence.broadcast_as(qvw_mask, p_processed)
    # seq[tensor[q_len]]
    S = C.element_select(qvw_mask_expanded, S, C.constant(-1e+30))
    # seq[tensor[q_len]]
    A = C.softmax(S, axis=0)
    # seq[tensor[2d]]
    swap_qvw = C.swapaxes(qvw)
    cq = C.reshape(C.reduce_sum(A * C.sequence.broadcast_as(swap_qvw, A), axis=1), (-1))
    # seq[tensor[4d]]
    uc_concat = C.splice(p_processed, cq, p_processed * cq, cq * cq)
    # seq[tensor[4d]]
    gt = C.tanh(C.times(uc_concat, wg))
    # seq[tensor[4d]]
    uc_concat_star = gt * uc_concat
    # seq[tensor[4d]]
    vp = C.layers.Sequential([
        C.layers.Dropout(self.dropout),
        OptimizedRnnStack(self.hidden_dim, bidirectional=True,
                          use_cudnn=self.use_cudnn, name=layer+'_attention_rnn')])(uc_concat_star)

    return C.as_block(
        vp,
        [(p_processed, context), (q_processed, query)],
        'attention_layer',
        'attention_layer')
def wrap_in_block(fun_args, name):
    block_args = [placeholder(name=arg.name) for arg in fun_args]  # placeholders inside the BlockFunction
    combined_block_args = combine(block_args)                      # the content of the BlockFunction
    arg_map = list(zip(block_args, fun_args))                      # after wrapping, the block_args map to args
    combined_args = as_block(composite=combined_block_args,
                             block_arguments_map=arg_map,
                             block_op_name=name)
    return combined_args
def convolution(operand):
    bcv_operand_p = C.placeholder(operand.shape, operand.dynamic_axes, name="operand")
    bcv = C.convolution(
        CustomMultibit(W, 1),
        CustomMultibit(bcv_operand_p, 1),
        auto_padding=[False, pad, pad],
        strides=[strides])
    return C.as_block(bcv, [(bcv_operand_p, operand)], name)
def output_layer(self, query, match_context):
    q_processed = C.placeholder(shape=(2*self.hidden_dim,))
    mat_context = C.placeholder(shape=(2*self.hidden_dim,))

    #output layer
    r_q = question_pooling(q_processed, 2*self.hidden_dim) #shape n*(2*self.hidden_dim)
    p1_logits = attention_weight(mat_context, r_q, 2*self.hidden_dim)
    attention_pool = C.sequence.reduce_sum(p1_logits * mat_context)
    state = C.layers.GRU(2*self.hidden_dim)(attention_pool, r_q)
    p2_logits = attention_weight(mat_context, state, 2*self.hidden_dim)

    @C.Function
    def start_ave_point(p1_logits, p2_logits, point):
        @C.Function
        def start_ave(last, now):
            now = now + last - last
            new_start = now * C.sequence.gather(p2_logits, point)
            point = C.sequence.future_value(point)
            return new_start
        start_logits_ave = C.layers.Recurrence(start_ave)(p1_logits)
        return start_logits_ave

    point = C.sequence.is_first(p1_logits)
    point = C.layers.Sequential([For(range(2), lambda: C.layers.Recurrence(C.plus))])(point)
    point = C.greater(C.constant(16), point)
    start_logits_ave = start_ave_point(p1_logits, p2_logits, point)

    @C.Function
    def end_ave_point(p1_logits, p2_logits, point):
        @C.Function
        def end_ave(last, now):
            now = now + last - last
            new_end = now * C.sequence.gather(p2_logits, point)
            point = C.sequence.past_value(point)
            return new_end
        end_logits_ave = C.layers.Recurrence(end_ave, go_backwards=True)(p2_logits)
        return end_logits_ave

    point = C.sequence.is_last(p1_logits)
    point = C.layers.Sequential([For(range(2), lambda: C.layers.Recurrence(C.plus, go_backwards=True))])(point)
    point = C.greater(C.constant(16),point)
    end_logits_ave = end_ave_point(p1_logits, p2_logits, point)

    start_logits = seq_hardmax(start_logits_ave)
    end_logits = seq_hardmax(end_logits_ave)
    '''
    start_logits = seq_hardmax(p1_logits)
    end_logits = seq_hardmax(p2_logits)
    '''

    return C.as_block(
        C.combine([start_logits, end_logits]),
        [(q_processed, query), (mat_context, match_context)],
        'output_layer',
        'output_layer')
def func(x_var):
    x = C.placeholder()
    WT = C.Parameter((dim,dim,), init=transform_weight_initializer, name=name+'_WT')
    bT = C.Parameter(dim, init=transform_bias_initializer, name=name+'_bT')
    WU = C.Parameter((dim,dim,), init=update_weight_initializer, name=name+'_WU')
    bU = C.Parameter(dim, init=update_bias_initializer, name=name+'_bU')
    transform_gate = C.sigmoid(C.times(x, WT, name=name+'_T') + bT)
    update = C.relu(C.times(x, WU, name=name+'_U') + bU)
    return C.as_block(
        x + transform_gate * (update - x),
        [(x, x_var)],
        'HighwayBlock',
        'HighwayBlock'+name)
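# Usage sketch (names assumed, not part of the original source): this closure expects `dim`,
# the four initializers and `name` to be bound in an enclosing highway-layer factory. With
# those in scope it can be applied directly to a feature vector:
dim, name = 300, '_hw0'
transform_weight_initializer = update_weight_initializer = C.glorot_uniform()
transform_bias_initializer = update_bias_initializer = 0
x_input = C.input_variable(dim)
highway_out = func(x_input)  # t(x) * u(x) + (1 - t(x)) * x, shape (dim,)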
def input_layer(self, cgw, cc, qgw, qc, qnw, cnw):
    cgw_ph = C.placeholder()
    cnw_ph = C.placeholder()
    cc_ph = C.placeholder()
    qgw_ph = C.placeholder()
    qnw_ph = C.placeholder()
    qc_ph = C.placeholder()

    input_chars = C.placeholder(shape=(1, self.word_size, self.c_dim))
    input_glove_words = C.placeholder(shape=(self.wg_dim, ))
    input_nonglove_words = C.placeholder(shape=(self.wn_dim, ))

    embedded = C.splice(C.reshape(self.charcnn(input_chars), self.convs),
                        self.embed()(input_glove_words, input_nonglove_words),
                        name='splice_embed')
    highway = HighwayNetwork(dim=self.elmo_dim + self.hidden_dim + self.convs,
                             highway_layers=self.highway_layers)(embedded)
    highway_drop = C.layers.Dropout(self.dropout)(highway)
    processed = OptimizedRnnStack(self.hidden_dim,
                                  num_layers=1,
                                  bidirectional=True,
                                  use_cudnn=self.use_cudnn,
                                  name='input_rnn')(highway_drop)

    qce = C.one_hot(qc_ph, num_classes=self.c_dim, sparse_output=self.use_sparse)
    cce = C.one_hot(cc_ph, num_classes=self.c_dim, sparse_output=self.use_sparse)

    q_processed = processed.clone(
        C.CloneMethod.share, {
            input_chars: qce,
            input_glove_words: qgw_ph,
            input_nonglove_words: qnw_ph
        })
    c_processed = processed.clone(
        C.CloneMethod.share, {
            input_chars: cce,
            input_glove_words: cgw_ph,
            input_nonglove_words: cnw_ph
        })

    return C.as_block(C.combine([c_processed, q_processed]),
                      [(cgw_ph, cgw), (cc_ph, cc), (qgw_ph, qgw), (qc_ph, qc),
                       (qnw_ph, qnw), (cnw_ph, cnw)], 'input_layer', 'input_layer')
def test_model_one_output_of_multi_output_function():
    input_dim = 2
    proj_dim = 11
    x = input_variable((input_dim,))

    x_placeholder = placeholder_variable()
    w = parameter((input_dim, proj_dim))
    b = parameter((proj_dim,))
    proj = times(x_placeholder, w)
    proj_plus_bias = proj + b
    combined_model = as_block(combine([proj, proj_plus_bias]), [(x_placeholder, x)], 'dense_op')

    labels = input_variable((proj_dim,))
    lr_schedule = learning_rate_schedule(0.003, UnitType.sample)
    ce = cross_entropy_with_softmax(combined_model.outputs[0], labels)
    pe = classification_error(combined_model.outputs[0], labels)
    trainer_multitask = Trainer(combined_model.outputs[0], (ce, pe), sgd(ce.parameters, lr=lr_schedule))
def wrap_in_block(fun_args, name):
    block_args = [placeholder_variable(name=arg.name) for arg in fun_args]  # placeholders inside the BlockFunction
    combined_block_args = combine(block_args)                               # the content of the BlockFunction
    arg_map = list(zip(block_args, fun_args))                               # after wrapping, the block_args map to args
    combined_args = as_block(composite=combined_block_args,
                             block_arguments_map=arg_map,
                             block_op_name=name)
    return combined_args
def create_rpn(conv_out, scaled_gt_boxes, im_info, add_loss_functions=True,
               proposal_layer_param_string=None, conv_bias_init=0.0):
    '''
    Creates a region proposal network for object detection as proposed in the "Faster R-CNN" paper:
        Shaoqing Ren and Kaiming He and Ross Girshick and Jian Sun:
        "Faster R-CNN: Towards Real-Time Object Detection with Region Proposal Networks"

    Outputs object detection proposals by applying estimated bounding-box
    transformations to a set of regular boxes (called "anchors").

    Args:
        conv_out:        The convolutional feature map, i.e. the output of the conv layers from the pretrained classification network
        scaled_gt_boxes: The ground truth boxes as (x1, y1, x2, y2, label). Coordinates are absolute pixels wrt. the input image.
        im_info:         A CNTK variable or constant containing
                         (pad_width, pad_height, scaled_image_width, scaled_image_height, orig_img_width, orig_img_height)
                         e.g. (1000, 1000, 1000, 600, 500, 300) for an original image of 600x300 that is scaled and padded to 1000x1000
        add_loss_functions: If set to True rpn_losses will be returned, otherwise None is returned for the losses
        proposal_layer_param_string: A yaml parameter string that is passed to the proposal layer.

    Returns:
        rpn_rois - the proposed ROIs
        rpn_losses - the losses (SmoothL1 loss for bbox regression plus cross entropy for objectness)
    '''

    # RPN network
    # init = 'normal', initValueScale = 0.01, initBias = 0.1
    num_channels = cfg["CNTK"].RPN_NUM_CHANNELS
    rpn_conv_3x3 = Convolution((3, 3), num_channels, activation=relu, pad=True, strides=1,
                               init = normal(scale=0.01), init_bias=conv_bias_init)(conv_out)
    rpn_cls_score = Convolution((1, 1), 18, activation=None, name="rpn_cls_score",
                                init = normal(scale=0.01), init_bias=conv_bias_init)(rpn_conv_3x3)  # 2(bg/fg)  * 9(anchors)
    rpn_bbox_pred = Convolution((1, 1), 36, activation=None, name="rpn_bbox_pred",
                                init = normal(scale=0.01), init_bias=conv_bias_init)(rpn_conv_3x3)  # 4(coords) * 9(anchors)

    # apply softmax to get (bg, fg) probabilities and reshape predictions back to grid of (18, H, W)
    num_predictions = int(rpn_cls_score.shape[0] / 2)
    rpn_cls_score_rshp = reshape(rpn_cls_score,
                                 (2, num_predictions, rpn_cls_score.shape[1], rpn_cls_score.shape[2]),
                                 name="rpn_cls_score_rshp")
    p_rpn_cls_score_rshp = cntk.placeholder()
    rpn_cls_sm = softmax(p_rpn_cls_score_rshp, axis=0)
    rpn_cls_prob = cntk.as_block(rpn_cls_sm, [(p_rpn_cls_score_rshp, rpn_cls_score_rshp)],
                                 'Softmax', 'rpn_cls_prob')
    rpn_cls_prob_reshape = reshape(rpn_cls_prob, rpn_cls_score.shape, name="rpn_cls_prob_reshape")

    # proposal layer
    rpn_rois_raw = user_function(ProposalLayer(rpn_cls_prob_reshape, rpn_bbox_pred, im_info,
                                               param_str=proposal_layer_param_string))
    rpn_rois = alias(rpn_rois_raw, name='rpn_rois')

    rpn_losses = None
    if(add_loss_functions):
        # RPN targets
        # Comment: rpn_cls_score is only passed   vvv   to get width and height of the conv feature map ...
        atl = user_function(AnchorTargetLayer(rpn_cls_score, scaled_gt_boxes, im_info,
                                              param_str=proposal_layer_param_string))
        rpn_labels = atl.outputs[0]
        rpn_bbox_targets = atl.outputs[1]
        rpn_bbox_inside_weights = atl.outputs[2]

        # classification loss
        p_rpn_labels = cntk.placeholder()
        p_rpn_cls_score_rshp = cntk.placeholder()

        keeps = cntk.greater_equal(p_rpn_labels, 0.0)
        fg_labels = element_times(p_rpn_labels, keeps, name="fg_targets")
        bg_labels = minus(1, fg_labels, name="bg_targets")
        rpn_labels_ignore = splice(bg_labels, fg_labels, axis=0)
        rpn_ce = cross_entropy_with_softmax(p_rpn_cls_score_rshp, rpn_labels_ignore, axis=0)
        rpn_loss_cls = element_times(rpn_ce, keeps)

        # The terms that are accounted for in the cls loss are those that have a label >= 0
        cls_num_terms = reduce_sum(keeps)
        cls_normalization_factor = 1.0 / cls_num_terms
        normalized_rpn_cls_loss = reduce_sum(rpn_loss_cls) * cls_normalization_factor

        reduced_rpn_loss_cls = cntk.as_block(normalized_rpn_cls_loss,
                                             [(p_rpn_labels, rpn_labels), (p_rpn_cls_score_rshp, rpn_cls_score_rshp)],
                                             'CE_with_ignore', 'norm_rpn_cls_loss')

        # regression loss
        p_rpn_bbox_pred = cntk.placeholder()
        p_rpn_bbox_targets = cntk.placeholder()
        p_rpn_bbox_inside_weights = cntk.placeholder()
        rpn_loss_bbox = SmoothL1Loss(cfg["CNTK"].SIGMA_RPN_L1,
                                     p_rpn_bbox_pred, p_rpn_bbox_targets, p_rpn_bbox_inside_weights, 1.0)
        # The bbox loss is normalized by the rpn batch size
        bbox_normalization_factor = 1.0 / cfg["TRAIN"].RPN_BATCHSIZE
        normalized_rpn_bbox_loss = reduce_sum(rpn_loss_bbox) * bbox_normalization_factor

        reduced_rpn_loss_bbox = cntk.as_block(normalized_rpn_bbox_loss,
                                              [(p_rpn_bbox_pred, rpn_bbox_pred), (p_rpn_bbox_targets, rpn_bbox_targets),
                                               (p_rpn_bbox_inside_weights, rpn_bbox_inside_weights)],
                                              'SmoothL1Loss', 'norm_rpn_bbox_loss')

        rpn_losses = plus(reduced_rpn_loss_cls, reduced_rpn_loss_bbox, name="rpn_losses")

    return rpn_rois, rpn_losses
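# Usage sketch (shapes illustrative, not part of the original source): wiring create_rpn into
# a Faster R-CNN training graph. conv_out is the last conv feature map of the base model,
# scaled_gt_boxes holds ground-truth rois as (x1, y1, x2, y2, label), and im_info carries the
# image geometry described in the docstring above.
conv_out = cntk.input_variable((512, 62, 62))
scaled_gt_boxes = cntk.input_variable((50, 5))
im_info = cntk.input_variable((6,))
rpn_rois, rpn_losses = create_rpn(conv_out, scaled_gt_boxes, im_info)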
def convert(root_func, filter, converter):
    '''
    Clones the graph underlying root_func and in the clone substitutes
    all Functions obtained by applying 'filter', with a new Function obtained by calling the specified 'converter'

    Args:
        root_func: a root function of a graph to be cloned and converted
        filter: a lambda for filtering out the Functions to be converted
        converter: a lambda for obtaining the substitute for each of the Functions to be converted
    Returns:
        Cloned and converted Function (graph)
    '''
    # recursively convert for blocks in root_func
    blocks = C.logging.graph.depth_first_search(root_func, lambda x : type(x) == C.Function and x.root_function.is_block, depth = 0)
    for i in range(len(blocks)):
        # search for blocks again in case block input/output has been modified
        blocks1 = C.logging.graph.depth_first_search(root_func, lambda x : type(x) == C.Function and x.root_function.is_block, depth = 0)
        block = blocks1[i] # assuming depth_first_search order to be stable, so use the old index on new search results
        block_root = C.as_composite(block.block_root)
        new_block_root = convert(block_root, filter, converter)
        if new_block_root != block_root:
            block_arguments_mapping = dict(block.block_arguments_mapping)
            new_block_arguments_mapping = []
            for arg, new_arg in zip(block_root.arguments, new_block_root.arguments):
                new_block_arguments_mapping += [(new_arg, block_arguments_mapping[arg])]
            new_block = C.as_block(new_block_root, new_block_arguments_mapping, block.op_name, block.name)
            if all([x not in root_func.outputs for x in block.outputs]) or all([x in block.outputs for x in root_func.outputs]):
                root_func = root_func.clone(C.CloneMethod.share, dict(zip(block.outputs, new_block.outputs)))
            else:
                new_outputs = [new_block.outputs[block.outputs.index(x)] if x in block.outputs else None for x in root_func.outputs]
                root_func_nonreplaced = C.combine([x for x in root_func.outputs if x not in block.outputs])
                root_func_nonreplaced_clone = root_func_nonreplaced.clone(C.CloneMethod.share, dict(zip(block.outputs, new_block.outputs)))
                idx = 0
                for nonreplaced_output in root_func_nonreplaced_clone.outputs:
                    while new_outputs[idx]:
                        idx += 1
                    new_outputs[idx] = nonreplaced_output
                root_func = C.combine(new_outputs)

    # replace all Function instances under root_func that pass the specified 'filter'
    functions_to_convert = C.logging.graph.depth_first_search(root_func, filter, depth = 0)
    for i in range(len(functions_to_convert)):
        # The graph could be modified already by this function, so we need to rescan to the new set.
        functions_to_convert1 = C.logging.graph.depth_first_search(root_func, filter, depth = 0)
        # We are using a filter passed in by the caller. So once a function is converted, we may not
        # get the same number of functions again, so we need to use correct index depending on the new size.
        index = 0
        if len(functions_to_convert) > len(functions_to_convert1):
            assert(len(functions_to_convert) - len(functions_to_convert1) == i) # Only one conversion at a time.
            # index = 0 will work for this case, we are picking the first function from the new list.
        elif len(functions_to_convert) == len(functions_to_convert1):
            index = i # here we pick the current index of the for loop.
        else:
            raise RuntimeError("The conversion adds another possible conversion(s). Stopping infinite conversions.")

        function_to_convert = functions_to_convert1[index]
        converted = converter(function_to_convert)

        if not function_to_convert.output in root_func.outputs:
            root_func = root_func.clone(C.CloneMethod.share, {function_to_convert.output : converted.output})
        else:
            # if cudnn_rnn output is the root_func output, just use converted as root_func and no clone needed
            if len(root_func.outputs) > 1:
                root_func = C.combine([converted if x == function_to_convert.output else x for x in root_func.outputs])
            else:
                root_func = converted

    return root_func
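# Usage sketch (illustrative only, not part of the original source): `filter` picks the nodes
# to rewrite and `converter` builds a drop-in replacement over the same operand. For example,
# swapping every Tanh primitive in a small model for a ReLU over the same input:
model = C.layers.Sequential([C.layers.Dense(8, activation=C.tanh),
                             C.layers.Dense(2)])(C.input_variable(4))
relu_model = convert(model,
                     filter=lambda f: isinstance(f, C.Function) and f.op_name == 'Tanh',
                     converter=lambda f: C.relu(f.inputs[0]))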