def contractive_reward(labels, predictions_and_stop_probabilities):
    """
    Compute the contractive reward loss from the paper
    'ReasoNet: Learning to Stop Reading in Machine Comprehension'.

    Args:
        labels: The labels
        predictions_and_stop_probabilities: A list of tuples; each tuple contains
            the prediction and stop probability of the corresponding step.
    """
    base = None
    avg_rewards = None
    for step in range(len(predictions_and_stop_probabilities)):
        pred = predictions_and_stop_probabilities[step][0]
        stop = predictions_and_stop_probabilities[step][1]
        if base is None:
            base = ops.element_times(pred, stop)
        else:
            base = ops.plus(ops.element_times(pred, stop), base)
    avg_rewards = ops.stop_gradient(sequence.reduce_sum(base * labels))
    base_reward = sequence.broadcast_as(avg_rewards, base, name='base_line')
    # While the learner will minimize the loss by default, we want it to maximize the rewards:
    # maximum rewards => minimal -rewards.
    # So we use (1 - r/b) as the rewards instead of (r/b - 1).
    step_cr = ops.stop_gradient(1 - ops.element_divide(labels, base_reward))
    normalized_contractive_rewards = ops.element_times(base, step_cr)
    rewards = sequence.reduce_sum(normalized_contractive_rewards) + avg_rewards
    return rewards
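# A minimal wiring sketch, not taken from the original ReasoNet example: the candidate
# dimension, the number of reading steps, and the input variables below are illustrative
# assumptions; it only shows how (prediction, stop-probability) pairs are paired up and
# handed to contractive_reward. Assumes a CNTK 2.x environment and the same
# `from cntk import ops, sequence` imports that the function above relies on.
from cntk import ops, sequence

candidate_dim = 100                                                  # hypothetical number of answer candidates
answer = sequence.input_variable(candidate_dim, name='answer')
steps = [(sequence.input_variable(candidate_dim, name='pred_%d' % t),
          sequence.input_variable(1, name='stop_%d' % t))            # scalar stop probability per step
         for t in range(3)]
loss = contractive_reward(answer, steps)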
def fully_connected_layer(input, output_dim, device_id, nonlinearity):
    input_dim = input.shape()[0]
    times_param = parameter(shape=(input_dim, output_dim))
    t = times(input, times_param)
    plus_param = parameter(shape=(output_dim,))
    p = plus(plus_param, t.output())
    return nonlinearity(p.output())
def fully_connected_classifier_net(input, num_output_classes, hidden_layer_dim,
                                   num_hidden_layers, device, nonlinearity):
    classifier_root = fully_connected_layer(input, hidden_layer_dim, device, nonlinearity)
    for i in range(1, num_hidden_layers):
        classifier_root = fully_connected_layer(classifier_root.output(), hidden_layer_dim,
                                                device, nonlinearity)
    output_times_param = parameter(shape=(hidden_layer_dim, num_output_classes))
    output_plus_param = parameter(shape=(num_output_classes,))
    t = times(classifier_root.output(), output_times_param)
    classifier_root = plus(output_plus_param, t.output())
    return classifier_root
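# Usage sketch for the two builders above, assuming the same early CNTK Python API they
# target (method-style .shape()/.output(), with free functions such as input_variable,
# parameter, times, plus and sigmoid in scope); the feature dimension, layer sizes and
# device id below are illustrative only.
features = input_variable(shape=(2,))
netout = fully_connected_classifier_net(features, 2,   # num_output_classes
                                        50,            # hidden_layer_dim
                                        2,             # num_hidden_layers
                                        -1,            # device id (illustrative)
                                        sigmoid)       # nonlinearity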
def plus(cntk_layer, inputs):
    '''
    Set up the plus op with the given parameters

    Args:
        cntk_layer (:class:`~cntk.contrib.crosstalkcaffe.unimodel.cntkmodel.CntkLayersDefinition`):
            the layer definition of the plus op
        inputs (list): a list containing all :class:`~cntk.ops.functions.Function` or
            :class:`~cntk.input`

    Return:
        :func:`~cntk.ops.functions.Function`: the instantiated CNTK plus op
    '''
    sanitize_left = ops.sanitize_input(inputs[0])
    sanitize_right = ops.sanitize_input(inputs[1])
    return ops.plus(sanitize_left, sanitize_right, name=cntk_layer.op_name)
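# Hypothetical usage sketch: only the `op_name` field of the layer definition is read by
# this adapter, so a SimpleNamespace stand-in is enough for illustration; a real
# crosstalkcaffe CntkLayersDefinition carries much more state. Assumes the same
# `from cntk import ops` import used by the adapter itself.
from types import SimpleNamespace
import cntk as C
from cntk import ops

a = C.input_variable((4,), name='a')
b = C.input_variable((4,), name='b')
layer_def = SimpleNamespace(op_name='eltwise_sum')   # hypothetical stand-in
summed = plus(layer_def, [a, b])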
def resnet_basic(layer_input, filter_size, num_filters, strides, prefix):
    """
    Returns a ResNet basic building block
    """
    c1 = conv_bn_relu(layer_input, filter_size, num_filters, strides, name='{}_1'.format(prefix))
    c2 = conv_bn(c1, filter_size, num_filters, strides, name='{}_2'.format(prefix))
    p = plus(c2, layer_input, name='{}_res'.format(prefix))
    return relu(p, name='{}_relu'.format(prefix))
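# A small usage sketch (assumes the conv_bn_relu / conv_bn helpers referenced above are
# in scope, e.g. from the CNTK ResNet example code). Note that the identity shortcut
# requires strides of 1 here, otherwise the element-wise plus with layer_input would not
# be shape-compatible; the shapes and names below are illustrative.
import cntk as C

features = C.input_variable((16, 32, 32), name='features')
block = resnet_basic(features, (3, 3), num_filters=16, strides=(1, 1), prefix='rn1_1')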
def gru_cell(shape, init=glorot_uniform(), name=''):  # (x, h)
    """ GRU cell function
    """
    shape = _as_tuple(shape)

    if len(shape) != 1:
        raise ValueError("gru_cell: shape must be vectors (rank-1 tensors)")

    # determine stacking dimensions
    cell_shape_stacked = shape * 2  # patched dims with stack_axis duplicated 2 times

    # parameters
    Wz = Parameter(cell_shape_stacked, init=init, name='Wz')
    Wr = Parameter(cell_shape_stacked, init=init, name='Wr')
    Wh = Parameter(cell_shape_stacked, init=init, name='Wh')
    Uz = Parameter(_INFERRED + shape, init=init, name='Uz')
    Ur = Parameter(_INFERRED + shape, init=init, name='Ur')
    Uh = Parameter(_INFERRED + shape, init=init, name='Uh')

    def create_s_placeholder():
        # we pass the known dimensions here, which makes dimension inference easier
        return Placeholder(shape=shape, name='S')  # h

    # parameters to model function
    x = Placeholder(name='gru_block_arg')
    prev_status = create_s_placeholder()

    # formula of model function
    Sn_1 = prev_status

    z = sigmoid(times(x, Uz, name='x*Uz') + times(Sn_1, Wz, name='Sprev*Wz'), name='z')
    r = sigmoid(times(x, Ur, name='x*Ur') + times(Sn_1, Wr, name='Sprev*Wr'), name='r')
    h = tanh(times(x, Uh, name='x*Uh') + times(element_times(Sn_1, r, name='Sprev*r'), Wh), name='h')
    s = plus(element_times((1 - z), h, name='(1-z)*h'),
             element_times(z, Sn_1, name='z*SPrev'), name=name)

    apply_x_s = combine([s])
    apply_x_s.create_placeholder = create_s_placeholder
    return apply_x_s
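# A hedged recurrence sketch, an assumption rather than the wiring used by the original
# example: the cell exposes two placeholders, the input ('gru_block_arg') and the
# previous state ('S'); one common CNTK pattern for closing the loop is to bind the
# input first and then substitute the state placeholder with a delayed copy of the
# cell's own output. Embedding size and hidden size are illustrative.
import cntk as C

hidden_dim = 256
x = C.sequence.input_variable(300, name='x')                         # illustrative embedding dimension
cell = gru_cell((hidden_dim,))
x_ph = next(p for p in cell.placeholders if p.name == 'gru_block_arg')
s_ph = next(p for p in cell.placeholders if p.name == 'S')
cell = cell.replace_placeholders({x_ph: x})                          # bind the input sequence
gru = cell.replace_placeholders({s_ph: C.past_value(cell.output)})   # feed back the delayed state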
def resnet_basic_inc(layer_input, filter_size, num_filters, strides, prefix):
    """
    Returns a ResNet basic building block with projection
    Use when there is a change in layer_input/output channels
    """
    ones = np.ones_like(strides)
    c1 = conv_bn_relu(layer_input, filter_size, num_filters, strides, name='{}_1'.format(prefix))
    c2 = conv_bn(c1, filter_size, num_filters, ones, name='{}_2'.format(prefix))
    s = conv_bn(layer_input, ones, num_filters, strides, name='{}_3'.format(prefix))
    p = plus(c2, s, name='{}_res'.format(prefix))
    return relu(p, name='{}_relu'.format(prefix))
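# A stacking sketch, again assuming the conv_bn_relu / conv_bn helpers are in scope:
# when the channel count doubles and the spatial resolution halves, the projection block
# above handles the shape change and identity blocks follow it; all sizes below are
# illustrative.
import cntk as C

stage_input = C.input_variable((16, 32, 32), name='stage_input')
r1 = resnet_basic_inc(stage_input, (3, 3), num_filters=32, strides=(2, 2), prefix='rn2_1')
r2 = resnet_basic(r1, (3, 3), num_filters=32, strides=(1, 1), prefix='rn2_2')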
def resnet_classifer(input, num_classes, device, output_name):
    conv_w_scale = 7.07
    conv_b_value = 0

    fc1_w_scale = 0.4
    fc1_b_value = 0

    sc_value = 1
    bn_time_const = 4096

    kernel_width = 3
    kernel_height = 3

    conv1_w_scale = 0.26
    c_map1 = 16

    conv1 = conv_bn_relu_layer(input, c_map1, kernel_width, kernel_height, 1, 1,
                               conv1_w_scale, conv_b_value, sc_value, bn_time_const, device)
    rn1_1 = resnet_node2(conv1.output(), c_map1, kernel_width, kernel_height,
                         conv1_w_scale, conv_b_value, sc_value, bn_time_const, device)
    rn1_2 = resnet_node2(rn1_1.output(), c_map1, kernel_width, kernel_height,
                         conv1_w_scale, conv_b_value, sc_value, bn_time_const, device)
    rn1_3 = resnet_node2(rn1_2.output(), c_map1, kernel_width, kernel_height,
                         conv1_w_scale, conv_b_value, sc_value, bn_time_const, device)

    c_map2 = 32
    rn2_1_wProj = get_projection_map(c_map2, c_map1, device)
    rn2_1 = resnet_node2_inc(rn1_3.output(), c_map2, kernel_width, kernel_height,
                             conv1_w_scale, conv_b_value, sc_value, bn_time_const,
                             rn2_1_wProj, device)
    rn2_2 = resnet_node2(rn2_1.output(), c_map2, kernel_width, kernel_height,
                         conv1_w_scale, conv_b_value, sc_value, bn_time_const, device)
    rn2_3 = resnet_node2(rn2_2.output(), c_map2, kernel_width, kernel_height,
                         conv1_w_scale, conv_b_value, sc_value, bn_time_const, device)

    c_map3 = 64
    rn3_1_wProj = get_projection_map(c_map3, c_map2, device)
    rn3_1 = resnet_node2_inc(rn2_3.output(), c_map3, kernel_width, kernel_height,
                             conv1_w_scale, conv_b_value, sc_value, bn_time_const,
                             rn3_1_wProj, device)
    rn3_2 = resnet_node2(rn3_1.output(), c_map3, kernel_width, kernel_height,
                         conv1_w_scale, conv_b_value, sc_value, bn_time_const, device)
    rn3_3 = resnet_node2(rn3_2.output(), c_map3, kernel_width, kernel_height,
                         conv1_w_scale, conv_b_value, sc_value, bn_time_const, device)

    # Global average pooling
    poolw = 8
    poolh = 8
    poolh_stride = 1
    poolv_stride = 1
    pool = pooling(rn3_3.output(), AVG_POOLING, (1, poolh, poolw), (1, poolv_stride, poolh_stride))

    out_times_params = parameter(shape=(c_map3, 1, 1, num_classes), device_id=device)
    out_bias_params = parameter(shape=(num_classes,), device_id=device)
    t = times(pool.output(), out_times_params)
    return plus(t.output(), out_bias_params, output_name)
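# A hedged usage sketch, assuming the same early CNTK Python API as the helpers this
# builder calls (conv_bn_relu_layer, resnet_node2, get_projection_map, ...) and a
# CIFAR-sized 32x32 RGB input; `dev` stands for a previously created device handle and
# is hypothetical here.
image_input = input_variable(shape=(3, 32, 32))
classifier_output = resnet_classifer(image_input, 10, dev, 'classifierOutput')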
def create_model():
    # Source and target inputs to the model
    batch_axis = Axis.default_batch_axis()
    input_seq_axis = Axis('inputAxis')
    label_seq_axis = Axis('labelAxis')

    input_dynamic_axes = [batch_axis, input_seq_axis]
    raw_input = input_variable(shape=(input_vocab_dim), dynamic_axes=input_dynamic_axes, name='raw_input')

    label_dynamic_axes = [batch_axis, label_seq_axis]
    raw_labels = input_variable(shape=(label_vocab_dim), dynamic_axes=label_dynamic_axes, name='raw_labels')

    # Instantiate the sequence to sequence translation model
    input_sequence = raw_input

    # Drop the sentence start token from the label, for decoder training
    label_sequence = sequence.slice(raw_labels, 1, 0, name='label_sequence')  # <s> A B C </s> --> A B C </s>
    label_sentence_start = sequence.first(raw_labels)  # <s>

    # Setup primer for decoder
    is_first_label = sequence.is_first(label_sequence)  # 1 0 0 0 ...
    label_sentence_start_scattered = sequence.scatter(label_sentence_start, is_first_label)

    # Encoder
    stabilize = Stabilizer()
    encoder_output_h = stabilize(input_sequence)
    for i in range(0, num_layers):
        (encoder_output_h, encoder_output_c) = LSTM_layer(
            encoder_output_h.output, hidden_dim, future_value, future_value)

    # Prepare encoder output to be used in decoder
    thought_vector_h = sequence.first(encoder_output_h)
    thought_vector_c = sequence.first(encoder_output_c)

    thought_vector_broadcast_h = sequence.broadcast_as(thought_vector_h, label_sequence)
    thought_vector_broadcast_c = sequence.broadcast_as(thought_vector_c, label_sequence)

    # Decoder
    decoder_history_hook = alias(label_sequence, name='decoder_history_hook')  # copy label_sequence

    decoder_input = element_select(is_first_label, label_sentence_start_scattered,
                                   past_value(decoder_history_hook))

    decoder_output_h = stabilize(decoder_input)
    for i in range(0, num_layers):
        if (i > 0):
            recurrence_hook_h = past_value
            recurrence_hook_c = past_value
        else:
            recurrence_hook_h = lambda operand: element_select(
                is_first_label, thought_vector_broadcast_h, past_value(operand))
            recurrence_hook_c = lambda operand: element_select(
                is_first_label, thought_vector_broadcast_c, past_value(operand))

        (decoder_output_h, decoder_output_c) = LSTM_layer(
            decoder_output_h.output, hidden_dim, recurrence_hook_h, recurrence_hook_c)

    # Linear output layer
    W = parameter(shape=(decoder_output_h.shape[0], label_vocab_dim), init=glorot_uniform())
    B = parameter(shape=(label_vocab_dim), init=0)
    z = plus(B, times(stabilize(decoder_output_h), W))

    return z
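# A hedged training-wiring sketch, not part of the original function: the shifted
# decoder targets are exposed inside the returned graph under the name 'label_sequence',
# so they can be recovered with find_by_name; the criterion choice below is illustrative
# and assumes the same CNTK names (cross_entropy_with_softmax, classification_error)
# are imported as in the snippet above.
model = create_model()
label_sequence = model.find_by_name('label_sequence')
ce = cross_entropy_with_softmax(model, label_sequence)
errs = classification_error(model, label_sequence)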
        else:
            recurrence_hook_h = lambda operand: element_select(
                is_first_label, thought_vector_broadcast_h, past_value(operand))
            recurrence_hook_c = lambda operand: element_select(
                is_first_label, thought_vector_broadcast_c, past_value(operand))

        (decoder_output_h, decoder_output_c) = LSTM_layer(
            decoder_output_h.output, hidden_dim, recurrence_hook_h, recurrence_hook_c)

    # 1.
    # Add the linear layer
    W = parameter(shape=(decoder_output_h.shape[0], label_vocab_dim), init=glorot_uniform())
    B = parameter(shape=(label_vocab_dim), init=0)
    z = plus(B, times(decoder_output_h, W))


def create_model():
    # Source and target inputs to the model
    batch_axis = Axis.default_batch_axis()
    input_seq_axis = Axis('inputAxis')
    label_seq_axis = Axis('labelAxis')

    input_dynamic_axes = [batch_axis, input_seq_axis]
    raw_input = input_variable(shape=(input_vocab_dim), dynamic_axes=input_dynamic_axes, name='raw_input')

    label_dynamic_axes = [batch_axis, label_seq_axis]