Пример #1
0
def test_slice_with_inferred_static_axis():
    """Check that splicing along axis 0 keeps inferred static dimensions.

    An input whose two leading dimensions are inferred is spliced with a
    zero constant whose middle dimension is inferred, once in each argument
    order. The resulting shape is expected to be (-1, -1, 3) both times.
    """
    free_dim = C.InferredDimension
    x = C.input_variable(shape=(free_dim, free_dim, 3))
    pad_dims = (3, free_dim, 3)

    # Padding first, input second.
    padded_front = C.splice(C.constant(value=0, shape=pad_dims), x, axis=0)
    assert padded_front.shape == (-1, -1, 3)

    # Input first, padding second.
    padded_back = C.splice(x, C.constant(value=0, shape=pad_dims), axis=0)
    assert padded_back.shape == (-1, -1, 3)
Пример #2
0
def test_udf_input_values_no_sharing():
    """Evaluate a user function feeding a thrice-doubled splice via grad().

    The user function output plus a scalar parameter is spliced with itself
    three times (1 -> 2 -> 4 -> 8 elements). The forward result returned by
    ``grad`` is expected to be eight copies of the value 8.
    """
    i = C.input_variable(1, needs_gradient=True, name='i_var')
    udf_out = C.user_function(MyArgumentPreservingPlus(i + 1, i + 2))

    w = C.parameter(shape=(1,), init=1)
    stacked = udf_out + w
    # Each pass splices the current node with itself along axis 0.
    for _ in range(3):
        stacked = C.splice(stacked, stacked, axis=0)

    feed = {i: np.asarray([2], dtype=np.float32)}
    # Only the forward result is checked; the gradient values are ignored.
    _, result = stacked.grad(feed, outputs=[stacked], wrt=[w, i])
    assert np.array_equal(result, [[8, 8, 8, 8, 8, 8, 8, 8]])
Пример #3
0
 def lstm_with_attention(dh, dc, x):
     """Recurrent step that appends an attention vector to the input.

     ``encode_out``, ``attention_model`` and ``rec_block`` come from the
     enclosing scope. The attention result computed from the encoder's first
     output and the previous hidden state ``dh`` is spliced onto ``x`` before
     the recurrent block is applied.
     """
     # Debug output: print the owner of the encoder's first output.
     print(encode_out.outputs[0].owner)
     # encoder hidden state, decoder hidden state
     attended = attention_model(encode_out.outputs[0], dh)
     augmented = C.splice(x, attended)
     return rec_block(dh, dc, augmented)
Пример #4
0
def test_op_splice(input_data1, input_data2, axis, expected_result, device_id, precision):
    """Run the forward and backward checks for splice on two inputs.

    Both inputs are wrapped in one extra list before being handed to ``I``;
    the expected values passed to ``unittest_helper`` are wrapped in two.
    """
    a = I([input_data1])
    b = I([input_data2])

    # splice using the operator
    result = C.splice((a, b), axis)

    # Forward pass test
    unittest_helper(result, None, [[expected_result]], device_id=device_id,
                    precision=precision, clean_up=True, backward_pass=False)

    # Backward pass test: the gradient of the splice operator is all ones in
    # the shape of the respective input.
    ones_a = np.ones_like(np.asarray(input_data1))
    ones_b = np.ones_like(np.asarray(input_data2))
    for node, expected_gradient in ((a, ones_a), (b, ones_b)):
        unittest_helper(result, None, [[expected_gradient]], device_id=device_id,
                        precision=precision, clean_up=True, backward_pass=True,
                        input_node=node)
Пример #5
0
    def test_splice(shape1, shape2):
        """Build ``splice(a, b)`` for the given input shapes and verify it.

        ``input_data1``, ``input_data2``, ``axis``, ``expected_result``,
        ``precision`` and ``device_id`` are taken from the enclosing scope.
        """
        def make_input(shape, name):
            # Gradient-carrying input variable in the precision under test.
            return C.input_variable(shape=shape,
                                    dtype=sanitize_dtype_cntk(PRECISION_TO_TYPE[precision]),
                                    needs_gradient=True,
                                    name=name)

        a = make_input(shape1, 'a')
        b = make_input(shape2, 'b')

        # create batch
        # NOTE: this reshapes the enclosing scope's arrays in place, so every
        # call prepends one more leading axis of length 1.
        input_data1.shape = (1,) + input_data1.shape
        input_data2.shape = (1,) + input_data2.shape

        # splice using the operator
        root_op = C.splice(a, b, axis=axis, name='splice_ab')

        # The gradient of the splice operator is all ones in the shape of the
        # input.
        expected_backward = {
            a: np.ones_like(np.asarray(input_data1)),
            b: np.ones_like(np.asarray(input_data2)),
        }

        unittest_helper(root_op,
                        {a: input_data1, b: input_data2},
                        [expected_result],
                        expected_backward,
                        device_id=device_id, precision=precision)
Пример #6
0
def create_model(input_dim):
    """Assemble the row/column model and return its inputs, labels and output.

    Row and column sequences are embedded separately, spliced along the last
    axis, run through ``opt.layer`` lightlstm layers, split back into two
    halves of width ``opt.nhid`` and projected to ``input_dim`` each.

    Returns a dict with keys 'row', 'col', 'row_label', 'col_label', 'model'.
    """
    def embed_branch(tokens):
        # Embedding -> Stabilizer -> Dropout, built fresh for each branch.
        return Sequential([Embedding(opt.embed), Stabilizer(), Dropout(opt.dropout)])(tokens)

    def predict_branch(hidden):
        # Dropout -> Dense projection back to the input dimension.
        return Sequential([Dropout(opt.dropout), Dense(input_dim)])(hidden)

    row = sequence.input_variable(shape=input_dim)
    col = sequence.input_variable(shape=input_dim)

    # Arguments are evaluated left to right: row branch first, then column.
    features = C.splice(embed_branch(row), embed_branch(col), axis=-1)
    features = lightlstm(opt.embed, opt.nhid)(features)
    features = For(range(opt.layer-1), lambda: lightlstm(opt.nhid, opt.nhid))(features)

    # First half of the last axis belongs to the row, second half to the column.
    row_hidden = C.slice(features, -1, opt.nhid * 0, opt.nhid * 1)
    col_hidden = C.slice(features, -1, opt.nhid * 1, opt.nhid * 2)

    row_predict = predict_branch(row_hidden)
    col_predict = predict_branch(col_hidden)

    # variable : row label and col label
    row_label = sequence.input_variable(shape=input_dim)
    col_label = sequence.input_variable(shape=input_dim)

    return {
        'row': row,
        'col': col,
        'row_label': row_label,
        'col_label': col_label,
        'model': C.combine([row_predict, col_predict]),
    }
Пример #7
0
    def input_layer(self,cgw,cnw,cc,qgw,qnw,qc):
        """Build the 'input_layer' block producing processed context and query.

        A single embedding + recurrent pipeline is defined on inner
        placeholders (characters, glove words, non-glove words) and then
        cloned twice with ``CloneMethod.share``: once wired to the query
        placeholders and once to the context placeholders.

        Args:
            cgw, cnw, cc: context glove-word, non-glove-word and character
                inputs (naming presumed from the placeholder names).
            qgw, qnw, qc: the corresponding query inputs.

        Returns:
            A block wrapping ``combine([c_processed, q_processed])``, i.e.
            the context output first and the query output second.
        """
        # Outer placeholders; as_block binds them to the arguments at the end.
        cgw_ph = C.placeholder()
        cnw_ph = C.placeholder()
        cc_ph  = C.placeholder()
        qgw_ph = C.placeholder()
        qnw_ph = C.placeholder()
        qc_ph  = C.placeholder()

        # Inner placeholders the shared pipeline is defined on; they are
        # substituted per side by the clone calls below.
        input_chars = C.placeholder(shape=(1,self.word_size,self.c_dim))
        input_glove_words = C.placeholder(shape=(self.wg_dim,))
        input_nonglove_words = C.placeholder(shape=(self.wn_dim,))

        # we need to reshape because GlobalMaxPooling/reduce_max is retaining a trailing singleton dimension
        # todo GlobalPooling/reduce_max should have a keepdims default to False
        embedded = C.splice(
            C.reshape(self.charcnn(input_chars), self.convs),
            self.embed()(input_glove_words, input_nonglove_words), name='splice_embed')
        # Two stacked bidirectional OptimizedRnnStack layers over the embedding.
        processed = C.layers.Sequential([For(range(2), lambda: OptimizedRnnStack(self.hidden_dim, bidirectional=True, use_cudnn=self.use_cudnn, name='input_rnn'))])(embedded)
        
        # Character ids are one-hot encoded before entering the char CNN.
        qce = C.one_hot(qc_ph, num_classes=self.c_dim, sparse_output=self.use_sparse)
        cce = C.one_hot(cc_ph, num_classes=self.c_dim, sparse_output=self.use_sparse)
        
        # Same pipeline, once per side, with the inner placeholders replaced.
        q_processed = processed.clone(C.CloneMethod.share, {input_chars:qce, input_glove_words:qgw_ph, input_nonglove_words:qnw_ph})
        c_processed = processed.clone(C.CloneMethod.share, {input_chars:cce, input_glove_words:cgw_ph, input_nonglove_words:cnw_ph})
        return C.as_block(
            C.combine([c_processed, q_processed]),
            [(cgw_ph, cgw),(cnw_ph, cnw),(cc_ph, cc),(qgw_ph, qgw),(qnw_ph, qnw),(qc_ph, qc)],
            'input_layer',
            'input_layer')
Пример #8
0
def test_sequence_step_function_scalar_shape_inferrence():
    """Compare an LSTM over separate sequences with a reset-gated LSTM.

    The same LSTM cell is evaluated (a) on three separate sequences and
    (b) on their concatenation, where an extra 0/1 input multiplies the
    previous state so that a 0 resets it. Both outputs must match.
    """
    hidden_dim, in_dim = 3, 5
    x = C.sequence.input_variable((in_dim,))
    r = C.sequence.input_variable((1,))  # value of 0/1. 0 means reset
    # Recurrence only takes 1 input, so concatenate the two
    merged_x = C.splice(x, r)
    cell = C.layers.LSTM(hidden_dim)  # (dh, dc, x) -> (h, c)
    y = C.layers.Recurrence(cell)(x)

    @C.Function
    def lstm_with_reset(dh, dc, xr):
        # The last element of xr is the reset flag, the rest is the input.
        xx = xr[0:-1]
        rr = xr[-1]
        return cell(rr * dh, rr * dc, xx)

    yr = C.layers.Recurrence(lstm_with_reset)(merged_x)

    seq_len = [2, 3, 5]
    total_len = np.sum(seq_len)
    boundaries = np.cumsum(seq_len)
    inner_boundaries = boundaries[0:-1]

    x_total_data = np.random.rand(1, total_len, in_dim).astype(np.float32)
    x_data = [np.squeeze(chunk)
              for chunk in np.split(x_total_data, inner_boundaries, axis=1)]

    # Ones everywhere except at the inner sequence boundaries.
    r_data = np.ones(boundaries[-1])
    r_data[inner_boundaries] = 0
    r_data = np.reshape(r_data, (-1, 1)).astype(np.float32)

    v1 = y.eval(x_data)
    v2 = yr.eval({x: x_total_data, r: r_data})

    assert np.allclose(np.concatenate(v1), v2[0])
Пример #9
0
def lightlstm(input_dim, cell_dim):
    """Build a recurrent block that runs two LSTM steps per time step.

    The placeholder input ``x`` is split along its last axis into two parts
    of width ``input_dim``. Both parts are pushed through the same LSTM
    weights one after the other; the state after the second step is fed back
    through ``past_value`` as the previous state of the first step.

    Args:
        input_dim: width of each of the two input halves.
        cell_dim: width of the hidden and cell state.

    Returns:
        The splice (last axis) of the hidden outputs of both steps.
    """
    x = C.placeholder(name='x')
    dh = C.placeholder(name='dh')
    dc = C.placeholder(name='dc')
    # First and second half of the input along the last axis.
    x1 = C.slice(x, -1, input_dim * 0, input_dim * 1)
    x2 = C.slice(x, -1, input_dim * 1, input_dim * 2)

    def LSTMCell(x, y, dh, dc):
        '''Apply one set of LSTM weights to x, then to y.

        Returns (ht, ct, ht2, ct2): hidden/cell state after the first step
        and after the second step.
        '''

        # One bias and two weight matrices, shared by both steps below.
        # NOTE: shape=(4 * cell_dim) is a plain int, not a 1-tuple.
        b = C.parameter(shape=(4 * cell_dim), init=0)
        W = C.parameter(shape=(input_dim, 4 * cell_dim), init=glorot_uniform())
        H = C.parameter(shape=(cell_dim, 4 * cell_dim), init=glorot_uniform())

        # projected contribution from input x, hidden, and bias
        proj4 = b + C.times(x, W) + C.times(dh, H)

        # Four equal slices of the projection, one per gate/candidate.
        it_proj = C.slice(proj4, -1, 0 * cell_dim, 1 * cell_dim)
        bit_proj = C.slice(proj4, -1, 1 * cell_dim, 2 * cell_dim)
        ft_proj = C.slice(proj4, -1, 2 * cell_dim, 3 * cell_dim)
        ot_proj = C.slice(proj4, -1, 3 * cell_dim, 4 * cell_dim)

        it = C.sigmoid(it_proj)  # input gate
        bit = it * C.tanh(bit_proj)

        ft = C.sigmoid(ft_proj)  # forget gate
        bft = ft * dc

        ct = bft + bit
        ot = C.sigmoid(ot_proj)  # output gate
        ht = ot * C.tanh(ct)

        # projected contribution from input y, hidden, and bias
        # (second step: uses ht/ct from the first step as previous state)
        proj4_2 = b + C.times(y, W) + C.times(ht, H)

        it_proj_2 = C.slice(proj4_2, -1, 0 * cell_dim, 1 * cell_dim)
        bit_proj_2 = C.slice(proj4_2, -1, 1 * cell_dim, 2 * cell_dim)
        ft_proj_2 = C.slice(proj4_2, -1, 2 * cell_dim, 3 * cell_dim)
        ot_proj_2 = C.slice(proj4_2, -1, 3 * cell_dim, 4 * cell_dim)

        it_2 = C.sigmoid(it_proj_2)  # input gate
        bit_2 = it_2 * C.tanh(bit_proj_2)

        ft_2 = C.sigmoid(ft_proj_2)  # forget gate
        bft_2 = ft_2 * ct

        ct2 = bft_2 + bit_2
        ot_2 = C.sigmoid(ot_proj_2)  # output gate
        ht2 = ot_2 * C.tanh(ct2)
        return (ht, ct, ht2, ct2)

    Cell = LSTMCell(x1, x2, dh, dc)

    # Previous-step values of the second step's hidden and cell state.
    actualDh = past_value(Cell[2])
    actualDc = past_value(Cell[3])

    # Close the recurrence by substituting the dh/dc placeholders.
    # NOTE(review): the replacement is invoked on Cell[0] only; presumably it
    # reaches Cell[2] as well because both depend on the same placeholders -
    # TODO confirm.
    Cell[0].replace_placeholders(
        {dh: actualDh.output, dc: actualDc.output})
    return C.splice(Cell[0], Cell[2], axis=-1)
Пример #10
0
def create_model():
    """Build an embedding -> bidirectional LSTM -> dense classifier.

    ``emb_dim``, ``hidden_dim`` and ``num_labels`` come from the enclosing
    module. The model is defined on a placeholder and returned unbound.
    """
    x = C.placeholder()
    with C.layers.default_options(initial_state=0.1):
        embedded = C.layers.Embedding(emb_dim, name='embed')(x)
        # The backward-running recurrence is created before the forward one.
        backward = C.layers.Recurrence(C.layers.LSTM(hidden_dim), go_backwards=True)(embedded)
        forward = C.layers.Recurrence(C.layers.LSTM(hidden_dim), go_backwards=False)(embedded)
        # Forward output first, backward output second.
        both_directions = C.splice(forward, backward)
        return C.layers.Dense(num_labels, name='classify')(both_directions)
Пример #11
0
def test_Concat(tmpdir):
    """Verify splice (Concat) along axis 1, first with constants only and
    then with one operand replaced by an input variable.

    ``verify_no_input`` / ``verify_one_input`` are helpers defined elsewhere
    in the file; they receive the model, ``tmpdir`` and a test name.
    """
    # data1 has shape (1, 2, 2).
    data1 = np.asarray([[[1, 2], [4, 5]]], dtype=np.float32)
    x = C.constant(value=data1)
    # data2 has shape (1, 3, 2): one 3x2 matrix behind a leading axis of 1.
    data2 = np.asarray([[[10, 20], 
                         [30, 40], 
                         [50, 60]]],dtype=np.float32)
    y = C.constant(value=data2)

    # Splicing both operands on axis=1 joins the 2-row and 3-row matrices,
    # giving shape (1, 5, 2).
    model = C.splice(x, y, axis=1)

    verify_no_input(model, tmpdir, 'Concat_0')

    # Same splice, but with the first operand fed as an input of the same shape.
    x = C.input_variable(data1.shape)

    model = C.splice(x, y, axis=1)

    verify_one_input(model, data1, tmpdir, 'Concat__1')
Пример #12
0
 def out_func1(att_input, enc_input):
     """Run a gated, attention-matched GRU over ``enc_input`` in both directions.

     ``matching_model``, ``Wg`` and ``att_gru`` come from the enclosing scope.
     Returns the splice of the forward and backward recurrence outputs, named
     "bigru_with_match".
     """
     @C.Function
     def bigru_with_match(dh, x):
         # Attend over att_input with the previous hidden state, append the
         # result to the input, then gate the joined vector element-wise.
         c_att = matching_model(att_input, dh)
         joined = C.splice(x, c_att)
         gated = C.element_times(joined, C.sigmoid(C.times(joined, Wg)))
         return att_gru(dh, gated)

     forward_pass = C.layers.Recurrence(bigru_with_match)(enc_input)
     backward_pass = C.layers.Recurrence(bigru_with_match, go_backwards=True)(enc_input)
     return C.splice(forward_pass, backward_pass, name="bigru_with_match")
Пример #13
0
def test_clone_with_slice():
    """Clone a splice+convolution graph onto placeholders and re-apply it.

    The original graph sees two (2,2) inputs and yields shape (4,2); the
    clone applied to two (2,1) inputs is expected to yield shape (4,1).
    """
    i1 = C.input_variable((2,2), name='i1')
    i2 = C.input_variable((2,2), name='i2')
    stacked = C.splice(i1, i2, axis=0)
    kernel = C.constant(1, (4,1), name='W')
    y = C.convolution(kernel, stacked)
    assert y.shape == (4, 2)

    from ..functions import CloneMethod
    x1 = C.input_variable((2,1), name='x1')
    x2 = C.input_variable((2,1), name='x2')
    p1 = C.placeholder()
    p2 = C.placeholder()
    # Swap both inputs for placeholders, then bind the narrower inputs.
    y_cloned = y.clone('clone', {i1: p1, i2: p2})
    rebound = y_cloned(x1, x2)
    assert rebound.shape == (4, 1)
Пример #14
0
def cntk_baseline_lstm():
    """Build a bidirectional LSTM in CNTK and save it through crosstalk.

    ``in_dim``, ``dim``, ``data_cntk``, ``workdir`` and ``cstk`` come from the
    enclosing module. The spliced forward/backward output is watched under
    the names 'birnn' (as RNN parameters) and 'birnn_out', both are fetched
    with ``save=True``, and the crosstalk instance is reset afterwards.
    """
    import cntk as C
    import cntk.contrib.crosstalk.crosstalk_cntk as crct
    ci = crct.instance
    input_var = C.sequence.input_variable(shape=(in_dim))

    forward = C.layers.Recurrence(
        C.layers.LSTM(dim, init_bias=C.glorot_uniform()))(input_var)
    backward = C.layers.Recurrence(
        C.layers.LSTM(dim), go_backwards=True)(input_var)
    fwbw = C.splice(forward, backward)

    rnn_attr = cstk.RnnAttr(bidirectional=True, op_type='lstm',
                            input_dim=in_dim, hidden_dim=dim, forget_bias=0)
    ci.watch(fwbw, 'birnn', var_type=cstk.RnnAttr, attr=rnn_attr)
    ci.watch(fwbw, 'birnn_out')

    ci.set_data({input_var: data_cntk})
    ci.set_workdir(workdir)
    for watched_name in ('birnn', 'birnn_out'):
        ci.fetch(watched_name, save=True)
    ci.reset()
Пример #15
0
def test_op_splice(input_data1, input_data2, axis, expected_result, device_id, precision):
    """Check forward values and gradients of splice on two inputs."""
    # FIXME This test currently fails in C++ with
    # RuntimeError: Node 'splice_ab' (RowStack operation): Attempted to
    # type-cast node to struct Microsoft::MSR::CNTK::INumInputs, which is not
    # possible.

    def as_input(data, name):
        # Gradient-carrying input matching the data's shape and precision.
        return I(
            shape=data.shape,
            data_type=sanitize_dtype_cntk(PRECISION_TO_TYPE[precision]),
            needs_gradient=True,
            name=name,
        )

    input_data1 = AA(input_data1, dtype=PRECISION_TO_TYPE[precision])
    input_data2 = AA(input_data2, dtype=PRECISION_TO_TYPE[precision])
    a = as_input(input_data1, "a")
    b = as_input(input_data2, "b")

    # create batch: two leading axes of length 1 are prepended in place
    input_data1.shape = (1, 1) + input_data1.shape
    input_data2.shape = (1, 1) + input_data2.shape

    # splice using the operator
    root_op = C.splice((a, b), axis, name="splice_ab")

    # The gradient of the splice operator is all ones in the shape of the input
    expected_backward = {
        a: np.ones_like(np.asarray(input_data1)),
        b: np.ones_like(np.asarray(input_data2)),
    }

    unittest_helper(
        root_op,
        {a: input_data1, b: input_data2},
        [[expected_result]],
        expected_backward,
        device_id=device_id,
        precision=precision,
    )
Пример #16
0
    def gate_attention_layer(self, inputs, memory, common_len, att_kind='simi'):
        """Apply attention over ``memory`` and gate the concatenated result.

        Args:
            inputs: the attending sequence.
            memory: the sequence attended over.
            common_len: slice length applied to both the inputs and the
                attention output; also sets the gate's Dense width
                (``2 * common_len``).
            att_kind: 'dot' selects ``self.dot_attention``; any other value
                selects ``self.simi_attention``.

        Returns:
            A tuple ``(gated, attn_weight)``.
        """
        # [#,c][2*d] [#,c][*=q,1]
        if att_kind == 'dot':
            attention = self.dot_attention(inputs, memory, common_len)
        else:
            attention = self.simi_attention(inputs, memory)
        qc_attn, attn_weight = attention.outputs

        trimmed_inputs = inputs[:common_len]
        trimmed_attn = qc_attn[:common_len]
        cont_attn = C.splice(trimmed_inputs, trimmed_attn)  # [#,c][4*d]

        gate_fn = (Dropout(self.dropout)
                   >> Dense(2 * common_len, activation=C.sigmoid, input_rank=1)
                   >> Label('gate'))
        gate = gate_fn(cont_attn)  # [#, c][4*d]
        return gate * cont_attn, attn_weight
    def greedy_model(aawk, aawn, qqwk, qqwn):
        """Greedily unfold an output sequence starting from the answer's first step.

        ``myConfig``, ``embed_layer``, ``s2smodel``, ``UnfoldFrom`` and
        ``sentence_end_index`` come from the enclosing scope.
        """
        answer_onehot = C.splice(aawk, aawn)
        # First element of the answer sequence seeds the unfolding.
        sentence_start = C.sequence.slice(answer_onehot, 0, 1)

        @C.Function
        def process_history(hist, inp):
            # Split the history into its first wg_dim entries and the rest.
            wk = C.slice(hist, 0, 0, myConfig['wg_dim'])
            wn = hist[myConfig['wg_dim']:]
            out_logits = s2smodel(embed_layer(wk, wn), inp)
            # Hardmax of the logits, flattened to a vector.
            return C.reshape(C.hardmax(out_logits), (-1, ))

        q_processed = embed_layer(qqwk, qqwn)
        unfold = UnfoldFrom(
            lambda history: process_history(history, q_processed),
            until_predicate=lambda w: w[..., sentence_end_index],
            length_increase=1.5)
        return unfold(sentence_start, q_processed)
Пример #18
0
    def multi_head_attention(self, contextQ, contextV, contextK, name):
        """Build a six-head attention block with a residual connection to Q.

        Six ``scale_dot_product_attention_block`` instances (named '0'..'5')
        are applied to the same Q/V/K placeholders, spliced together and
        added to Q. The result is wrapped in a block named
        'multi_head_attention_layer' + ``name``.
        """
        def new_placeholder():
            return C.placeholder(shape=(2*self.hidden_dim,),
                                 dynamic_axes=[self.b_axis, self.q_axis])

        Q = new_placeholder()
        V = new_placeholder()
        K = new_placeholder()

        heads = [self.scale_dot_product_attention_block(Q, V, K, str(head_idx))
                 for head_idx in range(6)]

        att_residual = C.splice(*heads) + Q

        block_name = 'multi_head_attention_layer' + name
        return C.as_block(
            att_residual,
            [(Q, contextQ), (V, contextV), (K, contextK)],
            block_name,
            block_name)
Пример #19
0
    def attention_layer(self, context, query):
        """Build the bidirectional (context<->query) attention block.

        Args:
            context: processed context; bound to the ``c_processed`` placeholder.
            query: processed query; bound to the ``q_processed`` placeholder.

        Returns:
            A block named 'attention_layer' whose output is the splice of
            ``c_processed``, ``c2q``, ``c_processed * c2q`` and
            ``c_processed * q2c``.
        """
        q_processed = C.placeholder(shape=(2*self.hidden_dim,))
        c_processed = C.placeholder(shape=(2*self.hidden_dim,)) 
        #convert query's sequence axis to static
        qvw, qvw_mask = C.sequence.unpack(q_processed, padding_value=0).outputs

        # This part deserves some explanation
        # It is the attention layer
        # In the paper they use a 6 * dim dimensional vector
        # here we split it in three parts because the different parts
        # participate in very different operations
        # so W * [h; u; h.* u] becomes w1 * h + w2 * u + w3 * (h.*u)
        ws1 = C.parameter(shape=(2 * self.hidden_dim, 1), init=C.glorot_uniform())
        ws2 = C.parameter(shape=(2 * self.hidden_dim, 1), init=C.glorot_uniform())
        ws3 = C.parameter(shape=(1, 2 * self.hidden_dim), init=C.glorot_uniform())
        att_bias = C.parameter(shape=(), init=0)

        # The three terms of the similarity score: w1*h, w2*u and w3*(h.*u).
        wh = C.times (c_processed, ws1)
        wu = C.reshape(C.times (qvw, ws2), (-1,))
        whu = C.reshape(C.reduce_sum(c_processed * C.sequence.broadcast_as(qvw * ws3, c_processed), axis=1), (-1,))
        S = wh + whu + C.sequence.broadcast_as(wu, c_processed) + att_bias
        # mask out values outside of Query, and fill in gaps with -1e+30 as neutral value for both reduce_log_sum_exp and reduce_max
        qvw_mask_expanded = C.sequence.broadcast_as(qvw_mask, c_processed)
        S = C.element_select(qvw_mask_expanded, S, C.constant(-1e+30))
        # Context-to-query: softmax over the scores, then a weighted sum of
        # the unpacked query along axis 0.
        q_attn = C.reshape(C.softmax(S), (-1,1))
        c2q = C.reshape(C.reduce_sum(C.sequence.broadcast_as(qvw, q_attn) * q_attn, axis=0),(-1))
        
        # Query-to-context: maximum score per context step, then a softmax
        # across the context sequence.
        max_col = C.reduce_max(S)
        c_attn = C.sequence.softmax(max_col)

        # Attention-weighted sum of the context, broadcast back to every step.
        htilde = C.sequence.reduce_sum(c_processed * c_attn)
        q2c = C.sequence.broadcast_as(htilde, c_processed)
        q2c_out = c_processed * q2c

        att_context = C.splice(c_processed, c2q, c_processed * c2q, q2c_out)

        return C.as_block(
            att_context,
            [(c_processed, context), (q_processed, query)],
            'attention_layer',
            'attention_layer')
Пример #20
0
    def bi_sru_layer(self, sru_1, index):
        """Build one bidirectional recurrent layer from a pre-projected input.

        ``sru_1`` is sliced into eight consecutive chunks of width
        ``self.param2``: chunks 0-3 feed the forward direction and chunks 4-7
        the backward direction. Each direction uses two biases taken from
        ``self.list_bias`` at offset ``index * 4``.

        Args:
            sru_1: the projected input, sliced along its first static axis.
            index: layer index; selects the biases and names the forward
                declarations ('f_<index>' / 'b_<index>').

        Returns:
            The splice of the forward and backward outputs.
        """
        # ---- forward direction (chunks 0..3) ----
        f_1_f = C.sigmoid(sru_1[0 * self.param2 : 1 * self.param2] + self.list_bias[0 + index * 4])
        r_1_f = C.sigmoid(sru_1[1 * self.param2 : 2 * self.param2] + self.list_bias[1 + index * 4])
        c_1_f_r = (1 - f_1_f) * sru_1[2 * self.param2: 3 * self.param2]
        # The cell state depends on its own previous value, so it is declared
        # first and resolved once the update expression exists.
        dec_c_1_f = C.layers.ForwardDeclaration('f_' + str(index))
        var_c_1_f = C.sequence.delay(dec_c_1_f, initial_state=0, time_step=1)
        nex_c_1_f = var_c_1_f * f_1_f + c_1_f_r
        dec_c_1_f.resolve_to(nex_c_1_f)
        h_1_f = r_1_f * C.tanh(nex_c_1_f) + (1 - r_1_f) * sru_1[3 * self.param2 : 4 * self.param2]

        # ---- backward direction (chunks 4..7) ----
        f_1_b = C.sigmoid(sru_1[4 * self.param2 : 5 * self.param2] + self.list_bias[2 + index * 4])
        r_1_b = C.sigmoid(sru_1[5 * self.param2 : 6 * self.param2] + self.list_bias[3 + index * 4])
        c_1_b_r = (1 - f_1_b) * sru_1[6 * self.param2 : 7 * self.param2]
        dec_c_1_b = C.layers.ForwardDeclaration('b_' + str(index))
        # time_step=-1 delays in the opposite direction; unlike the forward
        # branch, no initial_state is passed here.
        var_c_1_b = C.sequence.delay(dec_c_1_b, time_step=-1)
        nex_c_1_b = var_c_1_b * f_1_b + c_1_b_r
        dec_c_1_b.resolve_to(nex_c_1_b)
        h_1_b = r_1_b * C.tanh(nex_c_1_b) + (1 - r_1_b) * sru_1[7 * self.param2 : 8 * self.param2]

        x = C.splice(h_1_f, h_1_b)
        return x
Пример #21
0
    def test_splice(shape1, shape2):
        """Splice two gradient-carrying inputs and verify values and gradients.

        Reads ``input_data1``, ``input_data2``, ``axis``, ``expected_result``,
        ``precision`` and ``device_id`` from the enclosing scope.
        """
        def declare(shape, label):
            # Input variable in the precision under test.
            return C.input_variable(
                shape=shape,
                dtype=sanitize_dtype_cntk(PRECISION_TO_TYPE[precision]),
                needs_gradient=True,
                name=label)

        a = declare(shape1, 'a')
        b = declare(shape2, 'b')

        # create batch
        # NOTE: mutates the enclosing scope's arrays in place; each call
        # prepends another leading axis of length 1.
        input_data1.shape = (1, ) + input_data1.shape
        input_data2.shape = (1, ) + input_data2.shape

        # splice using the operator
        root_op = C.splice(a, b, axis=axis, name='splice_ab')

        feeds = {a: input_data1, b: input_data2}

        # The gradient of the splice operator is all ones in the shape of the
        # input.
        gradients = {
            a: np.ones_like(np.asarray(input_data1)),
            b: np.ones_like(np.asarray(input_data2)),
        }

        unittest_helper(root_op,
                        feeds,
                        [expected_result],
                        gradients,
                        device_id=device_id,
                        precision=precision)
Пример #22
0
  def bilateral_slice(im, guide, guide_no_grad):
    """Slice a coefficient grid at guide-dependent locations and apply it to ``im``.

    ``grid``, ``grid_scale``, ``guide_scale``, ``im_scale``, ``grid_sz``,
    ``sigma_r``, ``i_chans``, ``o_chans``, ``sz``, ``xx``, ``yy``, ``cc`` and
    ``grid_coord`` come from the enclosing scope.

    Args:
        im: input image, multiplied by ``im_scale`` before use.
        guide: guide used to compute the interpolation weights.
        guide_no_grad: guide used to compute the integer cell indices
            (presumably a gradient-stopped copy of ``guide`` - TODO confirm).

    Returns:
        The per-output-channel results spliced along axis 0.
    """
    # Flatten data for gather op
    flat_grid = grid_scale*C.reshape(grid, [grid_sz*grid_sz*sigma_r*o_chans*(i_chans+1)])
    # flat_grid_u = C.unpack_batch(flat_grid)

    # Make sure we do sth that requires the gradient w.r.t guide
    scaled_guide = guide_scale*guide  
    gx_d, gy_d, gz_d, fx_d, fy_d, fz_d, _, _, _ = grid_coord(
        scaled_guide, xx, yy, sz, grid_sz, sigma_r)
    # Per-axis weights: distance between (g - 0.5) and f.
    wx = C.abs(gx_d - 0.5 - fx_d)
    wy = C.abs(gy_d - 0.5 - fy_d)
    wz = C.abs(gz_d - 0.5 - fz_d)

    # Enclosing cell
    gx, gy, gz, fx, fy, fz, cx, cy, cz = grid_coord(
        guide_no_grad, xx, yy, sz, grid_sz, sigma_r)

    out_chans = []
    for chan in range(o_chans):
      output_components = []
      # Visit all 2x2x2 combinations of the f*/c* indices; the first choice
      # on an axis is weighted by (1 - w), the second by w.
      for ix, x in enumerate([fx, cx]):
        wx_ = (1-wx) if ix == 0 else wx
        for iy, y in enumerate([fy, cy]):
          wy_ = (1-wy) if iy == 0 else wy
          for iz, z in enumerate([fz, cz]):
            wz_ = (1-wz) if iz == 0 else wz

            # Linear index into the flattened grid for this corner/channel.
            linear_idx = x + grid_sz*(y + grid_sz*(z + sigma_r*(cc + chan*(i_chans+1))))
            flat_linear_idx = C.reshape(linear_idx, [(i_chans+1)*sz*sz])
            # Slice
            interp = C.gather(flat_grid, flat_linear_idx)
            interp_fsz = C.reshape(interp, [i_chans+1, sz, sz])*wx_*wy_*wz_
            output_components.append(interp_fsz)

      # Sum of the eight weighted corner contributions.
      out_coeffs = sum(output_components)
      # First i_chans coefficients multiply the image, the last one is added
      # as an offset; the result is summed over axis 0.
      out_chan = C.reduce_sum(out_coeffs[:i_chans]*(im_scale*im) + out_coeffs[-1], 0)
      out_chans.append(out_chan)
    out = C.splice(*out_chans, axis=0)

    return out
Пример #23
0
 def multiHead(self, context, query, outdim, head=4):
     """Build ``head`` independent dot-attention heads and splice their outputs.

     Every head projects the query and the context through its own bias-free
     ReLU Dense layer of width ``outdim`` (named 'headq_<i>' / 'headc_<i>')
     and feeds them to ``self.dot_attention``. The result is wrapped in a
     block named 'multiHead'.
     """
     cph = C.placeholder()
     qph = C.placeholder()

     def relu_projection(layer_name, source):
         # Bias-free ReLU Dense layer applied to the given placeholder.
         layer = C.layers.Dense(outdim,
                                activation=C.relu,
                                init=xavier(1.377),
                                bias=False,
                                input_rank=1,
                                name=layer_name)
         return layer(source)

     atts = []
     for i in range(head):
         # The query projection is created before the context projection.
         projected_q = relu_projection('headq_{}'.format(i), qph)
         projected_c = relu_projection('headc_{}'.format(i), cph)
         attn, _ = self.dot_attention(projected_c, projected_q, outdim).outputs
         atts.append(attn)

     return C.as_block(C.splice(*atts), [(cph, context), (qph, query)],
                       'multiHead', 'multiHead')
Пример #24
0
def create_model(input, num_classes):
    """Build a three-stage residual network with a fused, upsampled output.

    Each stage output is passed through ``OneByOneConvAndUpSample`` (with
    index 0, 1 and 2 respectively), the three results are spliced along
    axis 0 and reduced to ``num_classes`` maps by a 1x1 sigmoid convolution.
    """
    num_stack_layers = 3
    c_map = [16, 32, 64]

    stem = conv_bn_relu(input, (3,3), c_map[0])
    stage1 = resnet_basic_stack(stem, num_stack_layers, c_map[0])

    stage2 = resnet_basic_stack(resnet_basic_inc(stage1, c_map[1]),
                                num_stack_layers-1, c_map[1])

    stage3 = resnet_basic_stack(resnet_basic_inc(stage2, c_map[2]),
                                num_stack_layers-1, c_map[2])

    up1, up2, up3 = [OneByOneConvAndUpSample(stage, level, num_classes)
                     for level, stage in enumerate((stage1, stage2, stage3))]

    # Splice order is stage 1, stage 3, stage 2 - kept exactly as written.
    merged = C.splice(up1, up3, up2, axis=0)

    fuse = Convolution((1, 1), num_classes, init=he_normal(), activation=sigmoid, pad=True)
    return fuse(merged)
Пример #25
0
    def input_layer(self,cgw,cc,qgw,qc,qnw,cnw):
        """Build the 'input_layer' block: embedding -> highway -> dropout -> RNN.

        The pipeline is defined once on inner placeholders and cloned with
        ``CloneMethod.share`` for the query side and for the context side.

        Args:
            cgw, cc, cnw: context glove-word, character and non-glove-word
                inputs (naming presumed from the placeholder names).
            qgw, qc, qnw: the corresponding query inputs.

        Returns:
            A block wrapping ``combine([c_processed, q_processed])``, i.e.
            the context output first and the query output second.
        """
        # Outer placeholders; as_block binds them to the arguments at the end.
        cgw_ph = C.placeholder()
        cnw_ph = C.placeholder()
        cc_ph  = C.placeholder()
        qgw_ph = C.placeholder()
        qnw_ph = C.placeholder()
        qc_ph  = C.placeholder()

        # Inner placeholders the shared pipeline is defined on.
        input_chars = C.placeholder(shape=(1,self.word_size,self.c_dim))
        input_glove_words = C.placeholder(shape=(self.wg_dim,))
        input_nonglove_words = C.placeholder(shape=(self.wn_dim,))

        # Character-CNN features (reshaped to self.convs) joined with the
        # word embedding.
        embedded = C.splice(
            C.reshape(self.charcnn(input_chars), self.convs),
            self.embed()(input_glove_words, input_nonglove_words), name='splice_embed')

        # The highway width is the sum of the three feature widths.
        highway = HighwayNetwork(dim=self.elmo_dim + self.hidden_dim + self.convs, 
                                 highway_layers=self.highway_layers)(embedded)
        highway_drop = C.layers.Dropout(self.dropout)(highway)
        processed = OptimizedRnnStack(self.hidden_dim,
             num_layers=1,
             bidirectional=True,
             use_cudnn=self.use_cudnn,
             name='input_rnn')(highway_drop)
        
        # Character ids are one-hot encoded before entering the char CNN.
        qce = C.one_hot(qc_ph, num_classes=self.c_dim, sparse_output=self.use_sparse)
        cce = C.one_hot(cc_ph, num_classes=self.c_dim, sparse_output=self.use_sparse)
                
        # Same pipeline, once per side, with the inner placeholders replaced.
        q_processed = processed.clone(C.CloneMethod.share, 
            {input_chars:qce, input_glove_words:qgw_ph, input_nonglove_words:qnw_ph})
        c_processed = processed.clone(C.CloneMethod.share, 
            {input_chars:cce, input_glove_words:cgw_ph, input_nonglove_words:cnw_ph})

        return C.as_block(
            C.combine([c_processed, q_processed]),
            [(cgw_ph, cgw), (cc_ph, cc), (qgw_ph, qgw), (qc_ph, qc), (qnw_ph, qnw), (cnw_ph, cnw)],
            'input_layer',
            'input_layer')
Пример #26
0
def create_model_cnn_body():
    """Build a two-branch text CNN classifier.

    Two inputs (``xb`` and ``xt`` from the enclosing module) are embedded,
    batch-normalised, turned into fixed-size windows, convolved with kernel
    heights 1, 2 and 3, max-pooled, spliced along axis 0 and classified by
    two Dense layers.

    Returns:
        The output of the final 'classify' Dense layer.
    """

    with C.layers.default_options(initial_state=0.1):

       

        # NOTE(review): h1t embeds xb and h1b embeds xt. If xt is the title
        # input and xb the body input, the two are swapped relative to the
        # variable names used below - TODO confirm intent.
        h1t= C.layers.Embedding(300,name='embed')(xb)#init=embedding,
        h1b= C.layers.Embedding(300,name='embed')(xt)#init=embedding,
        bnb = C.layers.BatchNormalization(name='bn')(h1b)
        bnt = C.layers.BatchNormalization(name='bn')(h1t)
        # Convert each sequence into a fixed window on a static axis; only the
        # first output of PastValueWindow is used.
        to_static_t= C.layers.PastValueWindow(window_size=max_length_title, axis=-2)(bnt)[0]
        to_static_b= C.layers.PastValueWindow(window_size=max_length_body, axis=-2)(bnb)[0]


        # Kernel heights 1, 2 and 3 over the "t" branch ...
        h2_1t=C.layers.Convolution((1,emb_dim),num_filters=100,reduction_rank=0,activation=C.relu)(to_static_t)
        h2_2t=C.layers.Convolution((2,emb_dim),num_filters=100,reduction_rank=0,activation=C.relu)(to_static_t)
        h2_3t=C.layers.Convolution((3,emb_dim),num_filters=100,reduction_rank=0,activation=C.relu)(to_static_t)
        
        # ... and over the "b" branch.
        h2_1b=C.layers.Convolution((1,emb_dim),num_filters=100,reduction_rank=0,activation=C.relu)(to_static_b)
        h2_2b=C.layers.Convolution((2,emb_dim),num_filters=100,reduction_rank=0,activation=C.relu)(to_static_b)
        h2_3b=C.layers.Convolution((3,emb_dim),num_filters=100,reduction_rank=0,activation=C.relu)(to_static_b)

        # NOTE(review): both branches pool with max_length although their
        # windows use max_length_title / max_length_body - TODO confirm.
        h3_2t=C.layers.MaxPooling((max_length-1,1),name='pooling')(h2_2t)
        h3_1t=C.layers.MaxPooling((max_length,1),name='pooling')(h2_1t)
        h3_3t=C.layers.MaxPooling((max_length-2,1),name='pooling')(h2_3t)
        
        h3_2b=C.layers.MaxPooling((max_length-1,1),name='pooling')(h2_2b)
        h3_1b=C.layers.MaxPooling((max_length,1),name='pooling')(h2_1b)
        h3_3b=C.layers.MaxPooling((max_length-2,1),name='pooling')(h2_3b)

        # All six pooled feature maps joined along axis 0.
        h3=C.splice(h3_2t,h3_1t,h3_3t,h3_2b,h3_1b,h3_3b,axis=0)
        
        h4=C.layers.Dense(hidden_dim, activation=C.relu,name='hidden')(h3)
        drop2 = C.layers.Dropout(0.5)(h4)

        h5=C.layers.Dense(num_labels,name='classify')(drop2)

    return h5
Пример #27
0
def test_large_model_serialization_float(tmpdir):
    """Save and reload a model whose file exceeds 2 GiB, then compare parameters.

    The first parameter matrix alone is sized so that its float32 storage
    is larger than 2**31 bytes.
    """
    import os
    from cntk.layers import Recurrence, LSTM, Dense

    two_gb = 2**31
    type_size = np.dtype(np.float32).itemsize
    size = (2097152 + 4, 256, 512, 4096)
    in_dim, proj_dim, hidden_dim = size[0], size[1], size[2]
    assert in_dim * proj_dim * type_size > two_gb

    device = C.device.cpu()
    i = C.sequence.input(in_dim)
    w = C.Parameter((in_dim, proj_dim),
                    init=C.uniform(3.0, seed=12345),
                    device=device)
    e = C.times(i, w)

    # Bidirectional LSTM; keep the last forward and the first backward step.
    h_fwd = Recurrence(LSTM(hidden_dim))(e)
    h_bwd = Recurrence(LSTM(hidden_dim), go_backwards=True)(e)
    t = C.splice(C.sequence.last(h_fwd), C.sequence.first(h_bwd))

    z1 = Dense(hidden_dim, activation=C.relu)(t)
    z = Dense(2, activation=None)(z1)

    filename = str(tmpdir / 'test_large_model_serialization_float.out')
    z.save(filename)

    assert os.path.getsize(filename) > two_gb

    y = C.Function.load(filename, device=device)

    assert len(z.parameters) == len(y.parameters)

    for saved, restored in zip(z.parameters, y.parameters):
        assert saved.shape == restored.shape
        assert np.allclose(saved.value, restored.value)
Пример #28
0
def test_sequence_step_function_scalar_shape_inferrence():
    """Check that a reset-gated LSTM over one long sequence matches the plain
    LSTM evaluated on the individual sequences.

    A 0/1 flag is spliced onto the input; inside the step function it
    multiplies the previous hidden and cell state, so 0 resets the state.
    """
    in_dim = 5
    hidden_dim = 3
    x = C.sequence.input_variable((in_dim, ))
    r = C.sequence.input_variable((1, ))  # value of 0/1. 0 means reset
    # Recurrence only takes 1 input, so concatenate the two
    merged_x = C.splice(x, r)
    cell = C.layers.LSTM(hidden_dim)  # (dh, dc, x) -> (h, c)
    y = C.layers.Recurrence(cell)(x)

    @C.Function
    def lstm_with_reset(dh, dc, xr):
        # Everything but the last element is the input; the last is the flag.
        xx = xr[0:-1]
        rr = xr[-1]
        return cell(rr * dh, rr * dc, xx)

    yr = C.layers.Recurrence(lstm_with_reset)(merged_x)

    seq_len = [2, 3, 5]
    total_len = np.sum(seq_len)
    ends = np.cumsum(seq_len)
    split_points = ends[0:-1]

    x_total_data = np.random.rand(1, total_len, in_dim).astype(np.float32)
    pieces = np.split(x_total_data, split_points, axis=1)
    x_data = [np.squeeze(piece) for piece in pieces]

    # Reset flag: 1 everywhere, 0 at the first step of each later sequence.
    r_data = np.ones(ends[-1])
    r_data[split_points] = 0
    r_data = np.reshape(r_data, (-1, 1)).astype(np.float32)

    v1 = y.eval(x_data)
    v2 = yr.eval({x: x_total_data, r: r_data})

    assert np.allclose(np.concatenate(v1), v2[0])
Пример #29
0
    def indy_lstm(dh, dc, x):
        """One step of an LSTM whose recurrent term is a product with ``H1``.

        Instead of a matrix product with the previous hidden state, the
        stabilized state is spliced four times and multiplied with ``H1``.
        All parameters, stabilizers and flags (``b``, ``W``, ``H1``, ``Sdh``,
        ``Sdc``, ``Sct``, ``Sht``, ``Ci``, ``Cf``, ``Co``, ``Wmr``,
        ``stack_axis``, ``stacked_dim``, ``use_peepholes``,
        ``has_projection``, ``activation``) come from the enclosing scope.

        Args:
            dh: previous hidden state.
            dc: previous cell state.
            x: current input.

        Returns:
            A tuple ``(h, c)`` with the new hidden and cell state.
        """

        dhs = Sdh(dh)  # previous values, stabilized
        dcs = Sdc(dc)
        # note: input does not get a stabilizer here, user is meant to do that outside

        # projected contribution from input(s), hidden, and bias
        proj4 = b + times(x, W) + C.splice(
            dhs, dhs, dhs, dhs) * H1  # 4 is the number of stacked dim

        # NOTE: `slice` is presumably the CNTK op imported at module level
        # (it is called with an axis and begin/end) - TODO confirm.
        it_proj = slice(proj4, stack_axis, 0 * stacked_dim,
                        1 * stacked_dim)  # split along stack_axis
        bit_proj = slice(proj4, stack_axis, 1 * stacked_dim, 2 * stacked_dim)
        ft_proj = slice(proj4, stack_axis, 2 * stacked_dim, 3 * stacked_dim)
        ot_proj = slice(proj4, stack_axis, 3 * stacked_dim, 4 * stacked_dim)

        # helper to inject peephole connection if requested
        # (the parameter named C shadows the module alias C inside peep only)
        def peep(x, c, C):
            return x + C * c if use_peepholes else x

        it = sigmoid(peep(it_proj, dcs, Ci))  # input gate(t)
        # TODO: should both activations be replaced?
        bit = it * activation(bit_proj)  # applied to tanh of input network

        ft = sigmoid(peep(ft_proj, dcs, Cf))  # forget-me-not gate(t)
        bft = ft * dc  # applied to cell(t-1)

        ct = bft + bit  # c(t) is sum of both

        ot = sigmoid(peep(ot_proj, Sct(ct), Co))  # output gate(t)
        ht = ot * activation(ct)  # applied to tanh(cell(t))

        c = ct  # cell value
        # Optional projection of the stabilized hidden state.
        h = times(Sht(ht), Wmr) if has_projection else ht

        return h, c
Пример #30
0
def test_large_model_serialization_float(tmpdir):
    """Save a model larger than 2 GB, reload it and compare parameters.

    The model is an embedding-like ``times`` followed by a forward and a
    backward LSTM recurrence and two dense layers. After saving, the file
    size is checked against the 2 GB mark, and every reloaded parameter is
    compared with the original in shape and value.
    """
    import os
    from cntk.layers import Recurrence, LSTM, Dense

    bytes_per_value = np.dtype(np.float32).itemsize
    two_gb = 2**31
    size = (2097152 + 4, 256, 512, 4096)
    input_dim, proj_dim, hidden_dim = size[0], size[1], size[2]
    # The first weight matrix alone must already exceed 2 GB.
    assert input_dim * proj_dim * bytes_per_value > two_gb

    device = C.device.cpu()
    features = C.sequence.input(input_dim)
    weights = C.Parameter((input_dim, proj_dim),
                          init=C.uniform(3.0, seed=12345),
                          device=device)
    projected = C.times(features, weights)

    forward = Recurrence(LSTM(hidden_dim))(projected)
    backward = Recurrence(LSTM(hidden_dim), go_backwards=True)(projected)
    forward_last = C.sequence.last(forward)
    backward_first = C.sequence.first(backward)
    joined = C.splice(forward_last, backward_first)

    hidden = Dense(hidden_dim, activation=C.relu)(joined)
    z = Dense(2, activation=None)(hidden)

    filename = str(tmpdir / 'test_large_model_serialization_float.out')
    z.save(filename)

    assert os.path.getsize(filename) > two_gb

    y = C.Function.load(filename, device=device)

    assert (len(z.parameters) == len(y.parameters))

    for saved, loaded in zip(z.parameters, y.parameters):
        assert saved.shape == loaded.shape
        assert np.allclose(saved.value, loaded.value)
Пример #31
0
    def output_layer(self, embed, attention_context, model_context, aw, q_processed, c_processed,cw):
        """Build the output block: start/end logits plus a sequence decoder.

        The graph is built on placeholders, which are bound to the actual
        arguments in the final ``C.as_block`` call.

        Args:
            embed: callable used by the decoder as ``embed(gx, ngx)`` on two
                slices of the history input.
            attention_context: bound to the ``att_context`` placeholder.
            model_context: bound to the ``mod_context`` placeholder.
            aw: bound to the ``a_onehot`` placeholder (training labels).
            q_processed: bound to the ``query_processed`` placeholder.
            c_processed: bound to the ``context_processed`` placeholder.
            cw: bound to the ``cw_ph`` placeholder.

        Returns:
            A block named 'attention_layer' combining
            ``(model_train, model_greedy, start_logits, end_logits, context)``.
        """
        cw_ph=C.placeholder()
        att_context = C.placeholder(shape=(8*self.hidden_dim,))
        query_processed = C.placeholder(shape=(2*self.hidden_dim,))
        context_processed = C.placeholder(shape=(2*self.hidden_dim,))
        # NOTE(review): no trailing comma here, so the shape is a plain int
        # rather than a 1-tuple like the neighbouring placeholders.
        mod_context = C.placeholder(shape=(2*self.hidden_dim))
        a_onehot = C.placeholder(shape=(self.vocab_size+1,))

        # Start logits: one value per step from the spliced contexts.
        start_logits = C.layers.Dense(1, name='out_start')(C.dropout(C.splice(mod_context, att_context), self.dropout))
        start_hardmax = seq_hardmax(start_logits)
        # Gather mod_context where start_hardmax selects, take the last
        # gathered item and broadcast it along att_context's sequence.
        att_mod_ctx = C.sequence.last(C.sequence.gather(mod_context, start_hardmax))
        att_mod_ctx_expanded = C.sequence.broadcast_as(att_mod_ctx, att_context)
        end_input = C.splice(att_context, mod_context, att_mod_ctx_expanded, mod_context * att_mod_ctx_expanded)
        m2 = OptimizedRnnStack(self.hidden_dim, bidirectional=True, use_cudnn=self.use_cudnn, name='output_rnn')(end_input)
        end_logits = C.layers.Dense(1, name='out_end')(C.dropout(C.splice(m2, att_context), self.dropout))

        # NOTE(review): start_flag and end_flag are not referenced again in
        # this method.
        start_flag = C.hardmax(start_logits)
        end_flag = C.hardmax(end_logits)
     
        def create_model():
            """Return the ``decode`` function (encoders plus attention decoder)."""
            # Encoder: (input*) --> (h0, c0)
            # Create multiple layers of LSTMs by passing the output of the i-th layer
            # to the (i+1)th layer as its input
            with C.layers.default_options(enable_self_stabilization=True, go_backwards=False):
                # NOTE(review): LastRecurrence is not referenced again here.
                LastRecurrence = C.layers.Recurrence
                # Separate encoders for the query (encode) and the context (encode_c).
                encode = C.layers.Sequential([
                    C.layers.Stabilizer(),
                    OptimizedRnnStack(self.hidden_dim, return_full_state=True),
                ])

                encode_c = C.layers.Sequential([
                    C.layers.Stabilizer(),
                    OptimizedRnnStack(self.hidden_dim, return_full_state=True),
                ])
            
            # Decoder: (history*, input*) --> unnormalized_word_logp*
            # where history is one of these, delayed by 1 step and <s> prepended:
            #  - training: labels
            #  - testing:  its own output hardmax(z) (greedy decoder)
            with C.layers.default_options(enable_self_stabilization=True):
                # sub-layers
                stab_in = C.layers.Stabilizer()
                # Only rec_blocks[0] is used by decode below.
                rec_blocks = [C.layers.LSTM(self.hidden_dim) for i in range(self.num_layers)]
                stab_out = C.layers.Stabilizer()
                proj_out = C.layers.Dense(self.vocab_size+1, name='out_proj')
                # attention model
                attention_model = C.layers.AttentionModel(self.attention_dim, 
                                                              name='attention_model') # :: (h_enc*, h_dec) -> (h_dec augmented)
                hstate_dense = C.layers.Dense(self.hidden_dim, activation=C.tanh, input_rank=1)
                cstate_dense = C.layers.Dense(self.hidden_dim, activation=C.tanh, input_rank=1)
                W_dense = C.layers.Dense(2*self.hidden_dim, input_rank=1)
                U_dense = C.layers.Dense(2*self.hidden_dim, input_rank=1)
                V_dense = C.layers.Dense(2*self.hidden_dim, input_rank=1)
                maxout  = C.layers.MaxPooling((2,), strides=2)
                # layer function
                @C.Function
                def decode(history, q, c, start_logits, end_logits):
                    q = encode(q)
                    # The context is encoded together with both logits, spliced on axis 0.
                    c = encode_c(C.splice(c, start_logits, end_logits, axis=0))
                    r = history
                    r = stab_in(r)

                    # Last-step outputs of both encoders seed the decoder state.
                    q_last_h = C.sequence.last(q.outputs[0])
                    q_last_c = C.sequence.last(q.outputs[1])
                    c_last_h = C.sequence.last(c.outputs[0])
                    c_last_c = C.sequence.last(c.outputs[1])
                    initial_hstate = hstate_dense(C.splice(q_last_h, c_last_h))
                    initial_cstate = cstate_dense(C.splice(q_last_c, c_last_c))

                    rec_block = rec_blocks[0]   # LSTM(hidden_dim)  # :: (dh, dc, x) -> (h, c)
                    
                    @C.Function
                    def find_embed(x):
                        # Split x on axis 0 into [0, wg_dim) and [wg_dim, vocab_size)
                        # and pass both parts to embed.
                        gx, ngx = C.slice(x, 0, 0, self.wg_dim), C.slice(x, 0, self.wg_dim, self.vocab_size)
                        return embed(gx, ngx) 

                    @C.Function
                    def lstm_with_attention(dh, dc, r, x):
                        history_embed = find_embed(x)
                        # Attend over both encoder outputs using the previous hidden state.
                        h_att = attention_model(c.outputs[0], dh)
                        q_att = attention_model(q.outputs[0], dh)
                        att = C.splice(h_att, q_att)
                        x = C.splice(x, att)
                        x, dc = rec_block(dh, dc, x).outputs
          
                        # 0*r is a hack because cntk freaks out when r is not used.
                        r = U_dense(att) + W_dense(history_embed) + V_dense(x) + 0*r 
                        # NOTE: the original author observed a bug when W_dense is added first:
                        #r = W_dense(embed(gx, ngx)) + U_dense(att) + V_dense(x) + 0*r
                        return x, dc, r
                    # Run the recurrence over r; only the third state (r) is kept.
                    _, _, r = C.layers.RecurrenceFrom(lstm_with_attention, return_full_state=True)(initial_hstate, initial_cstate, C.Constant(np.zeros(2*self.hidden_dim)),r).outputs
        
                    r = maxout(r)
                    r = stab_out(r)
                    r = proj_out(r)
                    #r = C.softmax(r)
                    r = C.layers.Label('out_proj_out')(r)
                    return r
            return decode

        def create_model_train(s2smodel):
            """Wrap s2smodel so the history is the label sequence delayed by one step."""
            # model used in training (history is known from labels)
            # note: the labels must NOT contain the initial <s>
            @C.Function
            def model_train(labels, q, c, start_logits, end_logits): # (input*, labels*) --> (word_logp*)

                # The input to the decoder always starts with the special label sequence start token.
                # Then, use the previous value of the label sequence (for training) or the output (for execution).
                past_labels = C.layers.Delay(initial_state=self.sentence_start)(labels)
    
                return s2smodel(past_labels, q, c, start_logits, end_logits)
            return model_train

        def create_model_greedy(s2smodel):
            """Wrap s2smodel so the history is the decoder's own hardmax output."""
            # model used in (greedy) decoding (inferencing) (history is decoder's own output)
            @C.Function
            def model_greedy(q, c, start_logits, end_logits): # (input*) --> (word_sequence*)
                # Decoding is an unfold() operation starting from sentence_start.
                # We must transform s2smodel (history*, input* -> word_logp*) into a generator (history* -> output*)
                # which holds 'input' in its closure.
                unfold = C.layers.UnfoldFrom(\
                                    lambda history: s2smodel(history, q, c, start_logits, end_logits) >> C.hardmax,
                                    # stop once sentence_end_index was max-scoring output
                                    until_predicate=lambda w: w[...,self.sentence_end_index],
                                    length_increase=self.sentence_max_length)
                return unfold(initial_state=self.sentence_start, dynamic_axes_like=c)
            return model_greedy
       
        s2smodel = create_model()
      
        # Instantiate the training and greedy variants on the placeholders.
        model_train = create_model_train(s2smodel)(a_onehot, query_processed, context_processed, start_logits, end_logits)
        model_greed = create_model_greedy(s2smodel)(query_processed, context_processed, start_logits, end_logits)
        # Argmax over axis 0 for the greedy output and for cw_ph.
        model_greedy = C.argmax(model_greed,0)
        context = C.argmax(cw_ph,0)

        # Bind every placeholder to the corresponding method argument.
        return C.as_block(
            C.combine((model_train, model_greedy, start_logits, end_logits,context)),
            [(att_context, attention_context), (mod_context, model_context), (a_onehot, aw), (query_processed, q_processed), (context_processed, c_processed),(cw_ph,cw)],
            'attention_layer',
            'attention_layer')
Пример #32
0
def validate_model(test_data, model, polymath):
    """Evaluate ``model`` on ``test_data`` and print span statistics.

    The first three model outputs are read as begin logits, end logits and
    loss. For every minibatch the best span is obtained through the
    gradient of ``symbolic_best_span`` and compared with the labels named
    'ab' and 'ae'.

    Args:
        test_data: data handed to ``create_mb_and_map``.
        model: function whose ``outputs[0:3]`` are begin logits, end logits
            and loss.
        polymath: object handed to ``create_mb_and_map``.

    Returns:
        The loss summed over all minibatches divided by the number of
        sequences seen.
    """
    begin_logits = model.outputs[0]
    end_logits = model.outputs[1]
    loss = model.outputs[2]
    root = C.as_composite(loss.owner)
    mb_source, input_map = create_mb_and_map(root, test_data, polymath, randomize=False, repeat=False)
    begin_label = argument_by_name(root, 'ab')
    end_label = argument_by_name(root, 'ae')

    begin_prediction = C.sequence.input_variable(1, sequence_axis=begin_label.dynamic_axes[1], needs_gradient=True)
    end_prediction = C.sequence.input_variable(1, sequence_axis=end_label.dynamic_axes[1], needs_gradient=True)

    best_span_score = symbolic_best_span(begin_prediction, end_prediction)
    # Running sum of (begin - delayed end) over the sequence, for the
    # predicted and the true markers.
    predicted_span = C.layers.Recurrence(C.plus)(begin_prediction - C.sequence.past_value(end_prediction))
    true_span = C.layers.Recurrence(C.plus)(begin_label - C.sequence.past_value(end_label))
    common_span = C.element_min(predicted_span, true_span)
    begin_match = C.sequence.reduce_sum(C.element_min(begin_prediction, begin_label))
    end_match = C.sequence.reduce_sum(C.element_min(end_prediction, end_label))

    predicted_len = C.sequence.reduce_sum(predicted_span)
    true_len = C.sequence.reduce_sum(true_span)
    common_len = C.sequence.reduce_sum(common_span)
    f1 = 2*common_len/(predicted_len+true_len)
    exact_match = C.element_min(begin_match, end_match)
    precision = common_len/predicted_len
    recall = common_len/true_len
    overlap = C.greater(common_len, 0)

    def s(x):
        # Sum over all axes so each statistic is one number per minibatch.
        return C.reduce_sum(x, axis=C.Axis.all_axes())

    stats = C.splice(s(f1), s(exact_match), s(precision), s(recall), s(overlap), s(begin_match), s(end_match))

    # Evaluation parameters
    minibatch_size = 20000
    num_sequences = 0

    stat_sum = 0
    loss_sum = 0

    while True:
        data = mb_source.next_minibatch(minibatch_size, input_map=input_map)
        if not data or not (begin_label in data) or data[begin_label].num_sequences == 0:
            break
        out = model.eval(data, outputs=[begin_logits,end_logits,loss], as_numpy=False)
        testloss = out[loss]
        # The gradient of the best-span score w.r.t. the predictions is
        # fed to the statistics graph as the begin/end predictions.
        g = best_span_score.grad({begin_prediction:out[begin_logits], end_prediction:out[end_logits]}, wrt=[begin_prediction,end_prediction], as_numpy=False)
        other_input_map = {begin_prediction: g[begin_prediction], end_prediction: g[end_prediction], begin_label: data[begin_label], end_label: data[end_label]}
        stat_sum += stats.eval((other_input_map))
        loss_sum += np.sum(testloss.asarray())
        num_sequences += data[begin_label].num_sequences

    # NOTE(review): raises ZeroDivisionError if no sequences were read.
    stat_avg = stat_sum / num_sequences
    loss_avg = loss_sum / num_sequences

    # Every metric uses four decimal places ({:.4f}); "{:4f}" would only
    # set a minimum field width and print six decimals.
    print("Validated {} sequences, loss {:.4f}, F1 {:.4f}, EM {:.4f}, precision {:.4f}, recall {:.4f} hasOverlap {:.4f}, start_match {:.4f}, end_match {:.4f}".format(
            num_sequences,
            loss_avg,
            stat_avg[0],
            stat_avg[1],
            stat_avg[2],
            stat_avg[3],
            stat_avg[4],
            stat_avg[5],
            stat_avg[6]))

    return loss_avg
Пример #33
0
def LookAhead(x):
    """Splice ``x`` with its ``C.sequence.future_value``."""
    return C.splice(x, C.sequence.future_value(x))
Пример #34
0
 def func(x):
     """Splice a forward and a backward GRU recurrence over ``x``."""
     forward = C.layers.Recurrence(C.layers.GRU(hidden_dim))(x)
     backward = C.layers.Recurrence(C.layers.GRU(hidden_dim),
                                    go_backwards=True)(x)
     return C.splice(forward, backward, name=name)
Пример #35
0
def BiRecurrence(fwd, bwd):
    """Build a bidirectional recurrence over a fresh placeholder.

    Args:
        fwd: step function for the forward recurrence.
        bwd: step function for the backward recurrence.

    Returns:
        The splice of both recurrences applied to the same placeholder.
    """
    forward = C.layers.Recurrence(fwd)
    backward = C.layers.Recurrence(bwd, go_backwards=True)
    source = C.placeholder()
    # Concatenate the outputs of both directions.
    return C.splice(forward(source), backward(source))
Пример #36
0
def flow_forward(input_dim: int, act_func_pair: tuple = (None, None), batch_norm: bool = False):
    """Build one flow step on a placeholder: rotation plus affine coupling.

    Args:
        input_dim: size of the placeholder the step is built on.
        act_func_pair: ``(log_s_func, t_func)``; a ``None`` entry is
            replaced by a default three-layer ``Dense`` stack.
        batch_norm: when true, batch mean/variance nodes are recorded in
            the chunk and an extra scale/shift using stored constants is
            applied before the rotation.

    Returns:
        ``(output, log_det_J, chunk)`` where ``chunk`` is a dict holding the
        pieces created along the way ('input_dim', 'W_rot_mat',
        'log_s_func', 't_func', 'output', and with ``batch_norm`` also
        'mu', 'var', 'muB', 'varB').
    """
    chunk = {'input_dim': input_dim}
    log_det_J = 0

    flow_in = C.placeholder(input_dim, name='place_holder')

    if batch_norm:
        chunk['mu'] = C.Constant(np.zeros(shape=input_dim))
        chunk['var'] = C.Constant(np.ones(shape=input_dim))

        eps = C.Constant(1e-7)
        batch_mean = C.reduce_mean(flow_in, axis=C.Axis.default_batch_axis())
        batch_var = C.reduce_mean(C.square(flow_in - batch_mean),
                                  axis=C.Axis.default_batch_axis())

        chunk['muB'] = batch_mean
        chunk['varB'] = batch_var

        # Scale by sqrt(var + eps) and shift by mu using the stored
        # constants; the result replaces the placeholder downstream.
        flow_in = C.sqrt(chunk['var'] + eps) * flow_in + chunk['mu']

        log_det_J += -0.5 * C.reduce_sum(C.log((batch_var + eps)))

    # Square parameter initialised with a random special-orthogonal matrix.
    rotation = C.parameter((input_dim, input_dim))
    chunk['W_rot_mat'] = rotation
    rotation.value = special_ortho_group.rvs(input_dim)
    rotated = flow_in @ rotation
    log_det_J += C.log(C.abs(C.det(rotation)))

    # Split the rotated value into two parts at input_dim // 2.
    half = input_dim // 2
    first_part = rotated[:half]
    second_part = rotated[half:]

    log_s_func, t_func = act_func_pair
    if log_s_func is None:
        # Default scale network.
        log_s_func = C.layers.Sequential([
            C.layers.Dense(256, C.leaky_relu),
            C.layers.Dense(256, C.leaky_relu),
            C.layers.Dense(half, C.tanh),
        ])
    if t_func is None:
        # Default shift network.
        t_func = C.layers.Sequential([
            C.layers.Dense(256, C.leaky_relu),
            C.layers.Dense(256, C.leaky_relu),
            C.layers.Dense(half),
        ])

    chunk['log_s_func'] = log_s_func
    chunk['t_func'] = t_func

    # Both networks read only the second part.
    log_s = log_s_func(second_part)
    shift = t_func(second_part)

    # The first part is scaled and shifted; the second passes through.
    transformed_first = C.exp(log_s) * first_part + shift
    result = C.splice(transformed_first, second_part)
    chunk['output'] = result

    log_det_J += C.reduce_sum(log_s)

    return result, log_det_J, chunk
Пример #37
0
def create_rpn(conv_out, scaled_gt_boxes, im_info, add_loss_functions=True,
               proposal_layer_param_string=None):
    '''
    Creates a region proposal network for object detection as proposed in the "Faster R-CNN" paper:
        Shaoqing Ren and Kaiming He and Ross Girshick and Jian Sun:
        "Faster R-CNN: Towards Real-Time Object Detection with Region Proposal Networks"

    Outputs object detection proposals by applying estimated bounding-box
    transformations to a set of regular boxes (called "anchors").

    Args:
        conv_out:        The convolutional feature map, i.e. the output of the conv layers from the pretrained classification network
        scaled_gt_boxes: The ground truth boxes as (x1, y1, x2, y2, label). Coordinates are absolute pixels wrt. the input image.
        im_info:         (image_width, image_height, image_scale) as CNTK variable or constant
        add_loss_functions: If set to True rpn_losses will be returned, otherwise None is returned for the losses
        proposal_layer_param_string: A yaml parameter string that is passed to the proposal layer.

    Returns:
        rpn_rois - the proposed ROIs
        rpn_losses - the losses (SmoothL1 loss for bbox regression plus cross entropy for objectness)
    '''

    # RPN network: a shared 3x3 convolution feeding two 1x1 heads.
    shared_conv = Convolution((3, 3), 256, activation=relu, pad=True, strides=1,
                              init=normal(scale=0.01), init_bias=0.1)
    rpn_conv_3x3 = shared_conv(conv_out)

    # 2(bg/fg) * 9(anchors)
    cls_head = Convolution((1, 1), 18, activation=None, name="rpn_cls_score",
                           init=normal(scale=0.01), init_bias=0.1)
    rpn_cls_score = cls_head(rpn_conv_3x3)

    # 4(coords) * 9(anchors)
    bbox_head = Convolution((1, 1), 36, activation=None, name="rpn_bbox_pred",
                            init=normal(scale=0.01), init_bias=0.1)
    rpn_bbox_pred = bbox_head(rpn_conv_3x3)

    # Softmax over (bg, fg), then reshape back to the score map's shape.
    num_predictions = int(np.prod(rpn_cls_score.shape) / 2)
    scores_two_rows = reshape(rpn_cls_score, (2, num_predictions))
    rpn_cls_prob = softmax(scores_two_rows, axis=0, name="objness_softmax")
    probs_on_grid = reshape(rpn_cls_prob, rpn_cls_score.shape)

    # Proposal layer
    proposal = ProposalLayer(probs_on_grid, rpn_bbox_pred, im_info,
                             param_str=proposal_layer_param_string)
    rpn_rois = alias(user_function(proposal), name='rpn_rois')

    if not add_loss_functions:
        return rpn_rois, None

    # RPN targets; rpn_cls_score is only passed to get the width and
    # height of the conv feature map.
    atl = user_function(AnchorTargetLayer(rpn_cls_score, scaled_gt_boxes, im_info,
                                          param_str=proposal_layer_param_string))
    rpn_labels = atl.outputs[0]
    rpn_bbox_targets = atl.outputs[1]
    rpn_bbox_inside_weights = atl.outputs[2]

    # Ignore label predictions for the 'ignore label', i.e. set target and
    # prediction to 0 --> needs to be softmaxed before.
    labels_one_row = reshape(rpn_labels, (1, num_predictions))
    ignore = user_function(IgnoreLabel(rpn_cls_prob, labels_one_row, ignore_label=-1))
    probs_kept = ignore.outputs[0]
    fg_targets = ignore.outputs[1]
    bg_targets = 1 - fg_targets
    targets_kept = splice(bg_targets, fg_targets, axis=0)

    # RPN losses
    rpn_loss_cls = cross_entropy_with_softmax(probs_kept, targets_kept, axis=0)
    rpn_loss_bbox = user_function(SmoothL1Loss(rpn_bbox_pred, rpn_bbox_targets,
                                               rpn_bbox_inside_weights))
    rpn_losses = plus(reduce_sum(rpn_loss_cls), reduce_sum(rpn_loss_bbox),
                      name="rpn_losses")

    return rpn_rois, rpn_losses
Пример #38
0
def UpSampling2D(x):
    """Double the size of axes 1 and 2 of ``x`` by repeating each element.

    A singleton axis is inserted after each of the two axes, the tensor is
    spliced with itself along those new axes, and the result is reshaped
    back to three axes.
    """
    dim0 = x.shape[0]
    dim1 = x.shape[1]
    dim2 = x.shape[2]

    expanded = c.reshape(x, (dim0, dim1, 1, dim2, 1))
    doubled_last = c.splice(expanded, expanded, axis=-1)
    doubled_both = c.splice(doubled_last, doubled_last, axis=-3)
    return c.reshape(doubled_both, (dim0, dim1 * 2, dim2 * 2))
def validate_model(test_data, model, polymath):
    """Evaluate ``model`` on ``test_data`` and print span statistics.

    The first three model outputs are read as begin logits, end logits and
    loss. For every minibatch the best span is obtained through the
    gradient of ``symbolic_best_span`` and compared with the labels named
    'ab' and 'ae'. Progress is reported through a ``tqdm`` bar.

    Args:
        test_data: data handed to ``create_mb_and_map``.
        model: function whose ``outputs[0:3]`` are begin logits, end logits
            and loss.
        polymath: object handed to ``create_mb_and_map``.

    Returns:
        The loss summed over all minibatches divided by the number of
        sequences seen.
    """
    begin_logits = model.outputs[0]
    end_logits = model.outputs[1]
    loss = model.outputs[2]
    root = C.as_composite(loss.owner)
    mb_source, input_map = create_mb_and_map(root,
                                             test_data,
                                             polymath,
                                             randomize=False,
                                             repeat=False)
    begin_label = argument_by_name(root, 'ab')
    end_label = argument_by_name(root, 'ae')

    begin_prediction = C.sequence.input_variable(
        1, sequence_axis=begin_label.dynamic_axes[1], needs_gradient=True)
    end_prediction = C.sequence.input_variable(
        1, sequence_axis=end_label.dynamic_axes[1], needs_gradient=True)

    best_span_score = symbolic_best_span(begin_prediction, end_prediction)
    # Running sum of (begin - delayed end) over the sequence, for the
    # predicted and the true markers.
    predicted_span = C.layers.Recurrence(
        C.plus)(begin_prediction - C.sequence.past_value(end_prediction))
    true_span = C.layers.Recurrence(C.plus)(begin_label -
                                            C.sequence.past_value(end_label))
    common_span = C.element_min(predicted_span, true_span)
    begin_match = C.sequence.reduce_sum(
        C.element_min(begin_prediction, begin_label))
    end_match = C.sequence.reduce_sum(C.element_min(end_prediction, end_label))

    predicted_len = C.sequence.reduce_sum(predicted_span)
    true_len = C.sequence.reduce_sum(true_span)
    common_len = C.sequence.reduce_sum(common_span)
    f1 = 2 * common_len / (predicted_len + true_len)
    exact_match = C.element_min(begin_match, end_match)
    precision = common_len / predicted_len
    recall = common_len / true_len
    overlap = C.greater(common_len, 0)

    def s(x):
        # Sum over all axes so each statistic is one number per minibatch.
        return C.reduce_sum(x, axis=C.Axis.all_axes())

    stats = C.splice(s(f1), s(exact_match), s(precision), s(recall),
                     s(overlap), s(begin_match), s(end_match))

    # Evaluation parameters
    minibatch_size = 2048
    num_sequences = 0

    stat_sum = 0
    loss_sum = 0

    with tqdm(ncols=32) as progress_bar:
        while True:
            data = mb_source.next_minibatch(minibatch_size,
                                            input_map=input_map)
            if not data or not (begin_label in data
                                ) or data[begin_label].num_sequences == 0:
                break
            out = model.eval(data,
                             outputs=[begin_logits, end_logits, loss],
                             as_numpy=False)
            testloss = out[loss]
            # The gradient of the best-span score w.r.t. the predictions
            # is fed to the statistics graph as the begin/end predictions.
            g = best_span_score.grad(
                {
                    begin_prediction: out[begin_logits],
                    end_prediction: out[end_logits]
                },
                wrt=[begin_prediction, end_prediction],
                as_numpy=False)
            other_input_map = {
                begin_prediction: g[begin_prediction],
                end_prediction: g[end_prediction],
                begin_label: data[begin_label],
                end_label: data[end_label]
            }
            stat_sum += stats.eval((other_input_map))
            loss_sum += np.sum(testloss.asarray())
            num_sequences += data[begin_label].num_sequences
            progress_bar.update(data[begin_label].num_sequences)

    # NOTE(review): raises ZeroDivisionError if no sequences were read.
    stat_avg = stat_sum / num_sequences
    loss_avg = loss_sum / num_sequences

    # Every metric uses four decimal places ({:.4f}); "{:4f}" would only
    # set a minimum field width and print six decimals.
    print(
        "\nValidated {} sequences, loss {:.4f}, F1 {:.4f}, EM {:.4f}, precision {:.4f}, recall {:.4f} hasOverlap {:.4f}, start_match {:.4f}, end_match {:.4f}"
        .format(num_sequences, loss_avg, stat_avg[0], stat_avg[1], stat_avg[2],
                stat_avg[3], stat_avg[4], stat_avg[5], stat_avg[6]))

    return loss_avg
def create_rpn(conv_out, scaled_gt_boxes, im_info, add_loss_functions=True,
               proposal_layer_param_string=None, conv_bias_init=0.0):
    '''
    Creates a region proposal network for object detection as proposed in the "Faster R-CNN" paper:
        Shaoqing Ren and Kaiming He and Ross Girshick and Jian Sun:
        "Faster R-CNN: Towards Real-Time Object Detection with Region Proposal Networks"

    Outputs object detection proposals by applying estimated bounding-box
    transformations to a set of regular boxes (called "anchors").

    Args:
        conv_out:        The convolutional feature map, i.e. the output of the conv layers from the pretrained classification network
        scaled_gt_boxes: The ground truth boxes as (x1, y1, x2, y2, label). Coordinates are absolute pixels wrt. the input image.
        im_info:         A CNTK variable or constant containing
                         (pad_width, pad_height, scaled_image_width, scaled_image_height, orig_img_width, orig_img_height)
                         e.g. (1000, 1000, 1000, 600, 500, 300) for an original image of 600x300 that is scaled and padded to 1000x1000
        add_loss_functions: If set to True rpn_losses will be returned, otherwise None is returned for the losses
        proposal_layer_param_string: A yaml parameter string that is passed to the proposal layer.
        conv_bias_init:  Initial bias value used for all three RPN convolutions.

    Returns:
        rpn_rois - the proposed ROIs
        rpn_losses - the losses (SmoothL1 loss for bbox regression plus cross entropy for objectness)
    '''

    # RPN network: a shared 3x3 convolution feeding two 1x1 heads.
    # Weights use normal(scale=0.01); biases start at conv_bias_init.
    num_channels = cfg["CNTK"].RPN_NUM_CHANNELS
    rpn_conv_3x3 = Convolution((3, 3), num_channels, activation=relu, pad=True, strides=1,
                                init = normal(scale=0.01), init_bias=conv_bias_init)(conv_out)
    rpn_cls_score = Convolution((1, 1), 18, activation=None, name="rpn_cls_score",
                                init = normal(scale=0.01), init_bias=conv_bias_init)(rpn_conv_3x3)  # 2(bg/fg)  * 9(anchors)
    rpn_bbox_pred = Convolution((1, 1), 36, activation=None, name="rpn_bbox_pred",
                                init = normal(scale=0.01), init_bias=conv_bias_init)(rpn_conv_3x3)  # 4(coords) * 9(anchors)

    # apply softmax to get (bg, fg) probabilities and reshape predictions back to grid of (18, H, W)
    num_predictions = int(rpn_cls_score.shape[0] / 2)
    rpn_cls_score_rshp = reshape(rpn_cls_score, (2, num_predictions, rpn_cls_score.shape[1], rpn_cls_score.shape[2]), name="rpn_cls_score_rshp")
    # The softmax is built on a placeholder and wrapped as a named block.
    p_rpn_cls_score_rshp = cntk.placeholder()
    rpn_cls_sm = softmax(p_rpn_cls_score_rshp, axis=0)
    rpn_cls_prob = cntk.as_block(rpn_cls_sm, [(p_rpn_cls_score_rshp, rpn_cls_score_rshp)], 'Softmax', 'rpn_cls_prob')
    rpn_cls_prob_reshape = reshape(rpn_cls_prob, rpn_cls_score.shape, name="rpn_cls_prob_reshape")

    # proposal layer
    rpn_rois_raw = user_function(ProposalLayer(rpn_cls_prob_reshape, rpn_bbox_pred, im_info, param_str=proposal_layer_param_string))
    rpn_rois = alias(rpn_rois_raw, name='rpn_rois')

    rpn_losses = None
    if(add_loss_functions):
        # RPN targets
        # Comment: rpn_cls_score is only passed   vvv   to get width and height of the conv feature map ...
        atl = user_function(AnchorTargetLayer(rpn_cls_score, scaled_gt_boxes, im_info, param_str=proposal_layer_param_string))
        rpn_labels = atl.outputs[0]
        rpn_bbox_targets = atl.outputs[1]
        rpn_bbox_inside_weights = atl.outputs[2]

        # classification loss
        # Fresh placeholders: p_rpn_cls_score_rshp is rebound here, the one
        # used for the softmax block above is no longer referenced.
        p_rpn_labels = cntk.placeholder()
        p_rpn_cls_score_rshp = cntk.placeholder()

        # keeps is the mask of labels >= 0; other labels are zeroed out of
        # the targets and of the cross-entropy terms.
        keeps = cntk.greater_equal(p_rpn_labels, 0.0)
        fg_labels = element_times(p_rpn_labels, keeps, name="fg_targets")
        bg_labels = minus(1, fg_labels, name="bg_targets")
        rpn_labels_ignore = splice(bg_labels, fg_labels, axis=0)
        rpn_ce = cross_entropy_with_softmax(p_rpn_cls_score_rshp, rpn_labels_ignore, axis=0)
        rpn_loss_cls = element_times(rpn_ce, keeps)

        # The terms that are accounted for in the cls loss are those that have a label >= 0
        cls_num_terms = reduce_sum(keeps)
        cls_normalization_factor = 1.0 / cls_num_terms
        normalized_rpn_cls_loss = reduce_sum(rpn_loss_cls) * cls_normalization_factor

        # Bind the placeholders to the real labels and reshaped scores.
        reduced_rpn_loss_cls = cntk.as_block(normalized_rpn_cls_loss,
                                         [(p_rpn_labels, rpn_labels), (p_rpn_cls_score_rshp, rpn_cls_score_rshp)],
                                         'CE_with_ignore', 'norm_rpn_cls_loss')

        # regression loss
        p_rpn_bbox_pred = cntk.placeholder()
        p_rpn_bbox_targets = cntk.placeholder()
        p_rpn_bbox_inside_weights = cntk.placeholder()
        rpn_loss_bbox = SmoothL1Loss(cfg["CNTK"].SIGMA_RPN_L1, p_rpn_bbox_pred, p_rpn_bbox_targets, p_rpn_bbox_inside_weights, 1.0)
        # The bbox loss is normalized by the rpn batch size
        bbox_normalization_factor = 1.0 / cfg["TRAIN"].RPN_BATCHSIZE
        normalized_rpn_bbox_loss = reduce_sum(rpn_loss_bbox) * bbox_normalization_factor

        # Bind the placeholders to the predictions, targets and weights.
        reduced_rpn_loss_bbox = cntk.as_block(normalized_rpn_bbox_loss,
                                          [(p_rpn_bbox_pred, rpn_bbox_pred), (p_rpn_bbox_targets, rpn_bbox_targets),
                                           (p_rpn_bbox_inside_weights, rpn_bbox_inside_weights)],
                                          'SmoothL1Loss', 'norm_rpn_bbox_loss')

        # Total RPN loss: normalized classification + normalized regression.
        rpn_losses = plus(reduced_rpn_loss_cls, reduced_rpn_loss_bbox, name="rpn_losses")

    return rpn_rois, rpn_losses
Пример #41
0
 def func(x):
     """Splice a forward and a backward LSTM recurrence over ``x``."""
     forward = C.layers.Recurrence(C.layers.LSTM(hidden_dim))(x)
     backward = C.layers.Recurrence(C.layers.LSTM(hidden_dim),
                                    go_backwards=True)(x)
     return C.splice(forward, backward, name=name)
Пример #42
0
        def lstm_w_attention(h, c, x):
            """Run one LSTM step on ``x`` spliced with the attended value."""
            # The alias works around a bug that occurs when two arguments
            # of a block function are the same variable.
            attended = mha(h, encoded, C.alias(encoded))
            return lstm(h, c, C.splice(attended, x))
Пример #43
0
# Write the train and test sets as text files under data_dir.
print('Writing train text file...')
savetxt(os.path.join(data_dir, "Train-28x28_cntk_text.txt"), train)

print('Writing test text file...')
savetxt(os.path.join(data_dir, "Test-28x28_cntk_text.txt"), test)

print('Done')

# Network inputs. NOTE(review): `input` shadows the Python builtin.
input = C.input_variable(input_dim)
label = C.input_variable(num_output_classes)
# Three feature views of the input scaled by 1/255: the scaled value, its
# square and its square root.
normalize_input = input / 255.0
squared_input = C.square(input / 255.0)
sqrt_input = C.sqrt(input / 255.0)

# The model is fed the splice of all three views.
z = create_model(C.splice(normalize_input, squared_input, sqrt_input))

loss = C.cross_entropy_with_softmax(z, label)

label_error = C.classification_error(z, label)

lr_schedule = C.learning_parameter_schedule(learning_rate)

# Plain SGD over the model parameters.
learner = C.sgd(z.parameters, lr_schedule)

trainer = C.Trainer(z, (loss, label_error), [learner])

data_found = False

for data_dir in [
        os.path.join("..", "Examples", "Image", "DataSets", "MNIST"),
Пример #44
0
    def attention_layer(self, context, query, layer):
        """Build the attention block between ``context`` and ``query``.

        The graph is built on two placeholders of size ``2 * hidden_dim``
        which are bound to ``context`` and ``query`` by ``C.as_block``.

        Args:
            context: bound to the passage placeholder.
            query: bound to the query placeholder.
            layer: prefix for the name of the recurrent stack.

        Returns:
            A block named 'attention_layer'.
        """
        dim2 = 2 * self.hidden_dim
        dim8 = 8 * self.hidden_dim

        q_ph = C.placeholder(shape=(dim2, ))
        p_ph = C.placeholder(shape=(dim2, ))

        # Unpack the query sequence into a padded tensor and its mask.
        q_dense, q_mask = C.sequence.unpack(q_ph, padding_value=0).outputs

        # Parameters are created in the same order as before because their
        # initialisers are unseeded.
        w_query = C.parameter(shape=(dim2, dim2), init=C.glorot_uniform())
        w_passage = C.parameter(shape=(dim2, dim2), init=C.glorot_uniform())
        w_gate = C.parameter(shape=(dim8, dim8), init=C.glorot_uniform())
        score_vec = C.parameter(shape=(dim2, 1), init=C.glorot_uniform())

        # Projected passage, reshaped to (-1, 2 * hidden_dim).
        passage_proj = C.reshape(C.times(p_ph, w_passage), (-1, dim2))

        # Projected query, reshaped to (-1, 2 * hidden_dim).
        query_proj = C.reshape(C.times(q_dense, w_query), (-1, dim2))

        # Score of every query position for each passage step.
        combined = C.sequence.broadcast_as(query_proj, p_ph) + passage_proj
        scores = C.reshape(C.times(C.tanh(combined), score_vec), (-1))

        # Padded query positions get a large negative score so the softmax
        # gives them (almost) no weight.
        mask_per_step = C.sequence.broadcast_as(q_mask, p_ph)
        scores = C.element_select(mask_per_step, scores, C.constant(-1e+30))
        weights = C.softmax(scores, axis=0)

        # Attention-weighted sum of the query for each passage step.
        q_swapped = C.swapaxes(q_dense)
        weighted = weights * C.sequence.broadcast_as(q_swapped, weights)
        attended = C.reshape(C.reduce_sum(weighted, axis=1), (-1))

        # Four spliced parts; presumably 8 * hidden_dim wide, matching the
        # gate matrix - TODO confirm.
        features = C.splice(p_ph, attended, p_ph * attended,
                            attended * attended)

        # tanh gate applied element-wise to the spliced features.
        gate = C.tanh(C.times(features, w_gate))
        gated = gate * features

        # Dropout followed by a bidirectional recurrent stack.
        encoder = C.layers.Sequential([
            C.layers.Dropout(self.dropout),
            OptimizedRnnStack(self.hidden_dim,
                              bidirectional=True,
                              use_cudnn=self.use_cudnn,
                              name=layer + '_attention_rnn')
        ])
        encoded = encoder(gated)

        bindings = [(p_ph, context), (q_ph, query)]
        return C.as_block(encoded, bindings,
                          'attention_layer', 'attention_layer')
Пример #45
0
 def BiRnn(e):
     """Splice the outputs of ``posRnn`` and ``negRnn`` applied to ``e``.

     ``posRnn``, ``negRnn`` and ``name`` come from the enclosing scope.
     """
     return C.splice(posRnn(e), negRnn(e), name=name)
Пример #46
0
 def createReadOutNetwork(self, decoderHidden, preTrgEmb):
     """Return the pre-softmax read-out: splice both inputs on the last
     axis, multiply by ``self.Wt`` and add ``self.Wtb``."""
     joined = C.splice(decoderHidden, preTrgEmb, axis=-1)
     return C.times(joined, self.Wt) + self.Wtb
Пример #47
0
    def build_model(self):
        """Assemble the model graph, its loss and the input variables.

        Creates the input variables on two unique dynamic axes ('c' and 'q')
        plus the default batch axis, wires them through the input,
        gated-attention, reasoning and output layers, and stores the results
        on ``self._model``, ``self._loss`` and ``self._input_phs``.
        Intermediate nodes are recorded in ``self.info`` under the keys
        'query', 'doc', 'attn1' and 'attn2'.

        Returns:
            tuple: ``(model, loss, input_phs)`` where ``model`` combines the
            start and end logits, ``loss`` is the all-spans loss and
            ``input_phs`` maps input names to their variables.
        """
        c = C.Axis.new_unique_dynamic_axis('c')
        q = C.Axis.new_unique_dynamic_axis('q')
        b = C.Axis.default_batch_axis()
        cgw = C.input_variable(self.wg_dim,
                               dynamic_axes=[b, c],
                               is_sparse=self.use_sparse,
                               name='cgw')
        cnw = C.input_variable(self.wn_dim,
                               dynamic_axes=[b, c],
                               is_sparse=self.use_sparse,
                               name='cnw')
        qgw = C.input_variable(self.wg_dim,
                               dynamic_axes=[b, q],
                               is_sparse=self.use_sparse,
                               name='qgw')
        qnw = C.input_variable(self.wn_dim,
                               dynamic_axes=[b, q],
                               is_sparse=self.use_sparse,
                               name='qnw')
        cc = C.input_variable((1, self.word_size),
                              dynamic_axes=[b, c],
                              name='cc')
        qc = C.input_variable((1, self.word_size),
                              dynamic_axes=[b, q],
                              name='qc')
        ab = C.input_variable(self.a_dim, dynamic_axes=[b, c], name='ab')
        ae = C.input_variable(self.a_dim, dynamic_axes=[b, c], name='ae')
        input_phs = {
            'cgw': cgw,
            'cnw': cnw,
            'qgw': qgw,
            'qnw': qnw,
            'cc': cc,
            'qc': qc,
            'ab': ab,
            'ae': ae
        }
        self._input_phs = input_phs
        # Fixed: was `seif.info` (NameError) and `gnw` (undefined name).
        self.info['query'] = C.splice(qgw, qnw)
        self.info['doc'] = C.splice(cgw, cnw)
        # graph
        pu, qu = self.input_layer(cgw, cnw, cc, qgw, qnw, qc).outputs
        # Fixed: keyword was `attn_kind` here but `att_kind` in the second
        # call to the same method below; both now use `att_kind`.
        gate_pu, wei1 = self.gate_attention_layer(
            pu,
            qu,
            common_len=2 * self.hidden_dim,
            att_kind=self.attn_configs[0])  # [#,c][4*hidden]
        self.info['attn1'] = wei1 * 1.0
        print('[RNet build]gate_pu:{}'.format(gate_pu))
        pv = self.reasoning_layer(gate_pu)  # [#,c][2*hidden]
        # Self attention: the passage representation attends over itself.
        gate_self, wei2 = self.gate_attention_layer(
            pv,
            pv,
            common_len=2 * self.hidden_dim,
            att_kind=self.attn_configs[1])  # [#,c][4*hidden]
        self.info['attn2'] = wei2 * 1.0
        ph = self.reasoning_layer(gate_self)  # [#,c][2*hidden]
        init_pu = self.weighted_sum(pu)

        start_logits, end_logits = self.output_layer(
            init_pu.outputs[0], ph, 2 * self.hidden_dim)  # [#, c][1]

        # loss
        # NOTE(review): the per-boundary losses are built but not used; only
        # the all-spans loss is returned. Kept in case seq_loss has effects.
        start_loss = seq_loss(start_logits, ab)
        end_loss = seq_loss(end_logits, ae)
        # paper_loss = start_loss + end_loss
        new_loss = all_spans_loss(start_logits, ab, end_logits, ae)
        self._model = C.combine([start_logits, end_logits])
        self._loss = new_loss
        return self._model, self._loss, self._input_phs
Пример #48
0
 def bigru_with_match(dh, x):
     """Recurrent step: splice ``x`` with the ``matching_model`` output,
     gate the result element-wise and feed it to ``att_gru``.

     ``matching_model``, ``att_input``, ``Wg`` and ``att_gru`` come from the
     enclosing scope.
     """
     matched = matching_model(att_input, dh)
     joined = C.splice(x, matched)
     gate = C.sigmoid(C.times(joined, Wg))
     return att_gru(dh, C.element_times(joined, gate))
Пример #49
0
    def build_model(self):
        """Assemble the ELMo-enhanced model graph, loss and input variables.

        Input variables are created on two unique dynamic axes ('c' and 'q')
        plus the default batch axis. The results are stored on
        ``self._model``, ``self._loss`` and ``self._input_phs``, and several
        intermediate nodes are recorded in ``self.info``.

        Returns:
            tuple: ``(model, loss, input_phs)``.
        """
        ctx_axis = C.Axis.new_unique_dynamic_axis('c')
        qry_axis = C.Axis.new_unique_dynamic_axis('q')
        batch_axis = C.Axis.default_batch_axis()

        # Input variables, created in the same order as the dictionary keys.
        inputs = {
            'cgw': C.input_variable(self.wg_dim,
                                    dynamic_axes=[batch_axis, ctx_axis],
                                    is_sparse=self.use_sparse,
                                    name='cgw'),
            'cnw': C.input_variable(self.wn_dim,
                                    dynamic_axes=[batch_axis, ctx_axis],
                                    is_sparse=self.use_sparse,
                                    name='cnw'),
            'qgw': C.input_variable(self.wg_dim,
                                    dynamic_axes=[batch_axis, qry_axis],
                                    is_sparse=self.use_sparse,
                                    name='qgw'),
            'qnw': C.input_variable(self.wn_dim,
                                    dynamic_axes=[batch_axis, qry_axis],
                                    is_sparse=self.use_sparse,
                                    name='qnw'),
            'cc': C.input_variable((1, self.word_size),
                                   dynamic_axes=[batch_axis, ctx_axis],
                                   name='cc'),
            'qc': C.input_variable((1, self.word_size),
                                   dynamic_axes=[batch_axis, qry_axis],
                                   name='qc'),
            'ab': C.input_variable(self.a_dim,
                                   dynamic_axes=[batch_axis, ctx_axis],
                                   name='ab'),
            'ae': C.input_variable(self.a_dim,
                                   dynamic_axes=[batch_axis, ctx_axis],
                                   name='ae'),
            'qf': C.input_variable(1,
                                   dynamic_axes=[batch_axis, qry_axis],
                                   is_sparse=False,
                                   name='query_feature'),
            'df': C.input_variable(3,
                                   dynamic_axes=[batch_axis, ctx_axis],
                                   is_sparse=False,
                                   name='doc_feature'),
        }
        self._input_phs = inputs

        doc_gw, doc_nw = inputs['cgw'], inputs['cnw']
        qry_gw, qry_nw = inputs['qgw'], inputs['qnw']
        doc_chars, qry_chars = inputs['cc'], inputs['qc']
        answer_begin, answer_end = inputs['ab'], inputs['ae']
        qry_feat, doc_feat = inputs['qf'], inputs['df']

        self.info['query'] = C.splice(qry_gw, qry_nw)
        self.info['doc'] = C.splice(doc_gw, doc_nw)

        # graph
        elmo_encoder = self.__elmo_fac.build()

        # input layer: flatten the character inputs before the ELMo encoder
        flat_doc_chars = C.reshape(doc_chars, (-1, ))
        flat_qry_chars = C.reshape(qry_chars, (-1, ))
        doc_elmo = elmo_encoder(flat_doc_chars)
        qry_elmo = elmo_encoder(flat_qry_chars)
        pu, qu = self.input_layer(doc_gw, doc_nw, qry_gw, qry_nw).outputs
        enhance_pu = C.splice(pu, doc_elmo, doc_feat)
        enhance_qu = C.splice(qu, qry_elmo, qry_feat)

        gate_pu, wei1 = self.gate_attention_layer(
            enhance_pu,
            enhance_qu,
            common_len=2 * self.hidden_dim + 1024,
            att_kind=self.attn_configs[0])
        self.info['attn1'] = 1.0 * wei1
        pv = self.reasoning_layer(gate_pu)

        # self attention
        gate_self, wei2 = self.gate_attention_layer(
            pv,
            pv,
            common_len=2 * self.hidden_dim,
            att_kind=self.attn_configs[1])
        self.info['attn2'] = 1.0 * wei2
        ph = self.reasoning_layer(gate_self)
        enhance_ph = C.splice(ph, doc_elmo, doc_feat)
        init_pu = self.weighted_sum(enhance_pu)

        logits = self.output_layer(init_pu.outputs[0], enhance_ph,
                                   2 * self.hidden_dim + 1027)
        start_logits, end_logits = logits
        self.info['start_logits'] = start_logits * 1.0
        self.info['end_logits'] = end_logits * 1.0

        # loss: the per-boundary losses are built but only the all-spans
        # loss is kept as the model loss.
        start_loss = seq_loss(start_logits, answer_begin)
        end_loss = seq_loss(end_logits, answer_end)
        span_loss = all_spans_loss(start_logits, answer_begin,
                                   end_logits, answer_end)
        self._model = C.combine([start_logits, end_logits])
        self._loss = span_loss
        return self._model, self._loss, self._input_phs
def _from_optimized_rnnstack(cudnn_rnn):
    '''
    Rebuild a cuDNN optimized_rnnstack as a stack of ordinary recurrent
    layers and copy its weights and biases into the new cells.

    Args:
        cudnn_rnn: the optimized_rnnstack function that contains the parameters to be converted
    Returns:
        converted rnn function on GEMM based implementation that can be used on CPU
    Raises:
        ValueError: if the root op is not 'OptimizedRNNStack', the recurrent
            op is not one of 'lstm', 'rnnReLU', 'rnnTanh', or a parameter or
            input shape still has an inferred (negative) dimension.
    '''
    root = cudnn_rnn.root_function
    if root.op_name != 'OptimizedRNNStack':
        raise ValueError('unexpected cudnn_rnn.root_function.op_name value "%s"'%root.op_name)

    cudnn_param = cudnn_rnn.parameters[0]
    rnn_name = cudnn_rnn.name
    input_var = cudnn_rnn.inputs[0]

    attrs = root.attributes
    hidden_size = attrs['hiddenSize']
    num_layers = attrs['numLayers']
    bidirectional = attrs['bidirectional']
    recurrent_op = attrs['recurrentOp']

    # cudnn GRU differs from the standard GRU, so it is not converted here.
    if recurrent_op not in ['lstm', 'rnnReLU', 'rnnTanh']:
        raise ValueError('unsupported recurrent_op value "%s"'%recurrent_op)

    def _any_inferred(shape):
        return any(dim < 0 for dim in shape)

    if _any_inferred(cudnn_param.shape) or _any_inferred(input_var.shape):
        raise ValueError('parameter not initialized yet')

    input_size = input_var.shape[0] if len(input_var.shape) else 1

    is_lstm = recurrent_op == 'lstm'
    num_gates = 4 if is_lstm else 1
    if not is_lstm:
        activation = C.relu if recurrent_op == 'rnnReLU' else C.tanh

    def _make_cell(cell_name):
        # One recurrent cell of the kind selected by recurrent_op.
        if is_lstm:
            return C.layers.LSTM(hidden_size, name=cell_name)
        return C.layers.RNNUnit(hidden_size, activation=activation, name=cell_name)

    def _make_layer(x, idx):
        # One (uni- or bidirectional) recurrent layer applied to x.
        if bidirectional:
            forward = C.layers.Recurrence(_make_cell(rnn_name+'_fw'+idx))(x)
            backward = C.layers.Recurrence(_make_cell(rnn_name+'_bw'+idx), go_backwards=True)(x)
            return C.splice(forward, backward)
        return C.layers.Recurrence(_make_cell(rnn_name+"_"+idx))(x)

    noncudnn_func = _make_layer(input_var, '0')

    param = cudnn_param.value.reshape(-1)
    multiplier = 2 if bidirectional else 1
    # Name infixes of the cells in one layer, in the order their parameters
    # appear in the flat buffer.
    direction_tags = ('_fw', '_bw') if bidirectional else ('_',)

    def _adjust_gate_order(W):
        # For LSTM the four gate blocks (i, f, m, o) are reordered to (i, m, f, o).
        if not is_lstm:
            return W
        rank = len(W.shape)
        if rank == 2:
            i, f, m, o = np.hsplit(W, 4)
            return np.concatenate((i, m, f, o), axis=1)
        if rank == 1:
            i, f, m, o = np.split(W, 4)
            return np.concatenate((i, m, f, o))
        raise ValueError('LSTM parameter must have 1 or 2 dimensions')

    def _get_cudnn_rnn_weight_splitter(in_dim, h_dim):
        # for unidirectional, W, H
        # for bidirectional, fw_W, fw_H, bw_W, bw_H
        sizes = [in_dim*h_dim*num_gates, h_dim*h_dim*num_gates] * multiplier
        return np.cumsum(sizes[0:-1])

    def _get_cudnn_rnn_bias_splitter(h_dim):
        # for unidirectional, b1, b2
        # for bidirectional, fw_b1, fw_b2, bw_b1, bw_b2
        sizes = [h_dim*num_gates, h_dim*num_gates] * multiplier
        return np.cumsum(sizes[0:-1])

    def _as_weight(flat):
        # Flat chunk -> (rows, num_gates*hidden) contiguous matrix in the
        # adjusted gate order.
        matrix = flat.reshape(num_gates*hidden_size, -1).transpose()
        return np.ascontiguousarray(_adjust_gate_order(matrix))

    def _find_cells(layer_name):
        return [noncudnn_func.find_by_name(rnn_name+tag+layer_name, -1)
                for tag in direction_tags]

    # Weights of all layers come first in the flat buffer.
    offset = 0
    layer_input_size = input_size
    for layer in range(num_layers):
        layer_size = (layer_input_size + hidden_size) * hidden_size * num_gates * multiplier
        layer_param = param[offset:offset+layer_size]
        chunks = np.split(layer_param, _get_cudnn_rnn_weight_splitter(layer_input_size, hidden_size))
        cells = _find_cells(str(layer))
        for k, cell in enumerate(cells):
            cell.W.value = _as_weight(chunks[2*k])
            cell.H.value = _as_weight(chunks[2*k + 1])

        offset += layer_size
        layer_input_size = hidden_size * multiplier

        # Stack the next layer on top, unless this was the last one.
        if layer != num_layers - 1:
            noncudnn_func = _make_layer(noncudnn_func.output, str(layer+1))

    # Biases follow the weights; each cell has two bias vectors that are summed.
    for layer in range(num_layers):
        layer_size = 2 * hidden_size * num_gates * multiplier
        layer_param = param[offset:offset+layer_size]
        pieces = np.split(layer_param, _get_cudnn_rnn_bias_splitter(hidden_size))
        cells = _find_cells(str(layer))
        for k, cell in enumerate(cells):
            cell.b.value = _adjust_gate_order(pieces[2*k] + pieces[2*k + 1]).reshape(-1)
        offset += layer_size

    return noncudnn_func
Пример #51
0
def create_rpn(conv_out,
               scaled_gt_boxes,
               im_info,
               cfg,
               add_loss_functions=True):
    '''
    Creates a region proposal network for object detection as proposed in the "Faster R-CNN" paper:
        Shaoqing Ren and Kaiming He and Ross Girshick and Jian Sun:
        "Faster R-CNN: Towards Real-Time Object Detection with Region Proposal Networks"

    Outputs object detection proposals by applying estimated bounding-box
    transformations to a set of regular boxes (called "anchors").

    Args:
        conv_out:        The convolutional feature map, i.e. the output of the conv layers from the pretrained classification network
        scaled_gt_boxes: The ground truth boxes as (x1, y1, x2, y2, label). Coordinates are absolute pixels wrt. the input image.
        im_info:         A CNTK variable or constant containing
                         (pad_width, pad_height, scaled_image_width, scaled_image_height, orig_img_width, orig_img_height)
                         e.g. (1000, 1000, 1000, 600, 500, 300) for an original image of 600x300 that is scaled and padded to 1000x1000
        cfg:             The configuration dictionary
        add_loss_functions: If set to True rpn_losses will be returned, otherwise None is returned for the losses

    Returns:
        rpn_rois - the proposed ROIs
        rpn_losses - the losses (SmoothL1 loss for bbox regression plus cross entropy for objectness)
    '''

    # RPN network: a shared 3x3 convolution followed by two 1x1 heads.
    # All three use a normal(0.01) initializer and a zero initial bias.
    num_channels = cfg["MODEL"].RPN_NUM_CHANNELS
    shared_conv = Convolution((3, 3),
                              num_channels,
                              init=normal(scale=0.01),
                              init_bias=0.0,
                              activation=relu,
                              pad=True,
                              strides=1)
    rpn_conv_3x3 = shared_conv(conv_out)

    # 2(bg/fg)  * 9(anchors)
    cls_head = Convolution((1, 1),
                           18,
                           init=normal(scale=0.01),
                           init_bias=0.0,
                           activation=None,
                           name="rpn_cls_score")
    rpn_cls_score = cls_head(rpn_conv_3x3)

    # 4(coords) * 9(anchors)
    bbox_head = Convolution((1, 1),
                            36,
                            init=normal(scale=0.01),
                            init_bias=0.0,
                            activation=None,
                            name="rpn_bbox_pred")
    rpn_bbox_pred = bbox_head(rpn_conv_3x3)

    # Softmax over the (bg, fg) pair: reshape the scores so that pair is
    # axis 0, apply softmax there, then restore the original score shape.
    score_shape = rpn_cls_score.shape
    num_predictions = int(score_shape[0] / 2)
    rpn_cls_score_rshp = reshape(
        rpn_cls_score,
        (2, num_predictions, score_shape[1], score_shape[2]),
        name="rpn_cls_score_rshp")
    p_rpn_cls_score_rshp = cntk.placeholder()
    rpn_cls_sm = softmax(p_rpn_cls_score_rshp, axis=0)
    rpn_cls_prob = cntk.as_block(rpn_cls_sm,
                                 [(p_rpn_cls_score_rshp, rpn_cls_score_rshp)],
                                 'Softmax', 'rpn_cls_prob')
    rpn_cls_prob_reshape = reshape(rpn_cls_prob,
                                   rpn_cls_score.shape,
                                   name="rpn_cls_prob_reshape")

    # proposal layer
    rpn_rois = create_proposal_layer(rpn_cls_prob_reshape, rpn_bbox_pred,
                                     im_info, cfg)

    if not add_loss_functions:
        return rpn_rois, None

    # RPN targets
    feat_stride = cfg["MODEL"].FEATURE_STRIDE
    scale_lines = "\n - ".join(
        str(v) for v in cfg["DATA"].PROPOSAL_LAYER_SCALES)
    proposal_layer_params = "'feat_stride': {}\n'scales':\n - {}".format(
        feat_stride, scale_lines)
    # rpn_cls_score is only passed to get width and height of the conv
    # feature map.
    anchor_targets = AnchorTargetLayer(
        rpn_cls_score,
        scaled_gt_boxes,
        im_info,
        rpn_batch_size=cfg["TRAIN"].RPN_BATCHSIZE,
        rpn_fg_fraction=cfg["TRAIN"].RPN_FG_FRACTION,
        clobber_positives=cfg["TRAIN"].RPN_CLOBBER_POSITIVES,
        positive_overlap=cfg["TRAIN"].RPN_POSITIVE_OVERLAP,
        negative_overlap=cfg["TRAIN"].RPN_NEGATIVE_OVERLAP,
        param_str=proposal_layer_params)
    atl = user_function(anchor_targets)
    rpn_labels = atl.outputs[0]
    rpn_bbox_targets = atl.outputs[1]
    rpn_bbox_inside_weights = atl.outputs[2]

    # classification loss
    p_rpn_labels = cntk.placeholder()
    p_rpn_cls_score_rshp = cntk.placeholder()

    # Labels below zero are ignored: they are masked out of both the
    # targets and the loss.
    keeps = cntk.greater_equal(p_rpn_labels, 0.0)
    fg_labels = element_times(p_rpn_labels, keeps, name="fg_targets")
    bg_labels = minus(1, fg_labels, name="bg_targets")
    rpn_labels_ignore = splice(bg_labels, fg_labels, axis=0)
    rpn_ce = cross_entropy_with_softmax(p_rpn_cls_score_rshp,
                                        rpn_labels_ignore,
                                        axis=0)
    rpn_loss_cls = element_times(rpn_ce, keeps)

    # The terms that are accounted for in the cls loss are those that have a label >= 0
    cls_num_terms = reduce_sum(keeps)
    cls_normalization_factor = 1.0 / cls_num_terms
    normalized_rpn_cls_loss = (reduce_sum(rpn_loss_cls)
                               * cls_normalization_factor)

    cls_bindings = [(p_rpn_labels, rpn_labels),
                    (p_rpn_cls_score_rshp, rpn_cls_score_rshp)]
    reduced_rpn_loss_cls = cntk.as_block(normalized_rpn_cls_loss,
                                         cls_bindings,
                                         'CE_with_ignore',
                                         'norm_rpn_cls_loss')

    # regression loss
    p_rpn_bbox_pred = cntk.placeholder()
    p_rpn_bbox_targets = cntk.placeholder()
    p_rpn_bbox_inside_weights = cntk.placeholder()
    rpn_loss_bbox = SmoothL1Loss(cfg.SIGMA_RPN_L1, p_rpn_bbox_pred,
                                 p_rpn_bbox_targets,
                                 p_rpn_bbox_inside_weights, 1.0)
    # The bbox loss is normalized by the rpn batch size
    bbox_normalization_factor = 1.0 / cfg["TRAIN"].RPN_BATCHSIZE
    normalized_rpn_bbox_loss = (reduce_sum(rpn_loss_bbox)
                                * bbox_normalization_factor)

    bbox_bindings = [(p_rpn_bbox_pred, rpn_bbox_pred),
                     (p_rpn_bbox_targets, rpn_bbox_targets),
                     (p_rpn_bbox_inside_weights, rpn_bbox_inside_weights)]
    reduced_rpn_loss_bbox = cntk.as_block(normalized_rpn_bbox_loss,
                                          bbox_bindings,
                                          'SmoothL1Loss',
                                          'norm_rpn_bbox_loss')

    rpn_losses = plus(reduced_rpn_loss_cls,
                      reduced_rpn_loss_bbox,
                      name="rpn_losses")

    return rpn_rois, rpn_losses
Пример #52
0
def create_model_ext(input, ext_values, out_dims):
    """Build a VGG-style convolutional network with extra side inputs.

    The convolution/pooling stack is followed by a dense layer with dropout;
    ``ext_values`` is then spliced onto that result along axis 0 before the
    remaining dense layers.
    See https://www.cs.toronto.edu/~frossard/post/vgg16/

    Args:
        input: node the first convolution is applied to.
        ext_values: node spliced in after the first dense/dropout pair.
        out_dims: output size of the final dense layer.

    Returns:
        The output of the final dense layer (no activation).
    """
    # Feature extractor, in application order. Convolutions are listed as
    # ("conv", kernel, filters) and poolings as ("pool", strides).
    feature_stages = (
        ("conv", (3, 3), 16),
        ("conv", (5, 5), 32),
        ("pool", (2, 2)),
        ("conv", (3, 3), 32),
        ("conv", (7, 7), 64),
        ("pool", (1, 1)),
        ("conv", (3, 3), 64),
        ("conv", (7, 7), 96),
        ("pool", (1, 1)),
        ("conv", (3, 3), 96),
        ("pool", (1, 1)),
    )

    net = input
    for stage in feature_stages:
        if stage[0] == "conv":
            _, kernel, filters = stage
            layer = Convolution(kernel,
                                filters,
                                init=glorot_uniform(),
                                activation=relu,
                                pad=True,
                                strides=(1, 1))
        else:
            layer = MaxPooling((2, 2), strides=stage[1])
        net = layer(net)

    # First dense layer, with dropout.
    net = Dense(512, init=glorot_uniform())(net)
    net = Dropout(0.5)(net)

    # Append the extra values to the dense features.
    net = splice(net, ext_values, axis=0)

    net = Dense(256, init=glorot_uniform())(net)
    net = Dense(128, init=glorot_uniform())(net)
    net = Dropout(0.5)(net)

    head = Dense(out_dims, init=glorot_uniform(), activation=None)
    return head(net)
Пример #53
0
 def gru_with_attentioin(dh, x):
     """Recurrent step: splice ``x`` with the ``attention_model`` output,
     gate the result element-wise and feed it to ``att_gru``.

     ``attention_model``, ``att_input``, ``Wg`` and ``att_gru`` come from the
     enclosing scope.
     """
     attended = attention_model(att_input, x)
     joined = C.splice(x, attended)
     gate = C.sigmoid(C.times(joined, Wg))
     return att_gru(dh, C.element_times(joined, gate))