def test_stop_gradient():
    x = C.sequence.input_variable(shape=(2, ),
                                  sequence_axis=C.Axis("B"),
                                  needs_gradient=True)
    y = C.sequence.input_variable(shape=(2, ),
                                  sequence_axis=C.Axis("B"),
                                  needs_gradient=True)
    z = C.element_times(x, y)
    w = z + C.stop_gradient(z)
    a = np.reshape(np.float32([0.25, 0.5, 0.1, 1]), (1, 2, 2))
    b = np.reshape(np.float32([-1.25, 1.5, 0.1, -1]), (1, 2, 2))
    bwd, fwd = w.forward({x: a, y: b}, [w.output], set([w.output]))
    value = list(fwd.values())[0]
    expected = np.multiply(a, b) * 2
    assert np.allclose(value, expected)
    grad = w.backward(bwd, {w.output: np.ones_like(value)}, set([x, y]))
    assert np.allclose(grad[x], b)
    assert np.allclose(grad[y], a)

    # test stop_gradient applied directly to a function: its arguments should receive no gradients (zeros are read back)
    w = C.stop_gradient(z)
    bwd, fwd = w.forward({x: a, y: b}, [w.output], set([w.output]))
    value = list(fwd.values())[0]
    expected = np.multiply(a, b)
    assert np.allclose(value, expected)
    grad = w.backward(bwd, {w.output: np.ones_like(value)}, set([x, y]))
    # there should be no gradients flowing back to x and y
    assert np.allclose(grad[x], np.zeros_like(b))
    assert np.allclose(grad[y], np.zeros_like(a))
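A quick plain-NumPy restatement of the gradients asserted in the first block above (a sketch of the arithmetic only, not of CNTK internals): with w = x*y + stop_gradient(x*y), both branches contribute to the forward value, but only the first contributes to the backward pass, so dw/dx = y and dw/dy = x.

import numpy as np
a = np.reshape(np.float32([0.25, 0.5, 0.1, 1]), (1, 2, 2))
b = np.reshape(np.float32([-1.25, 1.5, 0.1, -1]), (1, 2, 2))
forward_value = 2 * a * b        # both branches count in the forward pass
grad_wrt_x, grad_wrt_y = b, a    # the stopped branch adds nothing to the gradient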
def test_cos_distane_backward():
    x = C.sequence.input_variable(shape=(2, ),
                                  sequence_axis=C.Axis("B"),
                                  needs_gradient=True)
    y = C.sequence.input_variable(shape=(2, ),
                                  sequence_axis=C.Axis("B"),
                                  needs_gradient=True)
    z = C.cosine_distance(x, y)
    a = np.reshape(np.float32([0.25, 0.5, 0.1, 1]), (1, 2, 2))
    b = np.reshape(np.float32([-0.5, 1.5, -0.3, -1]), (1, 2, 2))
    bwd, fwd = z.forward({x: a, y: b}, [z.output], set([z.output]))
    value = list(fwd.values())[0]
    expected = [[0.707107, -0.981665]]
    assert np.allclose(value, expected)
    grad = z.backward(bwd, {z.output: np.ones_like(value)}, set([x, y]))
    x_driv_expected = np.ndarray(
        (1, 2, 2),
        dtype=np.float32,
        buffer=np.float32([-1.131371, 0.565686, -0.188727, 0.018873]))
    y_driv_expected = np.ndarray(
        (1, 2, 2),
        dtype=np.float32,
        buffer=np.float32([0.424264, 0.141421, -0.174876, 0.052463]))
    assert (np.all(np.absolute(grad[x] - x_driv_expected) < 1e-6))
    assert (np.all(np.absolute(grad[y] - y_driv_expected) < 1e-6))
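The expected forward values above can be reproduced with plain NumPy (a sketch of the cosine-similarity arithmetic, independent of the CNTK op):

import numpy as np
a = np.float32([[0.25, 0.5], [0.1, 1]])
b = np.float32([[-0.5, 1.5], [-0.3, -1]])
cos = np.sum(a * b, axis=1) / (np.linalg.norm(a, axis=1) * np.linalg.norm(b, axis=1))
print(cos)   # roughly [0.707107, -0.981665], matching the expected list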
def test_cosine_distance_with_negative_samples_with_reduced_sequence():
    a = C.sequence.input_variable((3, ), sequence_axis=C.Axis("a"))
    b = C.sequence.input_variable((3, ), sequence_axis=C.Axis("b"))
    cd = C.cosine_distance_with_negative_samples(C.sequence.first(a),
                                                 C.sequence.first(b), 1, 2)
    data = np.random.random((4, 3)).astype(np.float32)
    cd.eval({a: data, b: data})
Example #4
def multi_headed_self_attention_layer(in_dims: int,
                                      hidden_dims: int,
                                      num_of_head: int,
                                      name='multi_headed_self_attention',
                                      as_block: bool = False,
                                      k_ph: bool = False,
                                      v_ph: bool = False,
                                      mask_opt: bool = False) -> C.Function:
    X = C.placeholder(
        in_dims, (C.Axis.default_batch_axis(), C.Axis.default_dynamic_axis()),
        name=name + '_ph')

    outputs = []

    if k_ph is False and v_ph is False:
        for i in range(num_of_head):
            layer = self_attention_layer(in_dims,
                                         hidden_dims,
                                         name=name + str(i),
                                         as_block=not as_block,
                                         mask_opt=mask_opt)
            outputs.append(layer(X))
    elif k_ph is True and v_ph is True:
        k_ = C.placeholder(in_dims,
                           (C.Axis.default_batch_axis(), C.Axis('kv_seq')),
                           name=name + '_k_ph')  # -3: sequence axis
        v_ = C.placeholder(in_dims,
                           (C.Axis.default_batch_axis(), C.Axis('kv_seq')),
                           name=name + '_v_ph')
        for i in range(num_of_head):
            layer = self_attention_layer(in_dims,
                                         in_dims,
                                         name=name + str(i),
                                         as_block=not as_block,
                                         k_ph=k_ph,
                                         v_ph=v_ph)
            outputs.append(layer(X, k_, v_))
    else:
        raise Exception(f'k_ph:{k_ph}, v_ph:{v_ph}')

    concat = C.splice(*outputs, name='concat')

    result = C.layers.Dense(in_dims, name='W_o')(concat)

    # init = C.initializer.normal(1)
    # W_O = C.parameter((in_dims, hidden_dims*num_of_head), init=init, name=name+'_Wo')
    # result = C.times_transpose(concat, W_O, name='result')

    if as_block is True:
        if k_ph is False and v_ph is False:
            result = C.as_block(result, [(X, X)], 'multi_headed_self_attention',
                                'multi_headed_self_attention_')
        elif k_ph is True and v_ph is True:
            result = C.as_block(result, [(X, X), (k_, k_), (v_, v_)],
                                'multi_headed_self_attention',
                                'multi_headed_self_attention_')
        else:
            raise Exception(f'k_ph:{k_ph} v_ph:{v_ph}')

    return result
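A minimal usage sketch for the layer above, mirroring the __main__ block further down in this listing (it assumes the companion self_attention_layer shown later is available; the variable names are illustrative only):

X_in = C.sequence.input_variable(4, sequence_axis=C.Axis('encoder_seq'))
mhsa = multi_headed_self_attention_layer(in_dims=4, hidden_dims=3, num_of_head=2)
out = mhsa(X_in)   # binds the placeholder X to X_in; the output keeps X_in's dynamic axes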
def test_axis_str():
    i = C.sequence.input_variable((1, 3))
    assert str(C.Axis.all_axes()) == "Axis('AllAxes')"
    assert str(C.Axis.all_static_axes()) == "Axis('AllStaticAxes')"
    assert str(C.Axis.unknown_dynamic_axes()) == "(Axis('UnknownAxes'),)"
    assert str(C.Axis(1)) == "Axis('staticAxisIdx=1')"
    assert str(C.Axis(-1)) == "Axis('staticAxisIdx=-1')"
    assert str(i.dynamic_axes) == "(Axis('defaultBatchAxis'), Axis('defaultDynamicAxis'))"
def test_clone_with_different_dynamic_axes():
    q_axis = C.Axis('q')
    a_axis = C.Axis('a')
    question_input = C.sequence.input(shape=10, is_sparse=True, sequence_axis=q_axis)
    answer_input = C.sequence.input(shape=10, is_sparse=True, sequence_axis=a_axis)

    rnn = C.layers.Recurrence(C.layers.LSTM(5))(question_input)
    rnn_cloned = rnn.clone(C.CloneMethod.share, {question_input:answer_input})
Example #7
def test_rank0_output():
  x = C.sequence.input_variable(shape=(768,), sequence_axis=C.Axis("B"), needs_gradient=True)
  y = C.sequence.input_variable(shape=(768,), sequence_axis=C.Axis("B"), needs_gradient=True)
  z = C.cosine_distance(x, y)
  batch_num = 2
  batch_size = 30
  a = np.float32(np.random.rand(batch_num*batch_size,1500,768))
  b = np.float32(np.random.rand(batch_num*batch_size,1500,768))
  for i in range(batch_num):
    bwd, fwd = z.forward({x:a[i*batch_size:(i+1)*batch_size], y:b[i*batch_size:(i+1)*batch_size]}, [z.output], set([z.output]))
    grad = z.backward(bwd, {z.output:np.ones_like(fwd[z.output])}, set([x, y]))
Example #8
def test_GRU(tmpdir):
    def MakeGRUNameFromConfig(backward, initial_state, activition):
        model_name = 'GRU.' + activition.__name__
        if (initial_state != 0):
            model_name += '.initial'
        if (backward):        
            model_name += '.backward'
        else:    
            model_name += '.forward'
        return model_name 

    direction_options = [False, True]
    activation_options = [C.tanh]
    initial_state_options = [0]

    input_dim = 2
    cell_dim = 3
    batch_size = 1
    sequence_len = 5

    for config in list(product(direction_options, initial_state_options, activation_options)):
        model_filename = MakeGRUNameFromConfig(*config)
        print(model_filename)
        backward, initial_state, activation =  config
    
        x = C.input_variable(input_dim, dynamic_axes=[C.Axis.default_batch_axis(), C.Axis('sequenceAxis')]) 
        GRUModel = C.layers.Recurrence(C.layers.GRU(cell_dim,     
                                                    activation = activation),   
                                       initial_state = initial_state,    
                                       go_backwards=backward)(x)
        #CLG.plot(GRUModel, filename=cntk_pdf_filename)
        #plot_block_internals(GRUModel, 'GRU', model_filename)
        data = np.random.uniform(low=0.0, high=1.0, size=(batch_size, sequence_len, input_dim)).astype('f')
        verify_one_input(GRUModel, data, tmpdir, model_filename)
Example #9
def test_LSTM(tmpdir):
    pytest.skip('Need to support new ONNX spec.')

    def CreateLSTMModel(activation, peepholes, self_stabilization, cell_dim,
                        initial_state):
        return C.layers.Sequential([
            C.layers.Recurrence(C.layers.LSTM(
                cell_dim,
                use_peepholes=peepholes,
                activation=activation,
                enable_self_stabilization=self_stabilization),
                                initial_state=initial_state)
        ])

    def MakeLSTMNameFromConfig(use_peepholes, enable_self_stabilization,
                               initial_state, activition):
        model_name = 'LSTM.' + activition.__name__
        if (use_peepholes):
            model_name += '.peephole'
        if (enable_self_stabilization):
            model_name += '.stabilize'
        if (initial_state != 0):
            model_name += '.initial'
        return model_name

    # lstm attributes
    use_peepholes_options = [False]
    enable_self_stabilization_options = [False]
    activation_options = [C.tanh]

    #Recurrence attributes
    initial_state_options = [0, 0.23]

    input_dim = 2
    cell_dim = 3
    batch_size = 1
    sequence_len = 5

    for config in list(
            product(use_peepholes_options, enable_self_stabilization_options,
                    initial_state_options, activation_options)):
        model_filename = MakeLSTMNameFromConfig(*config)
        use_peepholes, enable_self_stabilization, initial_state, activation = config

        x = C.input_variable(
            input_dim,
            dynamic_axes=[C.Axis.default_batch_axis(),
                          C.Axis('sequenceAxis')])
        LSTMmodel = CreateLSTMModel(
            peepholes=use_peepholes,
            activation=activation,
            initial_state=initial_state,
            cell_dim=cell_dim,
            self_stabilization=enable_self_stabilization)(x)
        data = np.random.uniform(low=0.0,
                                 high=1.0,
                                 size=(batch_size, sequence_len,
                                       input_dim)).astype('f')
        verify_one_input(LSTMmodel, data, tmpdir, model_filename)
Example #10
def test_sequence_max():
    np.random.seed(0)
    a = np.float32(np.random.rand(20, 100, 8))
    src = C.sequence.input_variable(shape=(8), sequence_axis=C.Axis("Seq"))
    out = C.sequence.reduce_max(src)
    val = out.eval({src: a})
    expected = np.max(a, 1)
    assert np.allclose(val, expected)
Example #12
def test_sequence_softmax_with_large_numbers():
    np.random.seed(0)
    a = [500000 * np.ones(i, dtype=np.float32) for i in (7, 7, 7)]
    src = C.sequence.input_variable(shape=(1), sequence_axis=C.Axis("Seq"))
    out = C.sequence.softmax(src)
    val = out.eval({src: a})
    expected = [np_softmax(a_i, 0) for a_i in a]
    for val_i, expected_i in zip(val, expected):
        assert np.allclose(val_i, expected_i)
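The np_softmax reference helper is not shown in this listing; a numerically stable NumPy version (an assumption about its behaviour, sketched only to explain why the 500000-valued inputs above do not overflow) would subtract the per-axis maximum before exponentiating:

def np_softmax(x, axis):
    shifted = x - np.max(x, axis=axis, keepdims=True)   # shift so exp() never overflows
    e = np.exp(shifted)
    return e / np.sum(e, axis=axis, keepdims=True)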
Example #13
def test_cos_distane_backward2():
  x = C.sequence.input_variable(shape=(100,), sequence_axis=C.Axis("B"), needs_gradient=True)
  y = C.sequence.input_variable(shape=(100,), sequence_axis=C.Axis("B"), needs_gradient=True)
  z = C.cosine_distance(x, y)
  np.random.seed(0)
  a = np.float32(np.random.rand(10,50,100))
  b = np.float32(np.random.rand(10,50,100))
  bwd, fwd = z.forward({x:a, y:b}, [z.output], set([z.output]))
  value = list(fwd.values())[0]
  expected_cos = numpy_cos(a,b)
  expected = expected_cos.forward()
  assert np.allclose(value, expected)
  grad = z.backward(bwd, {z.output:np.ones_like(value)}, set([x, y]))
  bwd = expected_cos.backward()
  x_driv_expected = bwd['a']
  y_driv_expected = bwd['b']
  assert (np.all(np.absolute(grad[x]-x_driv_expected) < 1e-6))
  assert (np.all(np.absolute(grad[y]-y_driv_expected) < 1e-6))
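The numpy_cos helper used above is likewise not part of this listing; its backward pass presumably implements the standard cosine-similarity gradient, sketched here for a single pair of vectors:

def cosine_grads(a, b):
    # d cos(a, b) / da = b / (|a||b|) - cos(a, b) * a / |a|^2, and symmetrically for b
    na, nb = np.linalg.norm(a), np.linalg.norm(b)
    cos = np.dot(a, b) / (na * nb)
    return b / (na * nb) - cos * a / na**2, a / (na * nb) - cos * b / nb**2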
Example #14
def test_sequence_max_with_variable_lengths():
    np.random.seed(0)
    a = [-np.ones(i, dtype=np.float32) for i in (7, 11, 13)]
    src = C.sequence.input_variable(shape=(1), sequence_axis=C.Axis("Seq"))
    out = C.sequence.reduce_max(src)
    val = out.eval({src: a})
    expected = [np.max(a_i) for a_i in a]
    for val_i, expected_i in zip(val, expected):
        assert np.allclose(val_i, expected_i)
Example #15
def decoder(in_dims: int,
            sa_dims: int,
            head_dims: int,
            hidden_dims: int,
            kv_memory,
            name: str = 'decoder',
            as_block: bool = False) -> C.Function:
    X = C.placeholder(
        in_dims, (C.Axis.default_batch_axis(), C.Axis.default_dynamic_axis()),
        name=name + '_ph')
    k_memory = C.placeholder(in_dims,
                             (C.Axis.default_batch_axis(), C.Axis('kv_seq')),
                             name=name + '_k_memory')
    v_memory = C.placeholder(in_dims,
                             (C.Axis.default_batch_axis(), C.Axis('kv_seq')),
                             name=name + '_v_memory')
    # placeholders cannot be cloned, so k and v must remain two separate placeholders rather than being bundled into a single kv placeholder

    mhsa_layer = multi_headed_self_attention_layer(in_dims,
                                                   sa_dims,
                                                   head_dims,
                                                   mask_opt=True)
    eda_layer = multi_headed_self_attention_layer(in_dims,
                                                  sa_dims,
                                                  head_dims,
                                                  k_ph=True,
                                                  v_ph=True)
    ff_layer = feed_forward_layer(in_dims, hidden_dims)

    sa = layer_normalization(X + mhsa_layer(X))  # w/o mask
    eda = layer_normalization(sa + eda_layer(sa, k_memory, v_memory))
    ff = layer_normalization(eda + ff_layer(eda))

    result = ff
    if as_block is True:
        return C.as_block(result, [(X, X), (k_memory, k_memory),
                                   (v_memory, v_memory)], name)
    else:
        return result
def test_cosine_distance():
    a = np.reshape(np.arange(25.0, dtype=np.float32), (5, 5))
    b = np.reshape(np.arange(0, 5, dtype=np.float32), (1, 5))

    src = C.sequence.input_variable(shape=(5), sequence_axis=C.Axis("Seq"))
    tgt = C.input_variable(shape=(5))
    tgt_br = C.sequence.broadcast_as(tgt, src)
    cos_seq = C.cosine_distance(src, tgt_br)
    assert len(cos_seq.dynamic_axes) == 2
    assert cos_seq.dynamic_axes[1].name == "Seq"
    val = cos_seq.eval({src: [a], tgt: [b]})
    expected = [[1., 0.914659, 0.878459, 0.86155, 0.851852]]
    assert np.allclose(val, expected)
Example #17
    def __init__(self, config_file):
        data_config = importlib.import_module(config_file).data_config
        model_config = importlib.import_module(config_file).model_config

        self.word_count_threshold = data_config['word_count_threshold']
        self.char_count_threshold = data_config['char_count_threshold']
        self.word_size = data_config['word_size']
        self.abs_path = os.path.dirname(os.path.abspath(__file__))
        pickle_file = os.path.join(self.abs_path, data_config['pickle_file'])

        with open(pickle_file, 'rb') as vf:
            known, self.vocab, self.chars = pickle.load(vf)

        self.wg_dim = known
        self.wn_dim = len(self.vocab) - known
        self.c_dim = len(self.chars)
        self.a_dim = 1

        self.hidden_dim = model_config['hidden_dim']
        self.w2v_hidden_dim = model_config['w2v_hidden_dim']
        self.convs = model_config['char_convs']
        self.dropout = model_config['dropout']
        self.char_emb_dim = model_config['char_emb_dim']
        self.highway_layers = model_config['highway_layers']
        self.two_step = model_config['two_step']
        self.use_cudnn = model_config['use_cudnn']
        self.use_sparse = True
        
        # Source and target inputs to the model
        inputAxis = C.Axis('inputAxis')
        outputAxis = C.Axis('outputAxis')
        InputSequence = C.layers.SequenceOver[inputAxis]
        OutputSequence = C.layers.SequenceOver[outputAxis]

        print('dropout', self.dropout)
        print('use_cudnn', self.use_cudnn)
        print('use_sparse', self.use_sparse)
Example #18
def test_LSTM(tmpdir):
    for config in list(
            product(use_peepholes_options, enable_self_stabilization_options,
                    initial_state_options, activation_options)):
        model_filename = MakeLSTMNameFromConfig(*config)
        use_peepholes, enable_self_stabilization, initial_state, activation = config

        x = C.input_variable(
            input_dim,
            dynamic_axes=[C.Axis.default_batch_axis(),
                          C.Axis('sequenceAxis')])
        LSTMmodel = CreateLSTMModel(
            peepholes=use_peepholes,
            activation=activation,
            initial_state=initial_state,
            cell_dim=cell_dim,
            self_stabilization=enable_self_stabilization)(x)
        data = np.random.uniform(low=0.0,
                                 high=1.0,
                                 size=(batch_size, sequence_len,
                                       input_dim)).astype('f')
        verify_one_input(LSTMmodel, data, tmpdir, model_filename)
Example #19
def test_sequence_unpack_with_broadcast_as(device_id, precision):
    x = C.sequence.input_variable(5)
    a = C.sequence.input_variable(4, sequence_axis=C.Axis('a'))
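    # Presumably the intent here: sequence.unpack pads each sequence of x to the batch's
    # longest length and also yields a 0/1 mask of valid positions; broadcast_as then
    # carries that per-sequence value along a's independent sequence axis, which is why
    # columns 3 onward of the second expected array below are zero (x's second sequence
    # has length 3 but is padded to the batch maximum of 7).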
    y, mask = C.sequence.unpack(x, 0).outputs
    bvm = C.sequence.broadcast_as(0 * C.reduce_sum(y) + mask, a)

    x1 = [
        np.arange(7 * 5).reshape(7, 5).astype('f'),
        np.arange(3 * 5).reshape(3, 5).astype('f')
    ]
    a1 = [
        np.arange(3 * 4).reshape(3, 4).astype('f'),
        np.arange(6 * 4).reshape(6, 4).astype('f')
    ]

    expected = [
        np.ones((3, 7), dtype=np.float32),
        np.ones((6, 7), dtype=np.float32)
    ]
    expected[1][:, 3:] = 0

    actual = bvm.eval({x: x1, a: a1})
    for actual_i, expected_i in zip(actual, expected):
        assert np.allclose(actual_i, expected_i)
Example #20
def test_lstm_over_lstm_thought_vectors_2(device_id):
    dev = cntk_device(device_id)
    input_vocab_size = 3
    emb_dim = 2
    hidden_dim = 2
    num_labels = 2
    utterances_input = C.sequence.input_variable((input_vocab_size),
                                                 is_sparse=True,
                                                 name='utterances')
    conversation_lengths_input = C.input_variable(
        (), name='conversation_sequence_lengths')
    label_input = C.sequence.input_variable(
        num_labels,
        is_sparse=True,
        sequence_axis=C.Axis('label_sequence'),
        name='labels')
    with C.default_options(initial_state=0.1):
        model = C.layers.Embedding(emb_dim, name='embed')(utterances_input)
        model = C.layers.Recurrence(C.layers.LSTM(hidden_dim),
                                    go_backwards=False)(model)
        model = C.sequence.last(model)
        model = C.user_function(
            UtteranceBatchReshape(model, conversation_lengths_input))
        model = C.to_sequence_like(model, label_input)
        model = C.layers.Recurrence(C.layers.LSTM(hidden_dim),
                                    go_backwards=False)(model)
        model = C.layers.Dense(num_labels, name='classify')(model)

    z = model
    ce = C.cross_entropy_with_softmax(z, label_input)

    sentinel_utt_data = C.NDArrayView.from_csr(_to_csr([[0, 0, 1]]),
                                               device=C.cpu())
    c1_utt1_data = C.NDArrayView.from_csr(_to_csr([[0, 1, 1], [0, 1, 0],
                                                   [1, 0, 0]]),
                                          device=C.cpu())
    c1_utt2_data = C.NDArrayView.from_csr(_to_csr([[0, 1, 0], [0, 1, 1]]),
                                          device=C.cpu())
    c1_utt3_data = C.NDArrayView.from_csr(_to_csr([[0, 1, 1], [0, 1, 0]]),
                                          device=C.cpu())
    c2_utt1_data = C.NDArrayView.from_csr(_to_csr([[0, 1, 1]]), device=C.cpu())
    c3_utt1_data = C.NDArrayView.from_csr(_to_csr([[0, 1, 0], [0, 1, 1],
                                                   [1, 0, 0]]),
                                          device=C.cpu())
    c3_utt2_data = C.NDArrayView.from_csr(_to_csr([[0, 1, 0]]), device=C.cpu())

    all_utt_data = C.Value.create(C.sequence.input_variable(
        (input_vocab_size), is_sparse=True), [
            c1_utt1_data, c1_utt2_data, c1_utt3_data, c2_utt1_data,
            sentinel_utt_data, sentinel_utt_data, c3_utt1_data, c3_utt2_data,
            sentinel_utt_data
        ],
                                  device=C.cpu()).data
    conversation_lengths_data = np.asarray([3, 1, 2], dtype=np.float32)
    seq1_label_data = [[0, 1], [0, 1], [1, 0]]
    seq2_label_data = [[1, 0]]
    seq3_label_data = [[1, 0], [0, 1]]
    label_data = [
        _to_csr(seq1_label_data),
        _to_csr(seq2_label_data),
        _to_csr(seq3_label_data)
    ]
    param_grads, loss_result = ce.grad(
        {
            utterances_input: all_utt_data,
            label_input: label_data,
            conversation_lengths_input: conversation_lengths_data
        },
        wrt=ce.parameters,
        outputs=[ce],
        as_numpy=False)

    loss_result = loss_result.as_sequences()

    absolute_tolerance = 0.01
    assert np.allclose(loss_result[0], [[0.678914], [0.668076], [0.728129]],
                       atol=absolute_tolerance)
    assert np.allclose(loss_result[1], [[0.679029]], atol=absolute_tolerance)
    assert np.allclose(loss_result[2], [[0.705393], [0.674243]],
                       atol=absolute_tolerance)
Example #21
def test_RNN(tmpdir, dtype):

    with C.default_options(dtype=dtype):

        def CreatRNN(cell_dim,
                     activation,
                     initial_state,
                     direction,
                     num_layers,
                     init=C.default_override_or(C.glorot_uniform()),
                     init_bias=C.default_override_or(0)):
            if direction == 'bidirectional':
                return C.layers.Sequential([
                    C.layers.For(
                        range(num_layers), lambda i: [
                            (C.layers.Recurrence(C.layers.RNNStep(
                                cell_dim,
                                activation=activation,
                                init=init,
                                init_bias=init_bias),
                                                 initial_state=initial_state,
                                                 return_full_state=False,
                                                 go_backwards=False),
                             C.layers.Recurrence(C.layers.RNNStep(
                                 cell_dim,
                                 activation=activation,
                                 init=init,
                                 init_bias=init_bias),
                                                 initial_state=initial_state,
                                                 return_full_state=False,
                                                 go_backwards=True)), C.splice
                        ])
                ])
            else:
                go_backward = False if direction == 'forward' else True
                return C.layers.Sequential([
                    C.layers.For(
                        range(num_layers), lambda i: [
                            C.layers.Recurrence(C.layers.RNNStep(
                                cell_dim,
                                activation=activation,
                                init=init,
                                init_bias=init_bias),
                                                initial_state=initial_state,
                                                return_full_state=False,
                                                go_backwards=go_backward)
                        ])
                ])

        def MakeRNNNameFromConfig(direction, num_layers, initial_state,
                                  activition):
            model_name = 'RNN.' + direction + '.'

            if num_layers == 1:
                model_name += 'one_layer.'
            else:
                assert (num_layers == 2), "needs 1 or 2 layers!"
                model_name += 'two_layer.'

            if (initial_state != 0):
                model_name += 'initial.'

            model_name += activition.__name__
            return model_name

        direction_options = ['forward', 'reverse', 'bidirectional']
        num_layers_options = [1, 2]
        initial_state_options = [0]
        activation_options = [C.tanh, C.relu, C.sigmoid]

        input_dim = 2
        hidden_dim = 3
        batch_size = 1
        sequence_len = 5

        for config in list(
                product(direction_options, num_layers_options,
                        initial_state_options, activation_options)):
            model_filename = MakeRNNNameFromConfig(*config)
            print(model_filename)
            direction, num_layers, initial_state, activation = config

            x = C.input_variable(input_dim,
                                 dynamic_axes=[
                                     C.Axis.default_batch_axis(),
                                     C.Axis('sequenceAxis')
                                 ])
            RNNModel = CreatRNN(hidden_dim, activation, initial_state,
                                direction, num_layers)(x)
            data = np.random.uniform(low=0.0,
                                     high=1.0,
                                     size=(batch_size, sequence_len,
                                           input_dim)).astype(dtype)
            verify_one_input(RNNModel, data, tmpdir, model_filename)
Example #22
def self_attention_layer(in_dims: int,
                         out_dims: int,
                         name='self_attention',
                         as_block: bool = False,
                         k_ph: bool = False,
                         v_ph: bool = False,
                         mask_opt: bool = False) -> C.Function:
    sq_sa_dims = C.Constant(C.sqrt(out_dims).eval(), name='sq_dims')

    X = C.placeholder(
        in_dims, (C.Axis.default_batch_axis(), C.Axis.default_dynamic_axis()),
        name=name + '_ph')

    if k_ph is False and v_ph is False:
        q = C.layers.Dense(out_dims, name=name + '_q')(
            X
        )  # W_Q = C.parameter((in_dims, out_dims), init=init, name=name+'_q')
        k = C.layers.Dense(out_dims, name=name + '_k')(
            X
        )  # W_K = C.parameter((in_dims, out_dims), init=init, name=name+'_k')
        v = C.layers.Dense(out_dims, name=name + '_v')(
            X
        )  # W_V = C.parameter((in_dims, out_dims), init=init, name=name+'_v')
    elif k_ph is True and v_ph is True:
        q = C.layers.Dense(out_dims, name=name + '_q')(X)
        k = C.placeholder(out_dims,
                          (C.Axis.default_batch_axis(), C.Axis('kv_seq')),
                          name=name + '_k_ph')
        v = C.placeholder(out_dims,
                          (C.Axis.default_batch_axis(), C.Axis('kv_seq')),
                          name=name + '_v_ph')
    else:
        raise Exception(f'k_ph:{k_ph}, v_ph:{v_ph}')

    q_ = C.sequence.unpack(q, 0, True, name=name + '_unpack_q')
    k_ = C.sequence.unpack(k, 0, True, name=name + '_unpack_k')
    v_ = C.sequence.unpack(v, 0, True, name=name + '_unpack_v')

    scores = C.times_transpose(q_, k_, name=name + '_score_matrix')
    scaled = scores / sq_sa_dims  # div_k

    if mask_opt:
        mask = triangular_matrix_seq(2)(X)
        inf_mask = -np.inf * (mask - 0.5)
        inf_mask = C.as_block(inf_mask, [(X, X)], 'mask', 'mask')
        scaled = C.element_min(scaled, inf_mask)

    softmax = C.softmax(scaled, name=name + '_softmax')
    attention = C.times(softmax, v_, name=name + '_attention')

    result = C.to_sequence_like(attention, X)

    if as_block:
        if k_ph is False and v_ph is False:
            return C.as_block(result, [(X, X)], 'self_attention',
                              'self_attention_')
        elif k_ph is True and v_ph is True:
            return C.as_block(result, [(X, X), (k, k), (v, v)],
                              'self_attention', 'self_attention_')
        else:
            raise Exception(f'k_ph:{k_ph} v_ph:{v_ph}')
    else:
        return result
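The masking trick inside the layer above is terse; here is a plain-NumPy illustration of the arithmetic (the convention of the triangular_matrix_seq helper itself is not shown in this listing):

import numpy as np
scores = np.float32([[0.2, 1.3], [0.7, -0.4]])
mask = np.float32([[0, 1], [0, 0]])      # 1 marks a position to block
inf_mask = -np.inf * (mask - 0.5)        # 1 -> -inf, 0 -> +inf
masked = np.minimum(scores, inf_mask)    # blocked entries become -inf, others unchanged
# a subsequent softmax assigns zero probability to the -inf entries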
Example #23
    evaluate_decoder(test_reader, model, i2w)


# ============= configure =====================
input_vocab_dim = 69
label_vocab_dim = 69

hidden_dim = 512
num_layers = 2
attention_dim = 128
use_attention = True
use_embedding = True
embedding_dim = 200
length_increase = 1.5

InputSequence = C.layers.SequenceOver[C.Axis('inputAxis')]
LabelSequence = C.layers.SequenceOver[C.Axis('labelAxis')]

vocab, i2w, _ = get_vocab(dataPath['vocab_file'])
train_reader = create_reader(dataPath['training'], True)
valid_reader = create_reader(dataPath['validation'], True)

sentence_start = C.Constant(
    np.array([w == '<s>' for w in vocab], dtype=np.float32))
sentence_end_idx = vocab.index('</s>')  # first </s>

if __name__ == '__main__':
    model = create_model()
    a = model.find_by_name('encode_h')
    #x = x.root_function
    print(a)
def trainNetwork():

    mapper, gens = loadData(dir + fileName,
                            './data/Shakespeare',
                            batchSize,
                            timeSteps,
                            timeShift,
                            load=False,
                            lineShape=(0, 40000))

    # Input with dynamic sequence axis
    # consisting of a matrix of [steps-in-time X number-of-possible-characters]
    inputSeqAxis = cntk.Axis('inputAxis')
    input = cntk.sequence.input_variable((timeSteps, mapper.numClasses),
                                         sequence_axis=inputSeqAxis,
                                         name='input')

    model = createNetwork(input, layers, mapper.numClasses)

    label = cntk.sequence.input_variable(mapper.numClasses,
                                         sequence_axis=inputSeqAxis,
                                         name='label')

    z = model(input)
    loss = cntk.cross_entropy_with_softmax(z, label)
    error = cntk.classification_error(z, label)

    printer = cntk.logging.ProgressPrinter(tag='Training',
                                           freq=100,
                                           num_epochs=maxEpochs)

    lr_per_sample = cntk.learning_parameter_schedule_per_sample(0.001)
    momentum_schedule = cntk.momentum_schedule_per_sample(0.9990913221888589)
    learner = cntk.momentum_sgd(z.parameters,
                                lr_per_sample,
                                momentum_schedule,
                                gradient_clipping_threshold_per_sample=5.0,
                                gradient_clipping_with_truncation=True)

    #learner = cntk.momentum_sgd(z.parameters, lr, 0.9, minibatch_size=batchSize)
    #learner = cntk.fsadagrad(model.parameters, lr=lr, minibatch_size=batchSize, momentum=0.9, unit_gain=True)
    trainer = cntk.Trainer(z, (loss, error), learner, [printer])

    numMinibatch = mapper.samples // batchSize

    print("Input sequence length: {}; unique characters {};".format(
        timeSteps, mapper.numClasses))
    cntk.logging.log_number_of_parameters(z)
    print("Datset size {}; {} Epochs; {} minibatches per epoch".format(
        mapper.samples, maxEpochs, numMinibatch))

    for epoch in range(maxEpochs):
        mask = [True]
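        # Presumably these True/False flags mark sequence starts: pairing the feed dict
        # with [True] tells the trainer this minibatch begins a new sequence, while the
        # [False] used afterwards lets the recurrent state carry over across minibatches.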
        for mb in range(numMinibatch):
            X, Y = next(gens['train'])
            #X, Y = get_data(mb, batchSize, data, mapper)
            arguments = ({input: X, label: Y}, mask)
            mask = [False]
            trainer.train_minibatch(arguments)

            if mb % 100 == 0:
                print(generateText(z, mapper, 200) + '\n')

        trainer.summarize_training_progress()
        print(generateText(z, mapper, 100))
Example #25
def test_axis():
    a = C.Axis(1)
    assert isinstance(a.is_static_axis, bool)
    assert a.is_static_axis == True
    assert a.static_axis_index() == 1
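For contrast with the static axis above, a small sketch of the same property on the built-in dynamic axes (using only accessors that already appear elsewhere in this listing):

d = C.Axis.default_dynamic_axis()
assert d.is_static_axis == False                           # dynamic axes report False
assert C.Axis.default_batch_axis().is_static_axis == False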
Example #26
    return pos_encoding


#endregion

if __name__ == '__main__':
    VOCAB_DIMS = 100  # size of vocabulary
    TOKEN_DIMS = 4  # size of tokens (# of embedding)
    SA_DIMS = 3  # size of self attention
    HEAD_DIMS = 8  # size of multi-headed self attention
    HIDDEN_DIMS = 24  # feed forward layer hidden

    v = np.array([[1, 0, 0, 0], [1, 1, 1, 1], [0, 1, 0, 0]], np.float32)  # seq
    X = C.sequence.input_variable(TOKEN_DIMS,
                                  name='encoder_input',
                                  sequence_axis=C.Axis('encoder_seq'))

    #region encoder model
    encoder_model = encoder(TOKEN_DIMS,
                            SA_DIMS,
                            HEAD_DIMS,
                            HIDDEN_DIMS,
                            as_block=False)(X)
    print(encoder_model.eval({encoder_model.arguments[0]: v}))
    #endregion

    #region encoder-decoder model
    input_size = 6
    y = np.array(range(TOKEN_DIMS * input_size),
                 np.float32).reshape(input_size, TOKEN_DIMS)
    Y = C.sequence.input_variable(
Example #27
def test_lstm_over_lstm_thought_vectors(device_id):
    dev = cntk_device(device_id)
    input_vocab_size = 3
    emb_dim = 2
    hidden_dim = 2
    num_labels = 2
    x_seq_input = C.sequence.input_variable(
        (C.FreeDimension, input_vocab_size), is_sparse=True, name='features')
    label_seq_input = C.sequence.input_variable(
        num_labels,
        is_sparse=True,
        sequence_axis=C.Axis('label_sequence'),
        name='labels')
    with C.default_options(initial_state=0.1):
        model = C.layers.Embedding(emb_dim, name='embed')(x_seq_input)
        model = C.layers.Recurrence(C.layers.LSTM(hidden_dim),
                                    go_backwards=False)(model)
        model = C.sequence.last(model)
        model = C.to_sequence_like(model, label_seq_input)
        model = C.layers.Recurrence(C.layers.LSTM(hidden_dim),
                                    go_backwards=False)(model)
        model = C.layers.Dense(num_labels, name='classify')(model)

    z = model
    ce = C.cross_entropy_with_softmax(z, label_seq_input)

    seq1_data = [[[0, 1, 1], [0, 1, 0], [1, 0, 0]],
                 [[1, 1, 0], [0, 0, 1], [1, 0, 1]],
                 [[1, 0, 0], [0, 0, 1], [1, 1, 0]]]
    csr_seq1 = _to_csr(seq1_data)
    ndarrayview1 = C.NDArrayView.from_csr(csr_seq1,
                                          shape=(3, 3, 3),
                                          device=C.cpu())
    seq2_data = [[[0, 0, 1], [0, 1, 1], [1, 0, 1]],
                 [[0, 1, 0], [1, 0, 1], [0, 0, 0]]]
    csr_seq2 = _to_csr(seq2_data)
    ndarrayview2 = C.NDArrayView.from_csr(csr_seq2,
                                          shape=(2, 3, 3),
                                          device=C.cpu())
    x_seq_data = C.Value.create(C.sequence.input_variable((3, 3),
                                                          is_sparse=True),
                                [ndarrayview1, ndarrayview2],
                                device=C.cpu()).data

    seq1_label_data = [[0, 1], [0, 1], [1, 0]]
    seq2_label_data = [[1, 0], [0, 1]]
    label_seq_data = [_to_csr(seq1_label_data), _to_csr(seq2_label_data)]
    param_grads, loss_result = ce.grad(
        {
            x_seq_input: x_seq_data,
            label_seq_input: label_seq_data
        },
        wrt=ce.parameters,
        outputs=[ce],
        as_numpy=False)

    loss_result = loss_result.as_sequences()

    absolute_tolerance = 0.02
    assert np.allclose(loss_result[0], [[0.67126], [0.676331], [0.765814]],
                       atol=absolute_tolerance)
    assert np.allclose(loss_result[1], [[0.685199], [0.681736]],
                       atol=absolute_tolerance)
Example #28
def test_lstm_over_lstm_thought_vectors(device_id):
    dev = cntk_device(device_id)
    input_vocab_size = 3
    emb_dim = 2
    hidden_dim = 2
    num_labels = 2
    x_seq_input = C.sequence.input_variable(
        (C.FreeDimension, input_vocab_size), is_sparse=True, name='features')
    label_seq_input = C.sequence.input_variable(
        num_labels,
        is_sparse=True,
        sequence_axis=C.Axis('label_sequence'),
        name='labels')
    with C.default_options(initial_state=0.1):
        model = C.layers.Embedding(emb_dim, name='embed')(x_seq_input)
        model = C.layers.Recurrence(C.layers.LSTM(hidden_dim),
                                    go_backwards=False)(model)
        model = C.sequence.last(model)
        model = C.to_sequence_like(model, label_seq_input)
        model = C.layers.Recurrence(C.layers.LSTM(hidden_dim),
                                    go_backwards=False)(model)
        model = C.layers.Dense(num_labels, name='classify')(model)

    z = model
    ce = C.cross_entropy_with_softmax(z, label_seq_input)

    seq1_data = [[[0, 1, 1], [0, 1, 0], [1, 0, 0]],
                 [[1, 1, 0], [0, 0, 1], [1, 0, 1]],
                 [[1, 0, 0], [0, 0, 1], [1, 1, 0]]]
    csr_seq1 = _to_csr(seq1_data)
    ndarrayview1 = C.NDArrayView.from_csr(csr_seq1,
                                          shape=(3, 3, 3),
                                          device=C.cpu())
    seq2_data = [[[0, 0, 1], [0, 1, 1], [1, 0, 1]],
                 [[0, 1, 0], [1, 0, 1], [0, 0, 0]]]
    csr_seq2 = _to_csr(seq2_data)
    ndarrayview2 = C.NDArrayView.from_csr(csr_seq2,
                                          shape=(2, 3, 3),
                                          device=C.cpu())
    x_seq_data = C.Value.create(C.sequence.input_variable((3, 3),
                                                          is_sparse=True),
                                [ndarrayview1, ndarrayview2],
                                device=C.cpu()).data

    seq1_label_data = [[0, 1], [0, 1], [1, 0]]
    seq2_label_data = [[1, 0], [0, 1]]
    label_seq_data = [_to_csr(seq1_label_data), _to_csr(seq2_label_data)]
    param_grads, loss_result = ce.grad(
        {
            x_seq_input: x_seq_data,
            label_seq_input: label_seq_data
        },
        wrt=ce.parameters,
        outputs=[ce],
        as_numpy=False)

    loss_result = loss_result.as_sequences()

    # TODO: The tolerance here is inordinately high due to the non-determinism in initialization
    # of parameters as the individual tests are not run in separate processes resulting in the
    # addition or removal of tests to affect the random initialization of parameters in all other
    # tests that do not explicitly specify the random seed. The tolerance should be lowered to
    # 0.01 after this issue in the test infrastructure has been fixed.
    absolute_tolerance = 0.02
    assert np.allclose(loss_result[0], [[0.63504], [0.673343], [0.698446]],
                       atol=absolute_tolerance)
    assert np.allclose(loss_result[1], [[0.772344], [0.64295]],
                       atol=absolute_tolerance)