def test_stop_gradient():
    x = C.sequence.input_variable(shape=(2,), sequence_axis=C.Axis("B"), needs_gradient=True)
    y = C.sequence.input_variable(shape=(2,), sequence_axis=C.Axis("B"), needs_gradient=True)
    z = C.element_times(x, y)
    w = z + C.stop_gradient(z)
    a = np.reshape(np.float32([0.25, 0.5, 0.1, 1]), (1, 2, 2))
    b = np.reshape(np.float32([-1.25, 1.5, 0.1, -1]), (1, 2, 2))
    bwd, fwd = w.forward({x: a, y: b}, [w.output], set([w.output]))
    value = list(fwd.values())[0]
    expected = np.multiply(a, b) * 2
    assert np.allclose(value, expected)
    grad = w.backward(bwd, {w.output: np.ones_like(value)}, set([x, y]))
    assert np.allclose(grad[x], b)
    assert np.allclose(grad[y], a)

    # Test stop_gradient with a function as input whose arguments should get
    # no gradients (zeros are read back).
    w = C.stop_gradient(z)
    bwd, fwd = w.forward({x: a, y: b}, [w.output], set([w.output]))
    value = list(fwd.values())[0]
    expected = np.multiply(a, b)
    assert np.allclose(value, expected)
    grad = w.backward(bwd, {w.output: np.ones_like(value)}, set([x, y]))
    # There should be no gradients flowing back to x and y.
    assert np.allclose(grad[x], np.zeros_like(b))
    assert np.allclose(grad[y], np.zeros_like(a))
def test_cos_distance_backward():
    x = C.sequence.input_variable(shape=(2,), sequence_axis=C.Axis("B"), needs_gradient=True)
    y = C.sequence.input_variable(shape=(2,), sequence_axis=C.Axis("B"), needs_gradient=True)
    z = C.cosine_distance(x, y)
    a = np.reshape(np.float32([0.25, 0.5, 0.1, 1]), (1, 2, 2))
    b = np.reshape(np.float32([-0.5, 1.5, -0.3, -1]), (1, 2, 2))
    bwd, fwd = z.forward({x: a, y: b}, [z.output], set([z.output]))
    value = list(fwd.values())[0]
    expected = [[0.707107, -0.981665]]
    assert np.allclose(value, expected)
    grad = z.backward(bwd, {z.output: np.ones_like(value)}, set([x, y]))
    x_driv_expected = np.ndarray((1, 2, 2), dtype=np.float32,
                                 buffer=np.float32([-1.131371, 0.565686, -0.188727, 0.018873]))
    y_driv_expected = np.ndarray((1, 2, 2), dtype=np.float32,
                                 buffer=np.float32([0.424264, 0.141421, -0.174876, 0.052463]))
    assert np.all(np.absolute(grad[x] - x_driv_expected) < 1e-6)
    assert np.all(np.absolute(grad[y] - y_driv_expected) < 1e-6)
def test_cosine_distance_with_negative_samples_with_reduced_sequence():
    a = C.sequence.input_variable((3,), sequence_axis=C.Axis("a"))
    b = C.sequence.input_variable((3,), sequence_axis=C.Axis("b"))
    cd = C.cosine_distance_with_negative_samples(C.sequence.first(a), C.sequence.first(b), 1, 2)
    data = np.random.random((4, 3)).astype(np.float32)
    cd.eval({a: data, b: data})
def multi_headed_self_attention_layer(in_dims: int,
                                      hidden_dims: int,
                                      num_of_head: int,
                                      name='multi_headed_self_attention',
                                      as_block: bool = False,
                                      k_ph: bool = False,
                                      v_ph: bool = False,
                                      mask_opt: bool = False) -> C.Function:
    X = C.placeholder(in_dims,
                      (C.Axis.default_batch_axis(), C.Axis.default_dynamic_axis()),
                      name=name + '_ph')
    outputs = []

    if k_ph is False and v_ph is False:
        for i in range(num_of_head):
            layer = self_attention_layer(in_dims, hidden_dims, name=name + str(i),
                                         as_block=not as_block, mask_opt=mask_opt)
            outputs.append(layer(X))
    elif k_ph is True and v_ph is True:
        k_ = C.placeholder(in_dims,
                           (C.Axis.default_batch_axis(), C.Axis('kv_seq')),
                           name=name + '_k_ph')  # -3: sequence axis
        v_ = C.placeholder(in_dims,
                           (C.Axis.default_batch_axis(), C.Axis('kv_seq')),
                           name=name + '_v_ph')
        for i in range(num_of_head):
            layer = self_attention_layer(in_dims, in_dims, name=name + str(i),
                                         as_block=not as_block, k_ph=k_ph, v_ph=v_ph)
            outputs.append(layer(X, k_, v_))
    else:
        raise Exception(f'k_ph:{k_ph}, v_ph:{v_ph}')

    concat = C.splice(*outputs, name='concat')
    result = C.layers.Dense(in_dims, name='W_o')(concat)

    # init = C.initializer.normal(1)
    # W_O = C.parameter((in_dims, hidden_dims*num_of_head), init=init, name=name+'_Wo')
    # result = C.times_transpose(concat, W_O, name='result')

    if as_block is True:
        if k_ph is False and v_ph is False:
            result = C.as_block(result, [(X, X)],
                                'multi_headed_self_attention', 'multi_headed_self_attention_')
        elif k_ph is True and v_ph is True:
            result = C.as_block(result, [(X, X), (k_, k_), (v_, v_)],
                                'multi_headed_self_attention', 'multi_headed_self_attention_')
        else:
            raise Exception(f'k_ph:{k_ph} v_ph:{v_ph}')

    return result
def test_axis_str():
    i = C.sequence.input_variable((1, 3))
    assert str(C.Axis.all_axes()) == "Axis('AllAxes')"
    assert str(C.Axis.all_static_axes()) == "Axis('AllStaticAxes')"
    assert str(C.Axis.unknown_dynamic_axes()) == "(Axis('UnknownAxes'),)"
    assert str(C.Axis(1)) == "Axis('staticAxisIdx=1')"
    assert str(C.Axis(-1)) == "Axis('staticAxisIdx=-1')"
    assert str(i.dynamic_axes) == "(Axis('defaultBatchAxis'), Axis('defaultDynamicAxis'))"
def test_clone_with_different_dynamic_axes():
    q_axis = C.Axis('q')
    a_axis = C.Axis('a')
    question_input = C.sequence.input(shape=10, is_sparse=True, sequence_axis=q_axis)
    answer_input = C.sequence.input(shape=10, is_sparse=True, sequence_axis=a_axis)

    rnn = C.layers.Recurrence(C.layers.LSTM(5))(question_input)
    rnn_cloned = rnn.clone(C.CloneMethod.share, {question_input: answer_input})
def test_rank0_output():
    x = C.sequence.input_variable(shape=(768,), sequence_axis=C.Axis("B"), needs_gradient=True)
    y = C.sequence.input_variable(shape=(768,), sequence_axis=C.Axis("B"), needs_gradient=True)
    z = C.cosine_distance(x, y)

    batch_num = 2
    batch_size = 30
    a = np.float32(np.random.rand(batch_num * batch_size, 1500, 768))
    b = np.float32(np.random.rand(batch_num * batch_size, 1500, 768))
    for i in range(batch_num):
        bwd, fwd = z.forward({x: a[i * batch_size:(i + 1) * batch_size],
                              y: b[i * batch_size:(i + 1) * batch_size]},
                             [z.output], set([z.output]))
        grad = z.backward(bwd, {z.output: np.ones_like(fwd[z.output])}, set([x, y]))
def test_GRU(tmpdir):
    def MakeGRUNameFromConfig(backward, initial_state, activation):
        model_name = 'GRU.' + activation.__name__
        if (initial_state != 0):
            model_name += '.initial'
        if (backward):
            model_name += '.backward'
        else:
            model_name += '.forward'
        return model_name

    direction_options = [False, True]
    activation_options = [C.tanh]
    initial_state_options = [0]

    input_dim = 2
    cell_dim = 3
    batch_size = 1
    sequence_len = 5

    for config in list(product(direction_options, initial_state_options, activation_options)):
        model_filename = MakeGRUNameFromConfig(*config)
        print(model_filename)
        backward, initial_state, activation = config

        x = C.input_variable(input_dim,
                             dynamic_axes=[C.Axis.default_batch_axis(), C.Axis('sequenceAxis')])
        GRUModel = C.layers.Recurrence(C.layers.GRU(cell_dim, activation=activation),
                                       initial_state=initial_state,
                                       go_backwards=backward)(x)
        # CLG.plot(GRUModel, filename=cntk_pdf_filename)
        # plot_block_internals(GRUModel, 'GRU', model_filename)
        data = np.random.uniform(low=0.0, high=1.0,
                                 size=(batch_size, sequence_len, input_dim)).astype('f')
        verify_one_input(GRUModel, data, tmpdir, model_filename)
def test_LSTM(tmpdir):
    pytest.skip('Need to support new ONNX spec.')

    def CreateLSTMModel(activation, peepholes, self_stabilization, cell_dim, initial_state):
        return C.layers.Sequential([
            C.layers.Recurrence(C.layers.LSTM(cell_dim,
                                              use_peepholes=peepholes,
                                              activation=activation,
                                              enable_self_stabilization=self_stabilization),
                                initial_state=initial_state)
        ])

    def MakeLSTMNameFromConfig(use_peepholes, enable_self_stabilization, initial_state, activation):
        model_name = 'LSTM.' + activation.__name__
        if (use_peepholes):
            model_name += '.peephole'
        if (enable_self_stabilization):
            model_name += '.stabilize'
        if (initial_state != 0):
            model_name += '.initial'
        return model_name

    # LSTM attributes
    use_peepholes_options = [False]
    enable_self_stabilization_options = [False]
    activation_options = [C.tanh]

    # Recurrence attributes
    initial_state_options = [0, 0.23]

    input_dim = 2
    cell_dim = 3
    batch_size = 1
    sequence_len = 5

    for config in list(product(use_peepholes_options, enable_self_stabilization_options,
                               initial_state_options, activation_options)):
        model_filename = MakeLSTMNameFromConfig(*config)
        use_peepholes, enable_self_stabilization, initial_state, activation = config

        x = C.input_variable(input_dim,
                             dynamic_axes=[C.Axis.default_batch_axis(), C.Axis('sequenceAxis')])
        LSTMmodel = CreateLSTMModel(peepholes=use_peepholes,
                                    activation=activation,
                                    initial_state=initial_state,
                                    cell_dim=cell_dim,
                                    self_stabilization=enable_self_stabilization)(x)
        data = np.random.uniform(low=0.0, high=1.0,
                                 size=(batch_size, sequence_len, input_dim)).astype('f')
        verify_one_input(LSTMmodel, data, tmpdir, model_filename)
def test_sequence_max():
    np.random.seed(0)
    a = np.float32(np.random.rand(20, 100, 8))
    src = C.sequence.input_variable(shape=(8), sequence_axis=C.Axis("Seq"))
    out = C.sequence.reduce_max(src)
    val = out.eval({src: a})
    expected = np.max(a, 1)
    assert np.allclose(val, expected)
def test_stop_gradient():
    x = C.sequence.input_variable(shape=(2,), sequence_axis=C.Axis("B"), needs_gradient=True)
    y = C.sequence.input_variable(shape=(2,), sequence_axis=C.Axis("B"), needs_gradient=True)
    z = C.element_times(x, y)
    w = z + C.stop_gradient(z)
    a = np.reshape(np.float32([0.25, 0.5, 0.1, 1]), (1, 2, 2))
    b = np.reshape(np.float32([-1.25, 1.5, 0.1, -1]), (1, 2, 2))
    bwd, fwd = w.forward({x: a, y: b}, [w.output], set([w.output]))
    value = list(fwd.values())[0]
    expected = np.multiply(a, b) * 2
    assert np.allclose(value, expected)
    grad = w.backward(bwd, {w.output: np.ones_like(value)}, set([x, y]))
    assert np.allclose(grad[x], b)
    assert np.allclose(grad[y], a)
def test_sequence_softmax_with_large_numbers():
    np.random.seed(0)
    a = [500000 * np.ones(i, dtype=np.float32) for i in (7, 7, 7)]
    src = C.sequence.input_variable(shape=(1), sequence_axis=C.Axis("Seq"))
    out = C.sequence.softmax(src)
    val = out.eval({src: a})
    expected = [np_softmax(a_i, 0) for a_i in a]
    for val_i, expected_i in zip(val, expected):
        assert np.allclose(val_i, expected_i)
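# The test above compares against an np_softmax reference helper that is not shown in
# this snippet. A minimal sketch of such a helper (an assumption, not necessarily the
# test suite's actual implementation): subtract the per-axis maximum before
# exponentiating, which is what keeps inputs like 500000 from overflowing.
import numpy as np

def np_softmax(x, axis):
    x = np.asarray(x, dtype=np.float32)
    # Shift by the maximum along `axis` for numerical stability;
    # the shift cancels out after normalization.
    shifted = x - np.max(x, axis=axis, keepdims=True)
    e = np.exp(shifted)
    return e / np.sum(e, axis=axis, keepdims=True)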
def test_cos_distance_backward2():
    x = C.sequence.input_variable(shape=(100,), sequence_axis=C.Axis("B"), needs_gradient=True)
    y = C.sequence.input_variable(shape=(100,), sequence_axis=C.Axis("B"), needs_gradient=True)
    z = C.cosine_distance(x, y)

    np.random.seed(0)
    a = np.float32(np.random.rand(10, 50, 100))
    b = np.float32(np.random.rand(10, 50, 100))
    bwd, fwd = z.forward({x: a, y: b}, [z.output], set([z.output]))
    value = list(fwd.values())[0]
    expected_cos = numpy_cos(a, b)
    expected = expected_cos.forward()
    assert np.allclose(value, expected)
    grad = z.backward(bwd, {z.output: np.ones_like(value)}, set([x, y]))
    bwd = expected_cos.backward()
    x_driv_expected = bwd['a']
    y_driv_expected = bwd['b']
    assert np.all(np.absolute(grad[x] - x_driv_expected) < 1e-6)
    assert np.all(np.absolute(grad[y] - y_driv_expected) < 1e-6)
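# numpy_cos above is a NumPy reference for C.cosine_distance that is not defined in this
# snippet. A minimal sketch under that assumption: cosine similarity is taken over the
# last axis, and the gradients follow from
# d/da [a.b / (|a||b|)] = b/(|a||b|) - cos * a/|a|^2 (and symmetrically for b).
import numpy as np

class numpy_cos:
    def __init__(self, a, b):
        self.a, self.b = a, b

    def forward(self):
        dot = np.sum(self.a * self.b, axis=-1)
        self.na = np.sqrt(np.sum(self.a * self.a, axis=-1))
        self.nb = np.sqrt(np.sum(self.b * self.b, axis=-1))
        self.cos = dot / (self.na * self.nb)
        return self.cos

    def backward(self):
        # Expand the per-vector scalars so they broadcast over the last axis.
        cos = self.cos[..., None]
        na, nb = self.na[..., None], self.nb[..., None]
        grad_a = self.b / (na * nb) - cos * self.a / (na * na)
        grad_b = self.a / (na * nb) - cos * self.b / (nb * nb)
        return {'a': grad_a, 'b': grad_b}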
def test_sequence_max_with_variable_lengths():
    np.random.seed(0)
    a = [-np.ones(i, dtype=np.float32) for i in (7, 11, 13)]
    src = C.sequence.input_variable(shape=(1), sequence_axis=C.Axis("Seq"))
    out = C.sequence.reduce_max(src)
    val = out.eval({src: a})
    expected = [np.max(a_i) for a_i in a]
    for val_i, expected_i in zip(val, expected):
        assert np.allclose(val_i, expected_i)
def decoder(in_dims: int,
            sa_dims: int,
            head_dims: int,
            hidden_dims: int,
            kv_memory,
            name: str = 'decoder',
            as_block: bool = False) -> C.Function:
    X = C.placeholder(in_dims,
                      (C.Axis.default_batch_axis(), C.Axis.default_dynamic_axis()),
                      name=name + '_ph')
    k_memory = C.placeholder(in_dims,
                             (C.Axis.default_batch_axis(), C.Axis('kv_seq')),
                             name=name + '_k_memory')
    v_memory = C.placeholder(in_dims,
                             (C.Axis.default_batch_axis(), C.Axis('kv_seq')),
                             name=name + '_v_memory')
    # Placeholders cannot be cloned, so k and v must not be bundled into a single kv placeholder.

    mhsa_layer = multi_headed_self_attention_layer(in_dims, sa_dims, head_dims, mask_opt=True)
    eda_layer = multi_headed_self_attention_layer(in_dims, sa_dims, head_dims, k_ph=True, v_ph=True)
    ff_layer = feed_forward_layer(in_dims, hidden_dims)

    sa = layer_normalization(X + mhsa_layer(X))  # w/o mask
    eda = layer_normalization(sa + eda_layer(sa, k_memory, v_memory))
    ff = layer_normalization(eda + ff_layer(eda))

    result = ff
    if as_block is True:
        return C.as_block(result,
                          [(X, X), (k_memory, k_memory), (v_memory, v_memory)],
                          name)
    else:
        return result
def test_cosine_distance():
    a = np.reshape(np.arange(25.0, dtype=np.float32), (5, 5))
    b = np.reshape(np.arange(0, 5, dtype=np.float32), (1, 5))

    src = C.sequence.input_variable(shape=(5), sequence_axis=C.Axis("Seq"))
    tgt = C.input_variable(shape=(5))
    tgt_br = C.sequence.broadcast_as(tgt, src)
    cos_seq = C.cosine_distance(src, tgt_br)
    assert len(cos_seq.dynamic_axes) == 2
    assert cos_seq.dynamic_axes[1].name == "Seq"
    val = cos_seq.eval({src: [a], tgt: [b]})
    expected = [[1., 0.914659, 0.878459, 0.86155, 0.851852]]
    assert np.allclose(val, expected)
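# The hard-coded expected values above can be reproduced directly with NumPy (a quick
# illustrative check, not part of the original test): each sequence step of `src` is a
# row of `a`, compared against the single broadcast row `b`.
import numpy as np

a = np.arange(25.0, dtype=np.float32).reshape(5, 5)
b = np.arange(0, 5, dtype=np.float32)
cos = (a @ b) / (np.linalg.norm(a, axis=1) * np.linalg.norm(b))
print(cos)  # -> approximately [1.  0.914659  0.878459  0.86155  0.851852]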
def __init__(self, config_file):
    data_config = importlib.import_module(config_file).data_config
    model_config = importlib.import_module(config_file).model_config

    self.word_count_threshold = data_config['word_count_threshold']
    self.char_count_threshold = data_config['char_count_threshold']
    self.word_size = data_config['word_size']
    self.abs_path = os.path.dirname(os.path.abspath(__file__))
    pickle_file = os.path.join(self.abs_path, data_config['pickle_file'])

    with open(pickle_file, 'rb') as vf:
        known, self.vocab, self.chars = pickle.load(vf)

    self.wg_dim = known
    self.wn_dim = len(self.vocab) - known
    self.c_dim = len(self.chars)
    self.a_dim = 1

    self.hidden_dim = model_config['hidden_dim']
    self.w2v_hidden_dim = model_config['w2v_hidden_dim']
    self.convs = model_config['char_convs']
    self.dropout = model_config['dropout']
    self.char_emb_dim = model_config['char_emb_dim']
    self.highway_layers = model_config['highway_layers']
    self.two_step = model_config['two_step']
    self.use_cudnn = model_config['use_cudnn']
    self.use_sparse = True

    # Source and target inputs to the model
    inputAxis = C.Axis('inputAxis')
    outputAxis = C.Axis('outputAxis')
    InputSequence = C.layers.SequenceOver[inputAxis]
    OutputSequence = C.layers.SequenceOver[outputAxis]

    print('dropout', self.dropout)
    print('use_cudnn', self.use_cudnn)
    print('use_sparse', self.use_sparse)
def test_LSTM(tmpdir):
    for config in list(product(use_peepholes_options, enable_self_stabilization_options,
                               initial_state_options, activation_options)):
        model_filename = MakeLSTMNameFromConfig(*config)
        use_peepholes, enable_self_stabilization, initial_state, activation = config

        x = C.input_variable(input_dim,
                             dynamic_axes=[C.Axis.default_batch_axis(), C.Axis('sequenceAxis')])
        LSTMmodel = CreateLSTMModel(peepholes=use_peepholes,
                                    activation=activation,
                                    initial_state=initial_state,
                                    cell_dim=cell_dim,
                                    self_stabilization=enable_self_stabilization)(x)
        data = np.random.uniform(low=0.0, high=1.0,
                                 size=(batch_size, sequence_len, input_dim)).astype('f')
        verify_one_input(LSTMmodel, data, tmpdir, model_filename)
def test_sequence_unpack_with_broadcast_as(device_id, precision):
    x = C.sequence.input_variable(5)
    a = C.sequence.input_variable(4, sequence_axis=C.Axis('a'))
    y, mask = C.sequence.unpack(x, 0).outputs
    bvm = C.sequence.broadcast_as(0 * C.reduce_sum(y) + mask, a)

    x1 = [np.arange(7 * 5).reshape(7, 5).astype('f'),
          np.arange(3 * 5).reshape(3, 5).astype('f')]
    a1 = [np.arange(3 * 4).reshape(3, 4).astype('f'),
          np.arange(6 * 4).reshape(6, 4).astype('f')]

    expected = [np.ones((3, 7), dtype=np.float32),
                np.ones((6, 7), dtype=np.float32)]
    expected[1][:, 3:] = 0

    actual = bvm.eval({x: x1, a: a1})
    for actual_i, expected_i in zip(actual, expected):
        assert np.allclose(actual_i, expected_i)
def test_lstm_over_lstm_thought_vectors_2(device_id):
    dev = cntk_device(device_id)
    input_vocab_size = 3
    emb_dim = 2
    hidden_dim = 2
    num_labels = 2
    utterances_input = C.sequence.input_variable((input_vocab_size), is_sparse=True,
                                                 name='utterances')
    conversation_lengths_input = C.input_variable((), name='conversation_sequence_lengths')
    label_input = C.sequence.input_variable(num_labels, is_sparse=True,
                                            sequence_axis=C.Axis('label_sequence'),
                                            name='labels')
    with C.default_options(initial_state=0.1):
        model = C.layers.Embedding(emb_dim, name='embed')(utterances_input)
        model = C.layers.Recurrence(C.layers.LSTM(hidden_dim), go_backwards=False)(model)
        model = C.sequence.last(model)
        model = C.user_function(UtteranceBatchReshape(model, conversation_lengths_input))
        model = C.to_sequence_like(model, label_input)
        model = C.layers.Recurrence(C.layers.LSTM(hidden_dim), go_backwards=False)(model)
        model = C.layers.Dense(num_labels, name='classify')(model)

    z = model
    ce = C.cross_entropy_with_softmax(z, label_input)

    sentinel_utt_data = C.NDArrayView.from_csr(_to_csr([[0, 0, 1]]), device=C.cpu())
    c1_utt1_data = C.NDArrayView.from_csr(_to_csr([[0, 1, 1], [0, 1, 0], [1, 0, 0]]), device=C.cpu())
    c1_utt2_data = C.NDArrayView.from_csr(_to_csr([[0, 1, 0], [0, 1, 1]]), device=C.cpu())
    c1_utt3_data = C.NDArrayView.from_csr(_to_csr([[0, 1, 1], [0, 1, 0]]), device=C.cpu())
    c2_utt1_data = C.NDArrayView.from_csr(_to_csr([[0, 1, 1]]), device=C.cpu())
    c3_utt1_data = C.NDArrayView.from_csr(_to_csr([[0, 1, 0], [0, 1, 1], [1, 0, 0]]), device=C.cpu())
    c3_utt2_data = C.NDArrayView.from_csr(_to_csr([[0, 1, 0]]), device=C.cpu())

    all_utt_data = C.Value.create(
        C.sequence.input_variable((input_vocab_size), is_sparse=True),
        [c1_utt1_data, c1_utt2_data, c1_utt3_data, c2_utt1_data, sentinel_utt_data,
         sentinel_utt_data, c3_utt1_data, c3_utt2_data, sentinel_utt_data],
        device=C.cpu()).data
    conversation_lengths_data = np.asarray([3, 1, 2], dtype=np.float32)
    seq1_label_data = [[0, 1], [0, 1], [1, 0]]
    seq2_label_data = [[1, 0]]
    seq3_label_data = [[1, 0], [0, 1]]
    label_data = [_to_csr(seq1_label_data), _to_csr(seq2_label_data), _to_csr(seq3_label_data)]

    param_grads, loss_result = ce.grad({utterances_input: all_utt_data,
                                        label_input: label_data,
                                        conversation_lengths_input: conversation_lengths_data},
                                       wrt=ce.parameters, outputs=[ce], as_numpy=False)

    loss_result = loss_result.as_sequences()

    absolute_tolerance = 0.01
    assert np.allclose(loss_result[0], [[0.678914], [0.668076], [0.728129]], atol=absolute_tolerance)
    assert np.allclose(loss_result[1], [[0.679029]], atol=absolute_tolerance)
    assert np.allclose(loss_result[2], [[0.705393], [0.674243]], atol=absolute_tolerance)
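# The _to_csr helper used above (and in the later thought-vector tests) is defined
# elsewhere in the test module. A minimal sketch of what it is assumed to do: flatten
# everything but the last axis and wrap the result in a SciPy CSR matrix, which
# C.NDArrayView.from_csr and sparse sequence inputs can consume.
import numpy as np
from scipy import sparse

def _to_csr(data):
    dense = np.asarray(data, dtype=np.float32)
    # One CSR row per vector along the last axis.
    return sparse.csr_matrix(dense.reshape(-1, dense.shape[-1]))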
def test_RNN(tmpdir, dtype):
    with C.default_options(dtype=dtype):
        def CreateRNN(cell_dim,
                      activation,
                      initial_state,
                      direction,
                      num_layers,
                      init=C.default_override_or(C.glorot_uniform()),
                      init_bias=C.default_override_or(0)):
            if direction == 'bidirectional':
                return C.layers.Sequential([
                    C.layers.For(range(num_layers), lambda i: [
                        (C.layers.Recurrence(C.layers.RNNStep(cell_dim,
                                                              activation=activation,
                                                              init=init,
                                                              init_bias=init_bias),
                                             initial_state=initial_state,
                                             return_full_state=False,
                                             go_backwards=False),
                         C.layers.Recurrence(C.layers.RNNStep(cell_dim,
                                                              activation=activation,
                                                              init=init,
                                                              init_bias=init_bias),
                                             initial_state=initial_state,
                                             return_full_state=False,
                                             go_backwards=True)),
                        C.splice
                    ])
                ])
            else:
                go_backward = False if direction == 'forward' else True
                return C.layers.Sequential([
                    C.layers.For(range(num_layers), lambda i: [
                        C.layers.Recurrence(C.layers.RNNStep(cell_dim,
                                                             activation=activation,
                                                             init=init,
                                                             init_bias=init_bias),
                                            initial_state=initial_state,
                                            return_full_state=False,
                                            go_backwards=go_backward)
                    ])
                ])

        def MakeRNNNameFromConfig(direction, num_layers, initial_state, activation):
            model_name = 'RNN.' + direction + '.'
            if num_layers == 1:
                model_name += 'one_layer.'
            else:
                assert (num_layers == 2), "needs 1 or 2 layers!"
                model_name += 'two_layer.'
            if (initial_state != 0):
                model_name += 'initial.'
            model_name += activation.__name__
            return model_name

        direction_options = ['forward', 'reverse', 'bidirectional']
        num_layers_options = [1, 2]
        initial_state_options = [0]
        activation_options = [C.tanh, C.relu, C.sigmoid]

        input_dim = 2
        hidden_dim = 3
        batch_size = 1
        sequence_len = 5

        for config in list(product(direction_options, num_layers_options,
                                   initial_state_options, activation_options)):
            model_filename = MakeRNNNameFromConfig(*config)
            print(model_filename)
            direction, num_layers, initial_state, activation = config

            x = C.input_variable(input_dim,
                                 dynamic_axes=[C.Axis.default_batch_axis(),
                                               C.Axis('sequenceAxis')])
            RNNModel = CreateRNN(hidden_dim, activation, initial_state, direction, num_layers)(x)
            data = np.random.uniform(low=0.0, high=1.0,
                                     size=(batch_size, sequence_len, input_dim)).astype(dtype)
            verify_one_input(RNNModel, data, tmpdir, model_filename)
def self_attention_layer(in_dims: int,
                         out_dims: int,
                         name='self_attention',
                         as_block: bool = False,
                         k_ph: bool = False,
                         v_ph: bool = False,
                         mask_opt: bool = False) -> C.Function:
    sq_sa_dims = C.Constant(C.sqrt(out_dims).eval(), name='sq_dims')

    X = C.placeholder(in_dims,
                      (C.Axis.default_batch_axis(), C.Axis.default_dynamic_axis()),
                      name=name + '_ph')

    if k_ph is False and v_ph is False:
        q = C.layers.Dense(out_dims, name=name + '_q')(X)  # W_Q = C.parameter((in_dims, out_dims), init=init, name=name+'_q')
        k = C.layers.Dense(out_dims, name=name + '_k')(X)  # W_K = C.parameter((in_dims, out_dims), init=init, name=name+'_k')
        v = C.layers.Dense(out_dims, name=name + '_v')(X)  # W_V = C.parameter((in_dims, out_dims), init=init, name=name+'_v')
    elif k_ph is True and v_ph is True:
        q = C.layers.Dense(out_dims, name=name + '_q')(X)
        k = C.placeholder(out_dims,
                          (C.Axis.default_batch_axis(), C.Axis('kv_seq')),
                          name=name + '_k_ph')
        v = C.placeholder(out_dims,
                          (C.Axis.default_batch_axis(), C.Axis('kv_seq')),
                          name=name + '_v_ph')
    else:
        raise Exception(f'k_ph:{k_ph}, v_ph:{v_ph}')

    q_ = C.sequence.unpack(q, 0, True, name=name + '_unpack_q')
    k_ = C.sequence.unpack(k, 0, True, name=name + '_unpack_k')
    v_ = C.sequence.unpack(v, 0, True, name=name + '_unpack_v')

    scores = C.times_transpose(q_, k_, name=name + '_score_matrix')
    scaled = scores / sq_sa_dims  # div_k

    if mask_opt:
        mask = triangular_matrix_seq(2)(X)
        inf_mask = -np.inf * (mask - 0.5)
        inf_mask = C.as_block(inf_mask, [(X, X)], 'mask', 'mask')
        scaled = C.element_min(scaled, inf_mask)

    softmax = C.softmax(scaled, name=name + '_softmax')
    attention = C.times(softmax, v_, name=name + '_attention')

    result = C.to_sequence_like(attention, X)

    if as_block:
        if k_ph is False and v_ph is False:
            return C.as_block(result, [(X, X)], 'self_attention', 'self_attention_')
        elif k_ph is True and v_ph is True:
            return C.as_block(result, [(X, X), (k, k), (v, v)],
                              'self_attention', 'self_attention_')
        else:
            raise Exception(f'k_ph:{k_ph} v_ph:{v_ph}')
    else:
        return result
    evaluate_decoder(test_reader, model, i2w)


# ============= configure =====================
input_vocab_dim = 69
label_vocab_dim = 69
hidden_dim = 512
num_layers = 2
attention_dim = 128
use_attention = True
use_embedding = True
embedding_dim = 200
length_increase = 1.5

InputSequence = C.layers.SequenceOver[C.Axis('inputAxis')]
LabelSequence = C.layers.SequenceOver[C.Axis('labelAxis')]

vocab, i2w, _ = get_vocab(dataPath['vocab_file'])
train_reader = create_reader(dataPath['training'], True)
valid_reader = create_reader(dataPath['validation'], True)

sentence_start = C.Constant(np.array([w == '<s>' for w in vocab], dtype=np.float))
sentence_end_idx = vocab.index('</s>')  # first </s>

if __name__ == '__main__':
    model = create_model()
    a = model.find_by_name('encode_h')
    # x = x.root_function
    print(a)
def trainNetwork():
    mapper, gens = loadData(dir + fileName, './data/Shakespeare', batchSize,
                            timeSteps, timeShift, load=False, lineShape=(0, 40000))

    # Input with dynamic sequence axis,
    # consisting of a matrix of [steps-in-time X number-of-possible-characters]
    inputSeqAxis = cntk.Axis('inputAxis')
    input = cntk.sequence.input_variable((timeSteps, mapper.numClasses),
                                         sequence_axis=inputSeqAxis, name='input')

    model = createNetwork(input, layers, mapper.numClasses)

    label = cntk.sequence.input_variable(mapper.numClasses,
                                         sequence_axis=inputSeqAxis, name='label')

    z = model(input)
    loss = cntk.cross_entropy_with_softmax(z, label)
    error = cntk.classification_error(z, label)

    printer = cntk.logging.ProgressPrinter(tag='Training', freq=100, num_epochs=maxEpochs)

    lr_per_sample = cntk.learning_parameter_schedule_per_sample(0.001)
    momentum_schedule = cntk.momentum_schedule_per_sample(0.9990913221888589)
    learner = cntk.momentum_sgd(z.parameters, lr_per_sample, momentum_schedule,
                                gradient_clipping_threshold_per_sample=5.0,
                                gradient_clipping_with_truncation=True)
    # learner = cntk.momentum_sgd(z.parameters, lr, 0.9, minibatch_size=batchSize)
    # learner = cntk.fsadagrad(model.parameters, lr=lr, minibatch_size=batchSize, momentum=0.9, unit_gain=True)
    trainer = cntk.Trainer(z, (loss, error), learner, [printer])

    numMinibatch = mapper.samples // batchSize

    print("Input sequence length: {}; unique characters {};".format(timeSteps, mapper.numClasses))
    cntk.logging.log_number_of_parameters(z)
    print("Dataset size {}; {} Epochs; {} minibatches per epoch".format(
        mapper.samples, maxEpochs, numMinibatch))

    for epoch in range(maxEpochs):
        mask = [True]
        for mb in range(numMinibatch):
            X, Y = next(gens['train'])
            # X, Y = get_data(mb, batchSize, data, mapper)
            arguments = ({input: X, label: Y}, mask)
            mask = [False]
            trainer.train_minibatch(arguments)

            if mb % 100 == 0:
                print(generateText(z, mapper, 200) + '\n')

        trainer.summarize_training_progress()
        print(generateText(z, mapper, 100))
def test_axis():
    a = C.Axis(1)
    assert isinstance(a.is_static_axis, bool)
    assert a.is_static_axis == True
    assert a.static_axis_index() == 1
    return pos_encoding
# endregion


if __name__ == '__main__':
    VOCAB_DIMS = 100   # size of vocabulary
    TOKEN_DIMS = 4     # size of tokens (# of embedding)
    SA_DIMS = 3        # size of self attention
    HEAD_DIMS = 8      # size of multi-headed self attention
    HIDDEN_DIMS = 24   # feed forward layer hidden

    v = np.array([[1, 0, 0, 0], [1, 1, 1, 1], [0, 1, 0, 0]], np.float32)  # seq
    X = C.sequence.input_variable(TOKEN_DIMS, name='encoder_input',
                                  sequence_axis=C.Axis('encoder_seq'))

    # region encoder model
    encoder_model = encoder(TOKEN_DIMS, SA_DIMS, HEAD_DIMS, HIDDEN_DIMS, as_block=False)(X)
    print(encoder_model.eval({encoder_model.arguments[0]: v}))
    # endregion

    # region encoder-decoder model
    input_size = 6
    y = np.array(range(TOKEN_DIMS * input_size), np.float32).reshape(input_size, TOKEN_DIMS)
    Y = C.sequence.input_variable(
def test_lstm_over_lstm_thought_vectors(device_id):
    dev = cntk_device(device_id)
    input_vocab_size = 3
    emb_dim = 2
    hidden_dim = 2
    num_labels = 2
    x_seq_input = C.sequence.input_variable((C.FreeDimension, input_vocab_size),
                                            is_sparse=True, name='features')
    label_seq_input = C.sequence.input_variable(num_labels, is_sparse=True,
                                                sequence_axis=C.Axis('label_sequence'),
                                                name='labels')
    with C.default_options(initial_state=0.1):
        model = C.layers.Embedding(emb_dim, name='embed')(x_seq_input)
        model = C.layers.Recurrence(C.layers.LSTM(hidden_dim), go_backwards=False)(model)
        model = C.sequence.last(model)
        model = C.to_sequence_like(model, label_seq_input)
        model = C.layers.Recurrence(C.layers.LSTM(hidden_dim), go_backwards=False)(model)
        model = C.layers.Dense(num_labels, name='classify')(model)

    z = model
    ce = C.cross_entropy_with_softmax(z, label_seq_input)

    seq1_data = [[[0, 1, 1], [0, 1, 0], [1, 0, 0]],
                 [[1, 1, 0], [0, 0, 1], [1, 0, 1]],
                 [[1, 0, 0], [0, 0, 1], [1, 1, 0]]]
    csr_seq1 = _to_csr(seq1_data)
    ndarrayview1 = C.NDArrayView.from_csr(csr_seq1, shape=(3, 3, 3), device=C.cpu())
    seq2_data = [[[0, 0, 1], [0, 1, 1], [1, 0, 1]],
                 [[0, 1, 0], [1, 0, 1], [0, 0, 0]]]
    csr_seq2 = _to_csr(seq2_data)
    ndarrayview2 = C.NDArrayView.from_csr(csr_seq2, shape=(2, 3, 3), device=C.cpu())
    x_seq_data = C.Value.create(C.sequence.input_variable((3, 3), is_sparse=True),
                                [ndarrayview1, ndarrayview2], device=C.cpu()).data

    seq1_label_data = [[0, 1], [0, 1], [1, 0]]
    seq2_label_data = [[1, 0], [0, 1]]
    label_seq_data = [_to_csr(seq1_label_data), _to_csr(seq2_label_data)]

    param_grads, loss_result = ce.grad({x_seq_input: x_seq_data,
                                        label_seq_input: label_seq_data},
                                       wrt=ce.parameters, outputs=[ce], as_numpy=False)

    loss_result = loss_result.as_sequences()

    absolute_tolerance = 0.02
    assert np.allclose(loss_result[0], [[0.67126], [0.676331], [0.765814]], atol=absolute_tolerance)
    assert np.allclose(loss_result[1], [[0.685199], [0.681736]], atol=absolute_tolerance)
def test_lstm_over_lstm_thought_vectors(device_id):
    dev = cntk_device(device_id)
    input_vocab_size = 3
    emb_dim = 2
    hidden_dim = 2
    num_labels = 2
    x_seq_input = C.sequence.input_variable((C.FreeDimension, input_vocab_size),
                                            is_sparse=True, name='features')
    label_seq_input = C.sequence.input_variable(num_labels, is_sparse=True,
                                                sequence_axis=C.Axis('label_sequence'),
                                                name='labels')
    with C.default_options(initial_state=0.1):
        model = C.layers.Embedding(emb_dim, name='embed')(x_seq_input)
        model = C.layers.Recurrence(C.layers.LSTM(hidden_dim), go_backwards=False)(model)
        model = C.sequence.last(model)
        model = C.to_sequence_like(model, label_seq_input)
        model = C.layers.Recurrence(C.layers.LSTM(hidden_dim), go_backwards=False)(model)
        model = C.layers.Dense(num_labels, name='classify')(model)

    z = model
    ce = C.cross_entropy_with_softmax(z, label_seq_input)

    seq1_data = [[[0, 1, 1], [0, 1, 0], [1, 0, 0]],
                 [[1, 1, 0], [0, 0, 1], [1, 0, 1]],
                 [[1, 0, 0], [0, 0, 1], [1, 1, 0]]]
    csr_seq1 = _to_csr(seq1_data)
    ndarrayview1 = C.NDArrayView.from_csr(csr_seq1, shape=(3, 3, 3), device=C.cpu())
    seq2_data = [[[0, 0, 1], [0, 1, 1], [1, 0, 1]],
                 [[0, 1, 0], [1, 0, 1], [0, 0, 0]]]
    csr_seq2 = _to_csr(seq2_data)
    ndarrayview2 = C.NDArrayView.from_csr(csr_seq2, shape=(2, 3, 3), device=C.cpu())
    x_seq_data = C.Value.create(C.sequence.input_variable((3, 3), is_sparse=True),
                                [ndarrayview1, ndarrayview2], device=C.cpu()).data

    seq1_label_data = [[0, 1], [0, 1], [1, 0]]
    seq2_label_data = [[1, 0], [0, 1]]
    label_seq_data = [_to_csr(seq1_label_data), _to_csr(seq2_label_data)]

    param_grads, loss_result = ce.grad({x_seq_input: x_seq_data,
                                        label_seq_input: label_seq_data},
                                       wrt=ce.parameters, outputs=[ce], as_numpy=False)

    loss_result = loss_result.as_sequences()

    # TODO: The tolerance here is inordinately high due to non-determinism in parameter
    # initialization: the individual tests are not run in separate processes, so adding or
    # removing tests changes the random initialization of parameters in all other tests
    # that do not explicitly specify a random seed. The tolerance should be lowered to
    # 0.01 after this issue in the test infrastructure has been fixed.
    absolute_tolerance = 0.02
    assert np.allclose(loss_result[0], [[0.63504], [0.673343], [0.698446]], atol=absolute_tolerance)
    assert np.allclose(loss_result[1], [[0.772344], [0.64295]], atol=absolute_tolerance)