def test_squeeze(operand_shape, axis, device_id, precision):
    operand = np.arange(np.prod(operand_shape)).reshape(operand_shape).astype('f')
    expected = np.squeeze(operand, axis)

    expected_forward = [expected]
    expected_backward = {
        'arg': [np.ones_like(operand)],
    }

    from .. import squeeze, placeholder
    p = C.placeholder()
    squeeze_with_axis = C.squeeze(p, axis)
    _test_unary_op(precision, device_id, squeeze_with_axis, operand,
                   expected_forward, expected_backward)
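# Hedged usage sketch (not part of the original test file): a standalone check of the
# behavior the test above exercises, assuming CNTK 2.x and explicit imports. The axis
# value and shapes below are illustrative only.
import numpy as np
import cntk as C

x = C.input_variable((3, 1, 2))
y = C.squeeze(x, axes=1)                                   # drop the singleton axis
data = np.arange(6, dtype=np.float32).reshape(1, 3, 1, 2)  # leading batch axis of size 1
assert y.eval({x: data}).shape == (1, 3, 2)                # matches np.squeeze on the static shape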
def model(query, key, value):
    q = phi(query_linear(query))
    k = phi(key_linear(key))
    v = value_linear(value)

    # key and value should have the same sequence length
    k_unpacked = C.sequence.unpack(k, padding_value=0, no_mask_output=True)
    # k_unpacked: [#] [*kv=, model_dim]
    v_unpacked = C.sequence.unpack(v, padding_value=0, no_mask_output=True)
    # v_unpacked: [#] [*kv=, hidden_dim]

    kv = C.times(C.swapaxes(k_unpacked), v_unpacked)
    # kv: [#] [model_dim, hidden_dim]
    kv_broadcasted = C.sequence.broadcast_as(kv, q)  # this can be reused across queries
    # kv_broadcasted: [#, *] [model_dim, hidden_dim]

    numerator = C.squeeze(C.times(C.expand_dims(q, axis=C.Axis.new_leading_axis()), kv_broadcasted))
    # numerator: [#, *] [hidden_dim, ]
    denom = C.reduce_sum(q * C.sequence.broadcast_as(C.sequence.reduce_sum(k), q))
    # denom: [#, *] [1]

    return numerator / denom
def __init__(self, p, eps=1e-7):
    if isinstance(p, (C.Variable, C.Function)):
        self.p = C.squeeze(p)
    else:
        self.p = C.Constant(np.squeeze(p))

    self.eps = C.Constant(eps, name='eps')
    self.c = self.p.shape[0]

    self.prob = self.p / (self.eps + C.reduce_sum(self.p))
    self.logits = C.log(self.prob)
    self.accum_prob = self.prob @ C.Constant((1 - np.tri(self.prob.shape[-1], k=-1)))

    p_log_p = self.logits * self.prob
    self._entropy = -C.reduce_sum(p_log_p)

    dist = C.input_variable(1, name='category index')
    # method 1
    self._log_prob = C.log(C.reduce_sum(self.prob * C.one_hot(dist, self.c)))
def inner(a):
    # a: [#, *] [static_axes, num_classes]
    k_values, k_indices = C.top_k(a, k=k, axis=axis).outputs
    # k_indices: [#, *] [static_axes, k]
    b = C.one_hot(k_indices, num_classes)
    # b: [#, *] [static_axes, k, num_classes]
    valid_probabilities = C.squeeze(C.reduce_sum(b, axis=-2), axes=(-2,))
    # valid_probabilities: [#, *] [static_axes, num_classes]

    # the k largest probabilities are retained, everything else is set to -inf and will not be sampled
    minus_inf = C.constant(-1e+30)
    d = a * valid_probabilities
    e = C.element_select(d, d, minus_inf)
    # e: [#, *] [static_axes, num_classes]

    # sample from the top_k distribution once
    s = sample(e, axis=axis, name=name)
    # s: [#, *] [static_axes, num_classes]
    return s
def attention(encoded, network):
    abk = dense(network)
    a, b, k = gaussian_windows_attention_coefficients(abk, nb_mixtures)
    # print("abk shape:", a.shape, b.shape, k.shape)
    # a, b, k: [#, n] [nb_mixture, 1]
    # encoded: [#, c] [char_ohe]

    encoded_unpacked = C.sequence.unpack(encoded, padding_value=0, no_mask_output=True)
    # encoded_unpacked: [#] [*=c, char_ohe]

    u = Cx.sequence.position(encoded)  # position gives shape=(1, )
    # u: [#, c], [1]
    u_values, u_valid = C.sequence.unpack(u, padding_value=999_999).outputs
    # u_values: [#] [*=c, 1]
    # u_valid: [#] [*=c]
    u_values_broadcast = C.swapaxes(C.sequence.broadcast_as(u_values, k))
    # u_values_broadcast: [#, n] [1, *=c]
    u_valid_broadcast = C.sequence.broadcast_as(C.reshape(u_valid, (1, ), 1), k)
    # u_valid_broadcast: [#, n] [*=c, 1] ~ shape verified correct at this point

    # print("u_values_broadcast shape:", u_values_broadcast.shape)
    # print("abk shape:", a.shape, b.shape, k.shape)
    phi = window_weight(a, b, k, u_values_broadcast)
    # phi: [#, n] [*=c, 1]
    zero = C.constant(0)
    phi = C.element_select(u_valid_broadcast, phi, zero, name="phi")
    # phi: [#, n] [*=c, 1]
    attended = C.reduce_sum(phi * C.sequence.broadcast_as(encoded_unpacked, phi), axis=0)
    # attended: [#, n] [1, char_ohe]
    # print("attended_context shape:", attended_context.shape)
    output = C.squeeze(attended, name="GaussianWindowAttention")
    # output: [#, n] [char_ohe]
    return output
def crossentropy(y, t):
    prob = C.squeeze(C.reduce_sum(y * t, axis=0), 0)
    return -C.reduce_mean(C.unpack_batch(C.log(prob)))
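# Hedged check (standalone, assuming CNTK 2.x): evaluating the crossentropy helper above
# on a tiny batch. `y_in` and `t_in` are illustrative inputs declared over the batch axis only.
import numpy as np
import cntk as C

y_in = C.input_variable(3)
t_in = C.input_variable(3)
loss = crossentropy(y_in, t_in)

probs = np.array([[0.7, 0.2, 0.1], [0.1, 0.8, 0.1]], dtype=np.float32)
onehot = np.array([[1, 0, 0], [0, 1, 0]], dtype=np.float32)
print(loss.eval({y_in: probs, t_in: onehot}))  # mean of -log(0.7) and -log(0.8)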
def forward(x):
    y = C.times(x, theta1) + C.squeeze(bias1, 0)
    y = C.element_max(y, 0.)
    return C.times(y, theta2) + C.squeeze(bias2, 0)
def crossentropy(y, t):
    prob = C.squeeze(C.reduce_sum(y * t, axis=0), 0)
    return -C.reduce_mean(C.unpack_batch(C.log(prob)))


y = crossentropy(softmax(forward(x)), t)

batch_size = 20
for i in range(min(dataset_size, 100000) // batch_size):
    lr = 0.5 * (.1 ** (max(i - 100, 0) // 1000))
    sample = X[batch_size * i:batch_size * (i + 1)]
    target = labels[batch_size * i:batch_size * (i + 1)]
    g = y.grad({x: sample, t: target}, wrt=[theta1, bias1, theta2, bias2])
    for param, grad in g.items():
        param.value = param.value - grad * lr
    loss = y.eval({x: sample, t: target})
    print("cost {} - learning rate {}".format(loss, lr))

y = C.squeeze(C.argmax(forward(x), 0), 0)

accuracy = 0
for i in range(1000):
    sample = X[batch_size * i:batch_size * (i + 1)]
    target = labels[batch_size * i:batch_size * (i + 1)]
    tt = y.eval({x: sample})
    accuracy += np.sum(tt == np.argmax(target, axis=1))

print("Accuracy", accuracy / 1000. / batch_size)  # accuracy 99.36
def _(t, batch_dim=0):
    return cntk.squeeze(cntk.flatten(t, axis=batch_dim))
def inner(a):
    b = C.reshape(a, (n, -1))
    return tuple(C.squeeze(b[i]) for i in range(n))
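# Hedged usage sketch: the enclosing split helper is not shown, so `n` is assumed here.
# C.combine wraps the returned tuple into a single Function so all slices evaluate at once.
import numpy as np
import cntk as C

n = 3
x = C.input_variable(6)
parts = C.combine(list(inner(x)))                  # one output variable per row slice
results = parts.eval({x: np.arange(6, dtype=np.float32)})
# `results` maps each output to an array of shape (1, 2): a batch of one squeezed row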
def sample(self, n=1):
    samples = C.random.uniform((n, 1))
    indices = C.argmax(C.greater(self.accum_prob - samples, 0), axis=1)
    return C.squeeze(indices)
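# Hedged sketch (standalone, not using the class above): the same inverse-CDF sampling
# trick on a fixed probability vector, drawing one sample. CNTK 2.x is assumed; the
# cumulative-sum matrix mirrors the accum_prob construction from the distribution class.
import numpy as np
import cntk as C

prob = np.array([0.2, 0.5, 0.3], dtype=np.float32)
cdf_matrix = C.Constant((1 - np.tri(3, k=-1)).astype(np.float32))
accum_prob = C.times(C.Constant(prob), cdf_matrix)               # [0.2, 0.7, 1.0]
u = C.random.uniform((1,))                                       # one uniform draw in [0, 1)
index = C.squeeze(C.argmax(C.greater(accum_prob - u, 0), axis=0))
print(index.eval())                                              # sampled category index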
def gpt2_self_attention(token_dims: int, head_dims: int, mask_opt: bool = False,
                        as_block: bool = False, name: str = 'self_attention'):
    X = C.placeholder(token_dims,
                      dynamic_axes=(C.Axis.default_batch_axis(), C.Axis.default_dynamic_axis()),
                      name=name)

    # q = C.layers.Dense(token_dims, name=name+'_q')(X)
    # k = C.layers.Dense(token_dims, name=name+'_k')(X)
    # v = C.layers.Dense(token_dims, name=name+'_v')(X)

    # attn_c_attn_w = C.parameter((token_dims, 3*token_dims), name='attn_c_attn_w')
    # qkv = C.reshape(X@attn_c_attn_w, (3, -1), name='qkv')

    qkv = C.layers.Dense((3, token_dims), name='qkv')(X)
    q_seq, k_seq, v_seq = qkv[0], qkv[1], qkv[2]

    q_mh = C.reshape(q_seq, (head_dims, -1), name='multi_head_q')
    k_mh = C.reshape(k_seq, (head_dims, -1), name='multi_head_k')
    v_mh = C.reshape(v_seq, (head_dims, -1), name='multi_head_v')

    #region split multi head attention
    q_heads = [C.squeeze(q_mh[i], name='single_head_q' + str(i)) for i in range(head_dims)]
    k_heads = [C.squeeze(k_mh[i], name='single_head_k' + str(i)) for i in range(head_dims)]
    v_heads = [C.squeeze(v_mh[i], name='single_head_v' + str(i)) for i in range(head_dims)]
    #endregion

    attention_head = []
    for i in range(head_dims):
        q = q_heads[i]
        k = k_heads[i]
        v = v_heads[i]

        #region score
        # q_ = C.sequence.last(q, name='last_q'+str(i))  # q present
        q_ = C.sequence.unpack(q, 0, True, name='seq_q' + str(i))  # q seq
        k_ = C.sequence.unpack(k, 0, True, name='seq_k' + str(i))  # k seq
        v_ = C.sequence.unpack(v, 0, True, name='seq_v' + str(i))  # v seq

        scores = C.times_transpose(q_, k_)
        scaled = scores * (1 / C.sqrt(v_.shape[-1]))

        #region mask opt
        mask = triangular_matrix_seq(2)(X)
        inf_mask = -np.inf * (mask - 0.5)
        inf_mask = C.as_block(inf_mask, [(X, X)], 'mask', 'mask')
        scaled = C.element_min(scaled, inf_mask)
        #endregion

        softmax = C.softmax(scaled)
        #endregion

        #region sum
        attention = C.times(softmax, v_)
        attention_seq = C.to_sequence_like(attention, X)
        #endregion

        attention_head.append(attention_seq)

    #region merge attention heads
    attention = C.splice(*attention_head, name='merged_attention')
    #endregion

    #region project
    project = C.layers.Dense(token_dims, name='project')(attention)
    #endregion

    if as_block:
        return C.as_block(project, [(X, X)], 'gpt2_self_attention', 'gpt2_self_attention')

    return project