def true_density(z):
    """Return the unnormalised target density ``exp(-u(z))``.

    ``z`` is indexed as ``z[0]`` and ``z[1]``. The energy ``u`` is the sum of
    a ring term, ``0.5 * ((||z|| - 4) / 0.4) ** 2``, and the negative log of
    two Gaussian bumps along the first coordinate, centred at +2 and -2 with
    width 0.8.
    """
    first, second = z[0], z[1]
    radius = C.sqrt(C.square(first) + C.square(second))

    # The two modes along the first coordinate.
    right_mode = C.exp(-0.5 * C.square((first - 2) / 0.8))
    left_mode = C.exp(-0.5 * C.square((first + 2) / 0.8))

    ring_energy = 0.5 * C.square(((radius - 4) / 0.4))
    log_mixture = C.log(right_mode + left_mode)
    energy = ring_energy - log_mixture
    return C.exp(-energy)
def instance_normalization(x):
    """Normalise ``x`` over axes 1 and 2, then apply ``scale`` and ``bias``.

    ``epsilon``, ``scale`` and ``bias`` are read from the enclosing scope.
    The mean and the standard deviation are both reduced over axes
    ``(1, 2)``; ``epsilon`` is added to the standard deviation (not to the
    variance) when it is non-zero. ``scale`` and ``bias`` are reshaped to
    ``(-1, 1, 1)`` before being applied.
    """
    reduce_axes = (1, 2)
    centered = x - C.reduce_mean(x, axis=reduce_axes)
    std = C.sqrt(C.reduce_mean(centered * centered, axis=reduce_axes))
    if epsilon != 0:
        std += epsilon

    normalized = centered / std
    broadcast_shape = (-1, 1, 1)
    scaled = normalized * C.reshape(scale, broadcast_shape)
    return scaled + C.reshape(bias, broadcast_shape)
def test_grad_custimized_root():
    """Check ``grad`` with an explicit ``grad_root`` on a combined function.

    ``sqrt(x)`` and ``log(x)`` are combined, but the gradient is requested
    only with respect to the ``sqrt`` output, so the expected values are
    ``1 / (2 * sqrt(x))`` for x = 1, 4, 16.
    """
    x = C.input_variable(shape=(1,), needs_gradient=True)
    sqrt_of_x = C.sqrt(x)
    log_of_x = C.log(x)
    combined = C.combine([sqrt_of_x.output, log_of_x.output])

    samples = np.asarray([1, 4, 16], dtype=np.float32).reshape(3, 1)
    actual = combined.grad({x: samples}, grad_root=sqrt_of_x.output)

    expected = np.asarray([[0.5], [0.25], [0.125]], dtype=np.float32)
    assert np.array_equal(actual, expected)
def test_grad_custimized_root():
    """Verify that ``grad`` differentiates only the requested ``grad_root``.

    The combined function exposes both ``sqrt(x)`` and ``log(x)``; the
    gradient is taken with ``grad_root`` set to the ``sqrt`` output, so the
    result must equal ``1 / (2 * sqrt(x))`` and must not include the
    ``log`` branch.
    """
    # ``C.input`` is deprecated in CNTK and warns on use; ``input_variable``
    # is the supported spelling and takes the same arguments.
    x = C.input_variable(shape=(1, ), needs_gradient=True)
    y = C.sqrt(x)
    y2 = C.log(x)
    combine = C.combine([y.output, y2.output])

    a = np.asarray([1, 4, 16], dtype=np.float32).reshape(3, 1)
    grads = combine.grad({x: a}, grad_root=y.output)

    # 1 / (2 * sqrt(x)) for x = 1, 4, 16; all exactly representable in
    # float32, so an exact comparison is safe here.
    expect_grad = np.asarray([[0.5], [0.25], [0.125]], dtype=np.float32)
    assert np.array_equal(grads, expect_grad)
def squash(input):
    """Rescale ``input`` by ``||s||^2 / (1 + ||s||^2) / sqrt(||s||^2 + epsilon)``.

    ``axis`` (the reduction axis for the squared norm) and ``epsilon`` are
    read from the enclosing scope.
    """
    # ||Sj||^2
    squared_norm = ct.reduce_sum(ct.square(input), axis=axis)

    # ||Sj||^2 / (1 + ||Sj||^2)
    shrink = ct.element_divide(squared_norm, ct.plus(1, squared_norm))

    # ||Sj||, with epsilon added under the square root
    norm = ct.sqrt(ct.plus(squared_norm, epsilon))

    scale = ct.element_divide(shrink, norm)
    return scale * input
def var(array,W=_W,B=None,square=0,sqrt=0,V=False,sizz=0):
    """Weighted deviation-from-mean reduction of ``array`` against ``W``.

    The code multiplies ``array`` by ``W``, subtracts a mean taken over the
    last two axes, optionally squares the deviation, adds ``B`` and sums
    over axes ``(-2, -1, -3)``. The result is reshaped, has axes 3 and 1
    swapped, and is asserted to have shape
    ``(array.shape[0], W.shape[0], array.shape[1], array.shape[2])``.

    Args:
        array: input operand; only ``.shape`` and ``array*W`` are used here.
        W: weights, defaulting to the module-level ``_W``.
            NOTE(review): presumably laid out as
            (outputs, channel, h, w) -- confirm against the caller.
        B: optional bias. When ``None`` a zero constant of shape
            ``W.shape[0:2]`` is created. It is then reshaped with trailing
            singleton axes up to the rank of ``W``.
        square: when truthy, the deviation ``mul-mean`` is squared.
        sqrt: when truthy, ``C.sqrt`` is applied to the reduced output.
        V: debug verbosity. Any truthy value prints intermediate values;
            ``V==2`` additionally prints ``i`` and ``di``.
        sizz: when ``1`` the mean divides by ``sum(W)`` over the last two
            axes; otherwise by the constant ``W.shape[-2] * W.shape[-1]``.

    Returns:
        The reduced CNTK expression ``out``.

    NOTE(review): indentation was reconstructed from a collapsed source
    line; verify the extent of each ``if`` body against the original file.
    """
    #W=tf.transpose(W, [0,2,3,1])
    arrs=array.shape
    ashp=W.shape
    sb=(W.shape[1],1,1)
    WV=W.shape[-2:]
    xi=(-2,-1)
    x2=(-2,-1,-3)
    if V:
        print(W.eval())
        print(arrs,ashp)
    mul=(array*W)
    if V:
        print('Wsamp',W[-1,-1].eval())
        print('array*w',(mul.eval())[0,-1])
    size=C.reduce_sum(W,axis=xi)#shape=(outputs, channel)
    if V:
        print("sizesamp",size.shape,size.eval())
    if B is None:
        B=C.constant(0,shape=W.shape[0:2],dtype=np.float32)#channel
    # Pad B with trailing singleton axes so it has the same rank as W.
    B=C.reshape(B,(*B.shape,*[1 for _ in range(len(ashp)-len(B.shape))]))
    if sizz==1:
        mean=C.reduce_sum(mul,axis=xi)/size
    else:
        mean=C.reduce_sum(mul,axis=xi)/C.constant(value=WV[0]*WV[1],shape=sb,dtype=np.float32)
    if V:
        print("meansamp",mean.eval()[0,-1])
    if square:
        i=(C.square(mul-mean)+B)
    else:
        i=(((mul)-mean)+B)
    # di is only consumed by the V==2 debug print below.
    di=i/size
    if V==2:
        print("i",i.eval(),"i")
        print("di",di.eval(),"di")
    if V:
        print('isamp',i.shape,i.eval()[-1,-1,])
    # NOTE(review): B was already added when building i, so it is added a
    # second time here -- confirm this is intended.
    out=C.reduce_sum(i+B,axis=x2)
    #out=np.rollaxis(np.sum(i+B,axis=x2),-1,1)
    # NOTE(review): this print (and the one below) runs regardless of V.
    print(out.shape)
    if sqrt:
        out=C.sqrt(out)
    out=C.swapaxes(C.reshape(out,out.shape[:4]), 3, 1)
    print(out.shape)
    assert out.shape==(arrs[0],ashp[0],arrs[1],arrs[2])
    return(out)
def attention(query, key, value):
    """Scaled dot-product attention of ``query`` over ``key`` / ``value``.

    ``obey_sequence_order`` and ``max_seq_len`` are read from the enclosing
    scope. When ``obey_sequence_order`` is truthy, scores above the diagonal
    are replaced by ``-1e+30`` before the softmax, which requires
    ``max_seq_len`` to be set.

    Raises:
        ValueError: if ``obey_sequence_order`` is truthy but ``max_seq_len``
            is not.
    """
    # cannot use sequence.last, will conflict with recurrence
    # dk: [#, *] [1, ] and value = int(dim_of_query)
    dk = C.sqrt(C.reduce_sum(C.ones_like(query)))

    keys_unpacked = C.sequence.unpack(
        key, padding_value=0, no_mask_output=True)  # [#] [-3, key_dim]
    values_unpacked = C.sequence.unpack(
        value, padding_value=0, no_mask_output=True)  # [#] [-3, value_dim]

    keys_per_step = C.sequence.broadcast_as(
        keys_unpacked, query)  # [#, *] [-3, key_dim]

    # [#, *] [q_dim] @ [#, *] [key_dim, -3], assert q_dim == key_dim
    # scores: [#, *] [-3, ] => for every key seq element, there is a
    # corresponding score
    scores = C.times_transpose(query, keys_per_step) / dk

    # mask out invalid temporal connections to obey_sequence_order
    if obey_sequence_order:
        if not max_seq_len:
            raise ValueError(
                "max_seq_len must be defined when obey_sequence_order is True")

        # score_matrix: [#] [-3, -3] <== will be zeroed above the diagonal
        # second output is the sequence mask: [#] [-3,]
        score_matrix, _ = C.sequence.unpack(scores, padding_value=0).outputs

        minus_inf = C.constant(-1e+30)
        lower_triangle = C.Constant(
            np.tril(np.ones((max_seq_len, max_seq_len)),
                    k=0))  # [] [max_seq, max_seq]
        lower_triangle = C.reconcile_dynamic_axes(
            lower_triangle, score_matrix)  # [#] [max_seq, max_seq]
        lower_triangle = C.crop_manual(
            lower_triangle, score_matrix, 0, 0)  # [#] [-3, -3]

        score_matrix = C.element_select(
            lower_triangle, score_matrix, minus_inf)  # [#] [-3, -3]
        scores = C.to_sequence_like(score_matrix, query)  # [#, *] [-3]

    weights = C.softmax(scores, axis=-1)
    values_per_step = C.sequence.broadcast_as(values_unpacked, query)
    attended = C.times(weights, values_per_step)  # [#, *] [value_dim,]
    return attended
def layer_normalization(inputs: C.Function,
                        name='layer_normalization',
                        epsilon: float = 1e-5) -> C.Function:
    """Apply layer normalisation with learnable scale and bias to ``inputs``.

    The computation is built on a placeholder and wrapped with
    ``C.as_block`` under ``name`` before being applied to ``inputs``.

    Args:
        inputs: the function to normalise; its ``shape`` determines the
            shape of the placeholder and of the ``scale`` / ``bias``
            parameters.
        name: block name; the placeholder is named ``name + '_ph'``.
        epsilon: small constant added to the standard deviation so that a
            constant input (zero variance) yields zeros instead of
            ``0 / 0 = NaN``. Pass ``0`` to disable it.

    Returns:
        The block applied to ``inputs``.
    """
    X = C.placeholder(
        inputs.shape,
        (C.Axis.default_batch_axis(), C.Axis.default_dynamic_axis()),
        name=name + '_ph')

    mu = C.reduce_mean(X, name='mu')
    sigma = C.sqrt(C.reduce_mean(C.square(X - mu)), name='sigma')
    if epsilon != 0:
        # Guard the division below against a zero standard deviation.
        sigma = sigma + epsilon

    result = (X - mu) / sigma

    #region scale + bias
    scale = C.parameter(inputs.shape, init=1, name='scale')
    bias = C.parameter(inputs.shape, init=0, name='bias')
    result = result * scale + bias
    #endregion

    block = C.as_block(result, [(X, X)], name)
    return block(inputs)
def sqrt(x, name=''):
    r'''
    Computes the element-wise square-root of `x`:

    :math:`sqrt(x) = {\sqrt[2]{x}}`

    Example:
        >>> C.eval(C.sqrt([0., 4.]))
        [array([[ 0. , 2.]])]

    Args:
        x: numpy array or any :class:`cntk.Function` that outputs a tensor
        name (str): the name of the node in the network

    Returns:
        :class:`cntk.Function`

    Note:
        CNTK returns zero for sqrt of negative numbers, this will be changed
        to return NaN
    '''
    # The docstring is a raw string because it contains the LaTeX ``\sqrt``;
    # in a normal string ``\s`` is an invalid escape sequence.
    # Imported under an alias so the local name does not shadow this function.
    from cntk import sqrt as _cntk_sqrt
    x = sanitize_input(x)
    return _cntk_sqrt(x, name).output()
def test_sqrt():
    """Compare ``C.sqrt`` through ``assert_cntk_ngraph_isclose`` for 1-D, 2-D and 3-D inputs."""
    vector = [0., 4.]
    matrix = [[1, 2], [3, 4]]
    cube = [[[1, 2], [3, 4]], [[1, 2], [3, 4]]]

    for operand in (vector, matrix, cube):
        assert_cntk_ngraph_isclose(C.sqrt(operand))
# Create CNTK inputs input = C.input_variable(input_dim) label = C.input_variable(num_output_classes) def create_model(features): with C.layers.default_options(init=C.glorot_uniform()): r = C.layers.Dense(num_output_classes, activation=None)(features) return r # Scale the input to 0-1 range by dividing each pixel by 255 # z represents the output of the network -> z = Wx' + b input_s = input / 255 squared_input = C.square(input_s) sqrted_input = C.sqrt(input_s) normalized_input = C.splice(input_s, squared_input, sqrted_input) z = create_model(normalized_input) # Define loss to minimize the cross-entropy between the label and predicted # probability by the network loss = C.cross_entropy_with_softmax(z, label) # Define the evaluation (metric) function to report how well our model is performing label_error = C.classification_error(z, label) # Instantiate the trainer object to drive the model training learning_rate = 0.2 lr_schedule = C.learning_rate_schedule(learning_rate, C.UnitType.minibatch) learner = C.sgd(z.parameters, lr_schedule)
def self_attention_layer(in_dims: int,
                         out_dims: int,
                         name='self_attention',
                         as_block: bool = False,
                         k_ph: bool = False,
                         v_ph: bool = False,
                         mask_opt: bool = False) -> C.Function:
    """Build a single-head self-attention function on a placeholder input.

    Args:
        in_dims: shape of the input placeholder.
        out_dims: output size of the q/k/v projections; its square root
            divides the score matrix.
        name: prefix for the node names created here.
        as_block: when true the result is wrapped with ``C.as_block``.
        k_ph, v_ph: both ``False`` -> keys and values are Dense projections
            of the input; both ``True`` -> keys and values are placeholders
            on a ``'kv_seq'`` dynamic axis. Any other combination raises.
        mask_opt: when truthy, the scaled scores are combined (element-wise
            minimum) with a mask built from ``triangular_matrix_seq``.

    Raises:
        Exception: if ``k_ph`` and ``v_ph`` are not both ``False`` or both
            ``True``.
    """
    score_divisor = C.Constant(C.sqrt(out_dims).eval(), name='sq_dims')

    X = C.placeholder(
        in_dims,
        (C.Axis.default_batch_axis(), C.Axis.default_dynamic_axis()),
        name=name + '_ph')

    projected_kv = k_ph is False and v_ph is False
    placeholder_kv = k_ph is True and v_ph is True

    if projected_kv:
        # Each Dense owns one (in_dims, out_dims) weight: W_Q, W_K, W_V.
        q = C.layers.Dense(out_dims, name=name + '_q')(X)
        k = C.layers.Dense(out_dims, name=name + '_k')(X)
        v = C.layers.Dense(out_dims, name=name + '_v')(X)
    elif placeholder_kv:
        q = C.layers.Dense(out_dims, name=name + '_q')(X)
        k = C.placeholder(
            out_dims,
            (C.Axis.default_batch_axis(), C.Axis('kv_seq')),
            name=name + '_k_ph')
        v = C.placeholder(
            out_dims,
            (C.Axis.default_batch_axis(), C.Axis('kv_seq')),
            name=name + '_v_ph')
    else:
        raise Exception(f'k_ph:{k_ph}, v_ph:{v_ph}')

    q_unpacked = C.sequence.unpack(q, 0, True, name=name + '_unpack_q')
    k_unpacked = C.sequence.unpack(k, 0, True, name=name + '_unpack_k')
    v_unpacked = C.sequence.unpack(v, 0, True, name=name + '_unpack_v')

    score_matrix = C.times_transpose(
        q_unpacked, k_unpacked, name=name + '_score_matrix')
    scaled = score_matrix / score_divisor  # div_k

    if mask_opt:
        triangle = triangular_matrix_seq(2)(X)
        inf_mask = -np.inf * (triangle - 0.5)
        inf_mask = C.as_block(inf_mask, [(X, X)], 'mask', 'mask')
        scaled = C.element_min(scaled, inf_mask)

    weights = C.softmax(scaled, name=name + '_softmax')
    attended = C.times(weights, v_unpacked, name=name + '_attention')
    result = C.to_sequence_like(attended, X)

    if not as_block:
        return result

    if projected_kv:
        return C.as_block(result, [(X, X)], 'self_attention',
                          'self_attention_')
    elif placeholder_kv:
        return C.as_block(result, [(X, X), (k, k), (v, v)],
                          'self_attention', 'self_attention_')
    else:
        raise Exception(f'k_ph:{k_ph} v_ph:{v_ph}')
def test_Sqrt(tmpdir, dtype):
    """Build ``C.sqrt`` on a constant of the given ``dtype`` and hand it to ``verify_no_input``."""
    operand = np.array([0., 4.]).astype(dtype)
    with C.default_options(dtype=dtype):
        sqrt_model = C.sqrt(operand)
        verify_no_input(sqrt_model, tmpdir, 'Sqrt_0')
def flow_forward(input_dim: int, act_func_pair: tuple = (None, None), batch_norm: bool = False):
    """Build one forward flow step: optional batch-norm, linear map, affine coupling.

    Args:
        input_dim: size of the input placeholder and of the square weight
            matrix ``W_rot_mat``.
        act_func_pair: ``(log_s_func, t_func)``; either entry left as
            ``None`` is replaced by a default three-layer Dense network.
        batch_norm: when truthy, the batch-norm branch below is added.

    Returns:
        ``(_Y, log_det_J, chunk)`` where ``_Y`` is the spliced output,
        ``log_det_J`` the accumulated log-determinant expression and
        ``chunk`` a dict of the pieces created here (``'input_dim'``,
        ``'W_rot_mat'``, ``'log_s_func'``, ``'t_func'``, ``'output'`` and,
        with ``batch_norm``, ``'mu'``, ``'var'``, ``'muB'``, ``'varB'``).

    NOTE(review): indentation was reconstructed from a collapsed source
    line; the ``if batch_norm:`` body is assumed to end before
    ``chunk['W_rot_mat']`` -- verify against the original file.
    """
    chunk = {}
    log_det_J = 0

    chunk['input_dim'] = input_dim
    _ph = C.placeholder(input_dim, name='place_holder')
    _out = _ph

    if batch_norm:
        # _bn = C.layers.BatchNormalization(name='batch_norm')(_ph)
        # chunk['scale'] = _bn.parameters[0]
        # chunk['bias'] = _bn.parameters[1]
        chunk['mu'] = C.Constant(np.zeros(shape=input_dim))
        chunk['var'] = C.Constant(np.ones(shape=input_dim))

        _eps = C.Constant(1e-7)
        # Mean and variance reduced over the default batch axis.
        _mu = C.reduce_mean(_ph, axis=C.Axis.default_batch_axis())
        _var = C.reduce_mean(C.square(_ph-_mu), axis=C.Axis.default_batch_axis())

        chunk['muB'] = _mu
        chunk['varB'] = _var

        # _bn = (_ph-chunk['mu'])/C.sqrt(chunk['var']+_eps)
        # NOTE(review): the transform uses the constants chunk['var'] /
        # chunk['mu'] (ones / zeros), while the log-det term below uses the
        # batch statistic _var -- confirm this asymmetry is intended.
        _bn = C.sqrt(chunk['var']+_eps)*_ph + chunk['mu']
        _ph = _bn

        log_det_J += -0.5*C.reduce_sum(C.log((_var+_eps)))
        # log_det_J += C.reduce_sum(C.log())

    chunk['W_rot_mat'] = _W = C.parameter((input_dim, input_dim))
    # Initialise the weight with a sample from special_ortho_group;
    # random_rotation_matrix is not used again in this function.
    _W.value = random_rotation_matrix = special_ortho_group.rvs(input_dim)
    # _W.value = np.roll(np.eye(input_dim),input_dim//2,axis=0)
    _out = _ph@_W
    # NOTE(review): confirm that `C.det` is available in the CNTK build used.
    log_det_J += C.log(C.abs(C.det(_W)))  # or
    # log_det_J += C.slogdet(_W)[1]

    # Split into two halves: _x2 conditions the scale/shift applied to _x1.
    _half_dim = input_dim//2
    _x1 = _out[:_half_dim]
    _x2 = _out[_half_dim:]

    _log_s_func, _t_func = act_func_pair
    if _log_s_func is None:  # basic network
        _log_s_func = C.layers.Sequential([
            C.layers.Dense(256, C.leaky_relu),
            C.layers.Dense(256, C.leaky_relu),
            C.layers.Dense(_half_dim, C.tanh),
        ])#(C.placeholder(input_dim, name='place_holder'))
    if _t_func is None:  # basic network
        _t_func = C.layers.Sequential([
            C.layers.Dense(256, C.leaky_relu),
            C.layers.Dense(256, C.leaky_relu),
            C.layers.Dense(_half_dim),
        ])#(C.placeholder(input_dim, name='place_holder'))

    chunk['log_s_func'] = _log_s_func
    chunk['t_func'] = _t_func

    _log_s, _t = _log_s_func(_x2), _t_func(_x2)
    _s = C.exp(_log_s)

    # Affine coupling: first half is scaled and shifted, second half passes
    # through unchanged.
    _y1 = _s*_x1 + _t
    _y2 = _x2

    _Y = C.splice(_y1, _y2)
    chunk['output'] = _Y

    log_det_J += C.reduce_sum(_log_s)

    return _Y, log_det_J, chunk
def test_Sqrt(tmpdir):
    """Build ``C.sqrt`` on a constant input and hand it to ``verify_no_input``."""
    operand = [0., 4.]
    sqrt_model = C.sqrt(operand)
    verify_no_input(sqrt_model, tmpdir, 'Sqrt_0')
# Define the layer dimensions num_hidden_layers = 2 hidden_layers_dim = 400 def create_model(features): with cntk.layers.default_options(init = cntk.glorot_uniform(), activation = cntk.ops.relu): input = features for _ in range(num_hidden_layers): input = cntk.layers.Dense(hidden_layers_dim)(input) r = cntk.layers.Dense(num_output_classes, activation = None)(input) return r # Scale the input to 0-1 range by dividing each pixel by 255. input_s_normalized = input/255.0 input_s_squared = cntk.square(input_s_normalized) input_s_sqrt = cntk.sqrt(input_s_normalized) z_model = create_model(input_s_normalized) # Define the loss function for is_training loss = cntk.cross_entropy_with_softmax(z_model, label) # Classification error evaluation label_error = cntk.classification_error(z_model, label) # Configure training parameters # Instantiate the trainer object to drive the model training learning_rate = 0.2 lr_schedule = cntk.learning_rate_schedule(learning_rate, cntk.UnitType.minibatch) # Schoastic Gradient Descent learner learner = cntk.sgd(z_model.parameters, lr_schedule)
# Fall back to the local "data/MNIST" directory when `data_dir` is missing.
# NOTE(review): indentation was reconstructed from a collapsed source line;
# the prints/savetxt calls are assumed to be outside this `if`.
if not os.path.exists(data_dir):
    data_dir = os.path.join("data", "MNIST")

# Write the train and test sets as text files under `data_dir`.
print('Writing train text file...')
savetxt(os.path.join(data_dir, "Train-28x28_cntk_text.txt"), train)

print('Writing test text file...')
savetxt(os.path.join(data_dir, "Test-28x28_cntk_text.txt"), test)

print('Done')

# Network inputs.
# NOTE(review): `input` shadows the Python builtin of the same name.
input = C.input_variable(input_dim)
label = C.input_variable(num_output_classes)

# Three feature views of the input scaled by 1/255: identity, square and
# square root, spliced into one tensor for the model.
normalize_input = input / 255.0
squared_input = C.square(input / 255.0)
sqrt_input = C.sqrt(input / 255.0)

z = create_model(C.splice(normalize_input, squared_input, sqrt_input))

# Loss and evaluation metric.
loss = C.cross_entropy_with_softmax(z, label)
label_error = C.classification_error(z, label)

# SGD learner and trainer.
lr_schedule = C.learning_parameter_schedule(learning_rate)
learner = C.sgd(z.parameters, lr_schedule)
trainer = C.Trainer(z, (loss, label_error), [learner])

data_found = False
def length(input):
    """Return ``sqrt(sum(input ** 2, axis=1) + epsilon)`` reshaped to ``(10, 1)``.

    ``epsilon`` is read from the enclosing scope.
    """
    squared_sum = ct.reduce_sum(ct.square(input), axis=1)
    norm = ct.sqrt(squared_sum + epsilon)
    return ct.reshape(norm, (10, 1))
def test_Sqrt(tmpdir, dtype):
    """Run ``verify_no_input`` on a ``C.sqrt`` model whose constant operand is cast to ``dtype``."""
    values = np.array([0., 4.])
    with C.default_options(dtype=dtype):
        typed_values = values.astype(dtype)
        model = C.sqrt(typed_values)
        verify_no_input(model, tmpdir, 'Sqrt_0')
#%% input = C.input_variable(input_dim) label = C.input_variable(num_output_classes) #%% def create_model(features): with C.layers.default_options(init=C.glorot_uniform()): r = C.layers.Dense(num_output_classes, activation=None)(features) return r #%% # Scale the input to 0-1 range by dividing each pixel by 255. input_s = input / 255 input_s = C.splice(input_s, C.sqrt(input_s), C.square(input_s)) z = create_model(input_s) #%% loss = C.cross_entropy_with_softmax(z, label) #%% label_error = C.classification_error(z, label) #%% # Instantiate the trainer object to drive the model training learning_rate = 0.2 lr_schedule = C.learning_rate_schedule(learning_rate, C.UnitType.minibatch) learner = C.sgd(z.parameters, lr_schedule) trainer = C.Trainer(z, (loss, label_error), [learner])
def gpt2_self_attention(token_dims: int, head_dims: int, mask_opt: bool = False, as_block: bool = False, name: str = 'self_attention'): X = C.placeholder(token_dims, dynamic_axes=(C.Axis.default_batch_axis(), C.Axis.default_dynamic_axis()), name=name) # q = C.layers.Dense(token_dims, name=name+'_q')(X) # k = C.layers.Dense(token_dims, name=name+'_k')(X) # v = C.layers.Dense(token_dims, name=name+'_v')(X) # attn_c_attn_w = C.parameter((token_dims,3*token_dims), name='attn_c_attn_w') # qkv = C.reshape(X@attn_c_attn_w, (3,-1), name='qkv') qkv = C.layers.Dense((3, token_dims), name='qkv')(X) q_seq, k_seq, v_seq = qkv[0], qkv[1], qkv[2] q_mh = C.reshape(q_seq, (head_dims, -1), name='multi_head_q') k_mh = C.reshape(k_seq, (head_dims, -1), name='multi_head_k') v_mh = C.reshape(v_seq, (head_dims, -1), name='multi_head_v') #region split multi head attention q_heads = [ C.squeeze(q_mh[i], name='simgle_head_q' + str(i)) for i in range(head_dims) ] k_heads = [ C.squeeze(k_mh[i], name='simgle_head_q' + str(i)) for i in range(head_dims) ] v_heads = [ C.squeeze(v_mh[i], name='simgle_head_q' + str(i)) for i in range(head_dims) ] #endregion attention_head = [] for i in range(head_dims): q = q_heads[i] k = k_heads[i] v = v_heads[i] #region score # q_ = C.sequence.last(q, name='last_q'+str(i)) # q present q_ = C.sequence.unpack(q, 0, True, name='seq_q' + str(i)) # q seq k_ = C.sequence.unpack(k, 0, True, name='seq_k' + str(i)) # k seq v_ = C.sequence.unpack(v, 0, True, name='seq_v' + str(i)) # v seq scores = C.times_transpose(q_, k_) scaled = scores * (1 / C.sqrt(v_.shape[-1])) #region mask opt mask = triangular_matrix_seq(2)(X) inf_mask = -np.inf * (mask - 0.5) inf_mask = C.as_block(inf_mask, [(X, X)], 'mask', 'mask') scaled = C.element_min(scaled, inf_mask) #endregion softmax = C.softmax(scaled) #endregion #region sum attention = C.times(softmax, v_) attention_seq = C.to_sequence_like(attention, X) #endregion attention_head.append(attention_seq) #region merge attention heads 
attention = C.splice(*attention_head, name='merged_attention') #endergion #region project project = C.layers.Dense(token_dims, name='project')(attention) #endregion if as_block: return C.as_block(project, [(X, X)], 'gpt2_self_attention', 'gpt2_self_attention') return project