def net(X):
    X = X.reshape((-1, num_inputs))
    h = relu(nd.dot(X, W1) + b1)
    h1 = relu(nd.dot(h, W2) + b2)
    h2 = relu(nd.dot(h1, W3) + b3)
    output = nd.dot(h2, W4) + b4
    return output
def symeig_svd(self, matrix, n_eigenvecs=None):
    """Computes a truncated SVD on `matrix` using symeig

    Uses symeig on matrix.T.dot(matrix) or its transpose

    Parameters
    ----------
    matrix : 2D-array
    n_eigenvecs : int, optional, default is None
        if specified, number of eigen[vectors-values] to return

    Returns
    -------
    U : 2D-array
        of shape (matrix.shape[0], n_eigenvecs)
        contains the left singular vectors
    S : 1D-array
        of shape (n_eigenvecs, )
        contains the singular values of `matrix`
    V : 2D-array
        of shape (n_eigenvecs, matrix.shape[1])
        contains the right singular vectors
    """
    # Check that matrix is... a matrix!
    if self.ndim(matrix) != 2:
        raise ValueError('matrix should be a matrix. matrix.ndim is %d != 2'
                         % self.ndim(matrix))

    dim_1, dim_2 = self.shape(matrix)
    if dim_1 <= dim_2:
        min_dim = dim_1
        max_dim = dim_2
    else:
        min_dim = dim_2
        max_dim = dim_1

    if n_eigenvecs is None:
        n_eigenvecs = max_dim

    if min_dim <= n_eigenvecs:
        if n_eigenvecs > max_dim:
            warnings.warn(
                'Trying to compute SVD with n_eigenvecs={0}, which '
                'is larger than max(matrix.shape)={1}. Setting '
                'n_eigenvecs to {1}'.format(n_eigenvecs, max_dim))
            n_eigenvecs = max_dim
        # we compute decomposition on the largest of the two to keep more eigenvecs
        dim_1, dim_2 = dim_2, dim_1

    if dim_1 < dim_2:
        U, S = nd.linalg.syevd(dot(matrix, transpose(matrix)))
        S = self.sqrt(S)
        V = dot(transpose(matrix), U / reshape(S, (1, -1)))
    else:
        V, S = nd.linalg.syevd(dot(transpose(matrix), matrix))
        S = self.sqrt(S)
        U = dot(matrix, V) / reshape(S, (1, -1))

    U, S, V = U[:, ::-1], S[::-1], transpose(V)[::-1, :]
    return U[:, :n_eigenvecs], S[:n_eigenvecs], V[:n_eigenvecs, :]
def function_set(self):
    # First convolutional layer
    # convolution
    h1_conv = nd.Convolution(
        data=self.__batch_X, weight=self.__W1, bias=self.__b1,
        kernel=self.__W1.shape[2:], num_filter=self.__W1.shape[0])
    # activation
    h1_activation = nd.relu(h1_conv)
    # pooling
    h1 = nd.Pooling(data=h1_activation, pool_type="max", kernel=(2, 2), stride=(2, 2))

    # Second convolutional layer
    h2_conv = nd.Convolution(
        data=h1, weight=self.__W2, bias=self.__b2,
        kernel=self.__W2.shape[2:], num_filter=self.__W2.shape[0])
    h2_activation = nd.relu(h2_conv)
    h2 = nd.Pooling(data=h2_activation, pool_type="max", kernel=(2, 2), stride=(2, 2))
    h2 = nd.flatten(h2)

    # First fully-connected layer
    h3_linear = nd.dot(h2, self.__W3) + self.__b3
    h3 = nd.relu(h3_linear)

    # Second fully-connected layer
    h4_linear = nd.dot(h3, self.__W4) + self.__b4

    # print("1st conv block:", h1.shape)
    # print("2nd conv block:", h2.shape)
    # print("1st dense:", h3.shape)
    # print("2nd dense:", h4_linear.shape)
    # print("output:", h4_linear)
    return h4_linear
def def_grads(prims):
    """ Define gradient function for primitives """
    identity = lambda x: x
    # dot
    prims('dot').def_grad(lambda ans, a, b: lambda g: ndarray.dot(g, b.T))
    prims('dot').def_grad(
        lambda ans, a, b: lambda g: ndarray.dot(a.T, g), argnum=1)
    # non-linear
    prims('tanh').def_grad(lambda ans, x: lambda g: g * (1 - ans ** 2))
    prims('exp').def_grad(lambda ans, x: lambda g: g * ans)
    prims('log').def_grad(lambda ans, x: lambda g: g / x)
    # reduce
    prims('sum').def_grad(_sum_grad)
    # + - * /
    prims('multiply').def_grad(
        lambda ans, x, y: _unbroadcast(ans, x, lambda g: g * y))
    prims('multiply').def_grad(
        lambda ans, x, y: _unbroadcast(ans, y, lambda g: x * g), argnum=1)
    prims('add').def_grad(lambda ans, x, y: _unbroadcast(ans, x, identity))
    prims('add').def_grad(
        lambda ans, x, y: _unbroadcast(ans, y, identity), argnum=1)
    prims('subtract').def_grad(
        lambda ans, x, y: _unbroadcast(ans, x, identity))
    prims('subtract').def_grad(
        lambda ans, x, y: _unbroadcast(ans, y, operator.neg), argnum=1)
    prims('divide').def_grad(
        lambda ans, x, y: _unbroadcast(ans, x, lambda g: g / y))
    prims('divide').def_grad(
        lambda ans, x, y: _unbroadcast(ans, y, lambda g: -g * x / (y * y)),
        argnum=1)
    prims('true_divide').def_grad(
        lambda ans, x, y: _unbroadcast(ans, x, lambda g: g / y))
    prims('true_divide').def_grad(
        lambda ans, x, y: _unbroadcast(ans, y, lambda g: -g * x / (y * y)),
        argnum=1)
    prims('maximum').def_grad(_maximum_grad_gen0)
    prims('maximum').def_grad(_maximum_grad_gen1, argnum=1)
    # TODO: minjie
    prims('max').def_grad_zero()
    # negate
    prims('negative').def_grad(lambda ans, x: operator.neg)
    prims('transpose').def_grad(lambda ans, x: mxnet.nd.transpose)
    prims('abs').def_grad(lambda ans, x: lambda g: mxnet.nd.sign(x) * g)
    prims('sign').def_grad_zero()
    prims('round').def_grad_zero()
    prims('ceil').def_grad_zero()
    prims('floor').def_grad_zero()
    prims('sqrt').def_grad(lambda ans, x: lambda g: g * 0.5 / mxnet.nd.sqrt(x))
    prims('sin').def_grad(lambda ans, x: lambda g: g * mxnet.nd.cos(x))
    prims('cos').def_grad(lambda ans, x: lambda g: -g * mxnet.nd.sin(x))
    prims('power').def_grad(
        lambda ans, x, y: _unbroadcast(
            ans, x, lambda g: g * y * mxnet.nd.power(x, y - 1)))
    prims('power').def_grad(
        lambda ans, x, y: _unbroadcast(
            ans, y, lambda g: g * mxnet.nd.log(x) * ans),
        argnum=1)
    prims('reshape').def_grad(
        lambda _0, x, _1: lambda g: NDArray.reshape(g, x.shape))
    prims('expand_dims').def_grad(
        lambda ans, x, axis: lambda g: NDArray.reshape(g, x.shape))
def function_set(self):
    def dropout(batch_X, drop_probability):
        keep_probability = 1 - drop_probability
        assert 0 <= keep_probability <= 1
        if keep_probability == 0:
            return batch_X.zeros_like()
        # keep a neuron's output only when its random draw falls below keep_probability
        mask = nd.random_uniform(
            0, 1.0, batch_X.shape, ctx=batch_X.context) < keep_probability
        # rescale so that E[dropout(batch_X)] == batch_X
        scale = 1 / keep_probability
        return mask * batch_X * scale

    # Dense layers need dropout; conv layers generally do not, since they already share weights
    h1 = dropout(
        nd.relu(
            nd.dot(self.__batch_X.reshape(
                (-1, self.__num_inputs)), self.__W1) + self.__b1),
        self.__drop_prob1)
    h2 = dropout(nd.relu(nd.dot(h1, self.__W2) + self.__b2), self.__drop_prob2)
    return nd.dot(h2, self.__W3) + self.__b3
def net(X):
    # -1 lets the number of rows be inferred; the number of columns is num_inputs
    X = X.reshape((-1, num_inputs))
    # hidden-layer output
    hidden1 = relu(nd.dot(X, W1) + b1)
    output = nd.dot(hidden1, W2) + b2
    return output
def check_KL(self):
    ph_act = nd.dot(self.enum_states, self.W) + self.hb
    vt = nd.dot(self.enum_states, self.vb)
    ht = nd.sum(-nd.log(nd.sigmoid(-ph_act)), axis=1)
    p_th = nd.softmax(vt + ht)
    KL = nd.sum(self.prob_states * nd.log(self.prob_states / p_th))
    return KL.asnumpy()[0]
def net(X, verbose=False):
    X = X.as_in_context(W1.context)
    # First convolutional layer
    h1_conv = nd.Convolution(data=X, weight=W1, bias=b1,
                             kernel=W1.shape[2:], num_filter=W1.shape[0])
    h1_activation = nd.relu(h1_conv)
    h1 = nd.Pooling(data=h1_activation, pool_type='max', kernel=(2, 2), stride=(2, 2))
    # Second convolutional layer
    h2_conv = nd.Convolution(data=h1, weight=W2, bias=b2,
                             kernel=W2.shape[2:], num_filter=W2.shape[0])
    h2_activation = nd.relu(h2_conv)
    h2 = nd.Pooling(h2_activation, pool_type="max", kernel=(2, 2), stride=(2, 2))
    h2 = nd.flatten(h2)
    # First fully-connected layer
    h3_linear = nd.dot(h2, W3) + b3
    h3 = nd.relu(h3_linear)
    # Second fully-connected layer
    h4_linear = nd.dot(h3, W4) + b4
    if verbose:
        print('1st conv block', h1.shape)
        print('2nd conv block', h2.shape)
        print('1st dense', h3.shape)
        print('2nd dense', h4_linear.shape)
        print('output:', h4_linear)
    return h4_linear
def def_grads(reg, prims):
    def identity(x):
        return x
    # dot
    prims('dot').def_grad(lambda ans, a, b: lambda g: ndarray.dot(g, b.T))
    prims('dot').def_grad(lambda ans, a, b: lambda g: ndarray.dot(a.T, g),
                          argnum=1)
    # non-linear
    # prims.tanh.def_grad(lambda ans, x: lambda g: g / np.cosh(x) ** 2)
    prims('exp').def_grad(lambda ans, x: lambda g: g * ans)
    prims('log').def_grad(lambda ans, x: lambda g: g / x)
    # reduce
    prims('sum').def_grad(lambda ans, x, axis=None, keepdims=False:
                          gen_sum_grad(ans, x, axis, keepdims))
    # + - * /
    prims('multiply').def_grad(
        lambda ans, x, y: unbroadcast(ans, x, lambda g: g * y))
    prims('multiply').def_grad(
        lambda ans, x, y: unbroadcast(ans, y, lambda g: x * g), argnum=1)
    prims('add').def_grad(lambda ans, x, y: unbroadcast(ans, x, identity))
    prims('add').def_grad(lambda ans, x, y: unbroadcast(ans, y, identity),
                          argnum=1)
    prims('subtract').def_grad(lambda ans, x, y: unbroadcast(ans, x, identity))
    prims('subtract').def_grad(
        lambda ans, x, y: unbroadcast(ans, y, operator.neg), argnum=1)
    prims('divide').def_grad(
        lambda ans, x, y: unbroadcast(ans, x, lambda g: g / y))
    prims('divide').def_grad(
        lambda ans, x, y: unbroadcast(ans, y, lambda g: -g * x / (y * y)),
        argnum=1)
    prims('true_divide').def_grad(
        lambda ans, x, y: unbroadcast(ans, x, lambda g: g / y))
    prims('true_divide').def_grad(
        lambda ans, x, y: unbroadcast(ans, y, lambda g: -g * x / (y * y)),
        argnum=1)
    # power
    # prims.power.def_grad(lambda ans, x, y: unbroadcast(ans, x, lambda g: g * y * x ** (y - 1)))
    # prims.power.def_grad(lambda ans, x, y: unbroadcast(ans, y, lambda g: g * ndarray.log(x) * x ** y), argnum=1)
    # mod
    # prims.mod.def_grad(lambda ans, x, y: unbroadcast(ans, x, identity))
    # prims.mod.def_grad(lambda ans, x, y: unbroadcast(ans, y, lambda g: -g * ndarray.floor(x / y)), argnum=1)
    # negate
    prims('negative').def_grad(lambda ans, x: operator.neg)
def def_grads(prims):
    """ Define gradient function for primitives """
    identity = lambda x: x
    # dot
    prims('dot').def_grad(lambda ans, a, b: lambda g: ndarray.dot(g, b.T))
    prims('dot').def_grad(lambda ans, a, b: lambda g: ndarray.dot(a.T, g),
                          argnum=1)
    # non-linear
    # prims.tanh.def_grad(lambda ans, x: lambda g: g / np.cosh(x) ** 2)
    prims('exp').def_grad(lambda ans, x: lambda g: g * ans)
    prims('log').def_grad(lambda ans, x: lambda g: g / x)
    # reduce
    prims('sum').def_grad(_sum_grad)
    # + - * /
    prims('multiply').def_grad(
        lambda ans, x, y: _unbroadcast(ans, x, lambda g: g * y))
    prims('multiply').def_grad(
        lambda ans, x, y: _unbroadcast(ans, y, lambda g: x * g), argnum=1)
    prims('add').def_grad(lambda ans, x, y: _unbroadcast(ans, x, identity))
    prims('add').def_grad(lambda ans, x, y: _unbroadcast(ans, y, identity),
                          argnum=1)
    prims('subtract').def_grad(
        lambda ans, x, y: _unbroadcast(ans, x, identity))
    prims('subtract').def_grad(
        lambda ans, x, y: _unbroadcast(ans, y, operator.neg), argnum=1)
    prims('divide').def_grad(
        lambda ans, x, y: _unbroadcast(ans, x, lambda g: g / y))
    prims('divide').def_grad(
        lambda ans, x, y: _unbroadcast(ans, y, lambda g: -g * x / (y * y)),
        argnum=1)
    prims('true_divide').def_grad(
        lambda ans, x, y: _unbroadcast(ans, x, lambda g: g / y))
    prims('true_divide').def_grad(
        lambda ans, x, y: _unbroadcast(ans, y, lambda g: -g * x / (y * y)),
        argnum=1)
    prims('maximum').def_grad(_maximum_grad_gen0)
    prims('maximum').def_grad(_maximum_grad_gen1, argnum=1)
    # TODO: minjie
    prims('max').def_grad_zero()
    # negate
    prims('negative').def_grad(lambda ans, x: operator.neg)
    prims('transpose').def_grad(lambda ans, x: mxnet.nd.transpose)
    prims('abs').def_grad(lambda ans, x: lambda g: mxnet.nd.sign(x) * g)
    prims('sign').def_grad_zero()
    prims('round').def_grad_zero()
    prims('ceil').def_grad_zero()
    prims('floor').def_grad_zero()
    prims('sqrt').def_grad(lambda ans, x: lambda g: g * 0.5 / mxnet.nd.sqrt(x))
    prims('sin').def_grad(lambda ans, x: lambda g: g * mxnet.nd.cos(x))
    prims('cos').def_grad(lambda ans, x: lambda g: -g * mxnet.nd.sin(x))
    prims('power').def_grad(lambda ans, x, y: _unbroadcast(
        ans, x, lambda g: g * y * mxnet.nd.power(x, y - 1)))
    prims('power').def_grad(lambda ans, x, y: _unbroadcast(
        ans, y, lambda g: g * mxnet.nd.log(x) * ans), argnum=1)
    prims('reshape').def_grad(
        lambda _0, x, _1: lambda g: NDArray.reshape(g, x.shape))
    prims('expand_dims').def_grad(
        lambda ans, x, axis: lambda g: NDArray.reshape(g, x.shape))
def net(x):
    x = x.reshape((-1, num_inputs))
    h1 = relu(nd.dot(x, w1) + b1)
    h1 = dropout(h1, drou_prop1)
    h10 = relu(nd.dot(h1, w10) + b10)
    h10 = dropout(h10, drou_prop2)
    output = nd.dot(h10, w2) + b2
    return output
def function_set(self):
    def batch_norm(X, gamma, beta, is_training, moving_mean, moving_variance,
                   eps=1e-5, moving_momentum=0.9):
        assert len(X.shape) in (2, 4)
        # fully connected: batch_size x feature
        if len(X.shape) == 2:
            # mean and variance of each input dimension over the batch
            mean = X.mean(axis=0)
            variance = ((X - mean) ** 2).mean(axis=0)
        # 2D convolution: batch_size x channel x height x width
        else:
            # per-channel mean and variance; keep the 4D shape so broadcasting works
            mean = X.mean(axis=(0, 2, 3), keepdims=True)
            variance = ((X - mean) ** 2).mean(axis=(0, 2, 3), keepdims=True)
            # reshape so the moving statistics broadcast correctly
            moving_mean = moving_mean.reshape(mean.shape)
            moving_variance = moving_variance.reshape(mean.shape)
        # normalization
        if is_training:
            X_hat = (X - mean) / nd.sqrt(variance + eps)
            # !!! update the global mean and variance:
            # each batch mixes 0.9 of the previous statistics with 0.1 of the current batch
            moving_mean[:] = moving_momentum * moving_mean + (1.0 - moving_momentum) * mean
            moving_variance[:] = moving_momentum * moving_variance + (1.0 - moving_momentum) * variance
        else:
            # !!! at test time, use the global mean and variance
            X_hat = (X - moving_mean) / nd.sqrt(moving_variance + eps)
        # scale and shift
        return gamma.reshape(mean.shape) * X_hat + beta.reshape(mean.shape)

    # First convolutional layer
    h1_conv = nd.Convolution(
        data=self.__batch_X, weight=self.__W1, bias=self.__b1,
        kernel=(5, 5), num_filter=20)
    # First BN
    h1_bn = batch_norm(
        h1_conv, self.__gamma1, self.__beta1, self.__is_training,
        self.__moving_mean1, self.__moving_variance1)
    h1_activation = nd.relu(h1_bn)
    h1 = nd.Pooling(
        data=h1_activation, pool_type="max", kernel=(2, 2), stride=(2, 2))

    # Second convolutional layer
    h2_conv = nd.Convolution(
        data=h1, weight=self.__W2, bias=self.__b2, kernel=(3, 3), num_filter=50)
    # Second BN
    h2_bn = batch_norm(
        h2_conv, self.__gamma2, self.__beta2, self.__is_training,
        self.__moving_mean2, self.__moving_variance2)
    h2_activation = nd.relu(h2_bn)
    h2 = nd.Pooling(data=h2_activation, pool_type="max", kernel=(2, 2), stride=(2, 2))
    h2 = nd.flatten(h2)

    # First fully-connected layer
    h3_linear = nd.dot(h2, self.__W3) + self.__b3
    h3 = nd.relu(h3_linear)
    # Second fully-connected layer
    h4_linear = nd.dot(h3, self.__W4) + self.__b4
    return h4_linear
def rnn(inputs, state, *params):
    H = state
    W_xh, W_hh, b_h, W_hy, b_y = params
    outputs = []
    for X in inputs:
        H = nd.tanh(nd.dot(X, W_xh) + nd.dot(H, W_hh) + b_h)
        Y = nd.dot(H, W_hy) + b_y
        outputs.append(Y)
    return (outputs, H)
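# A minimal usage sketch for the rnn above. The sizes, the random inputs and the
# parameter initialization are assumptions for illustration only, not taken from
# the original source.
from mxnet import nd

vocab_size, hidden_dim, batch_size, num_steps = 28, 256, 32, 5
W_xh = nd.random_normal(scale=0.01, shape=(vocab_size, hidden_dim))
W_hh = nd.random_normal(scale=0.01, shape=(hidden_dim, hidden_dim))
b_h = nd.zeros(hidden_dim)
W_hy = nd.random_normal(scale=0.01, shape=(hidden_dim, vocab_size))
b_y = nd.zeros(vocab_size)

# inputs is a list of num_steps batches, each of shape (batch_size, vocab_size);
# random values stand in for one-hot encoded token batches here.
inputs = [nd.random_uniform(shape=(batch_size, vocab_size)) for _ in range(num_steps)]
state = nd.zeros((batch_size, hidden_dim))
outputs, state = rnn(inputs, state, W_xh, W_hh, b_h, W_hy, b_y)
# len(outputs) == num_steps; each output has shape (batch_size, vocab_size),
# and state has shape (batch_size, hidden_dim).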
def network(self, X=None, debug=False):
    filters, kernels, stride, padding, dilate = self.conv_params['num_filter'], self.conv_params['kernel'], \
        self.conv_params['stride'], self.conv_params['padding'], self.conv_params['dilate']
    type_pool, kernels_pool, stride_pool, padding_pool, dilate_pool = self.pool_params['pool_type'], \
        self.pool_params['kernel'], self.pool_params['stride'], \
        self.pool_params['padding'], self.pool_params['dilate']
    act_type = self.act_params['act_type']
    hidden_dim = self.fc_params['hidden_dim']

    # CNN ####################################################################
    convlayer_out = X
    interlayer = []
    for i, (nf, k, S, P, D, t_p, k_p, S_p, P_p, D_p, a) in enumerate(
            zip(filters, kernels, stride, padding, dilate,
                type_pool, kernels_pool, stride_pool, padding_pool, dilate_pool,
                act_type)):
        W, b = self.params['W{:d}'.format(i + 1)], self.params['b{:d}'.format(i + 1)]
        convlayer_out = nd.Convolution(data=convlayer_out, weight=W, bias=b,
                                       kernel=k, num_filter=nf, stride=S, dilate=D)
        convlayer_out = activation(convlayer_out, act_type=a)
        convlayer_out = nd.Pooling(data=convlayer_out, pool_type=t_p,
                                   kernel=k_p, stride=S_p, pad=P_p)
        interlayer.append(convlayer_out)
        i_out = i
        if debug:
            print("layer{:d} shape: {}".format(i + 1, convlayer_out.shape))

    # MLP ####################################################################
    FClayer_out = nd.flatten(convlayer_out)
    interlayer.append(FClayer_out)
    if debug:
        print("After Flattened, Data shape: {}".format(FClayer_out.shape))
    for j, (hd, a) in enumerate(zip(hidden_dim, act_type[-len(hidden_dim):])):
        W, b = self.params['W{:d}'.format(j + i_out + 2)], self.params['b{:d}'.format(j + i_out + 2)]
        FClayer_out = nd.dot(FClayer_out, W) + b
        FClayer_out = activation(FClayer_out, act_type=a)
        if autograd.is_training():
            # apply dropout to the activation output
            FClayer_out = dropout(FClayer_out, self.drop_prob)
        if debug:
            print("layer{:d} shape: {}".format(j + i_out + 2, FClayer_out.shape))
        interlayer.append(FClayer_out)
        j_out = j

    # OUTPUT #################################################################
    W, b = self.params['W{:d}'.format(j_out + i_out + 3)], self.params['b{:d}'.format(j_out + i_out + 3)]
    yhat = nd.dot(FClayer_out, W) + b
    if debug:
        print("Output shape: {}".format(yhat.shape))
        print('------------')
    interlayer.append(yhat)
    return yhat, interlayer
def net(x, is_training=False):
    # w1, b1, w2, b2, w3, b3 = params = initParam(verbose=True)
    x = x.reshape(shape=(-1, num_input))  # (256, 784)
    # print(x.shape)
    x1 = nd.relu(nd.dot(x, w1) + b1)
    if is_training:
        x1 = dropout(x1, 0.8)
    x2 = nd.relu(nd.dot(x1, w2) + b2)
    if is_training:
        x2 = dropout(x2, 0.5)
    out = nd.dot(x2, w3) + b3
    return out
def net(X):
    X = X.reshape((-1, num_inputs))
    h1 = nd.dot(X, w1) + b1
    h1 = nd.relu(h1)
    h1 = dropout(h1, dropout_prob_1)
    h2 = nd.dot(h1, w2) + b2
    h2 = nd.relu(h2)
    h2 = dropout(h2, dropout_prob_2)
    y = nd.dot(h2, w3) + b3
    return y
def rnn(inputs, H):
    # inputs: a list of seq_len matrices, each of shape batch_size x vocab_size
    # H: a matrix of shape batch_size x num_hidden
    # outputs: a list of seq_len matrices, each of shape batch_size x vocab_size
    outputs = []
    for X in inputs:
        H = nd.tanh(nd.dot(X, Wxh) + nd.dot(H, Whh) + bh)
        Y = nd.dot(H, Why) + by
        outputs.append(Y)
    return (outputs, H)
def net(X):
    X = X.reshape((-1, num_inputs))
    # First fully-connected layer.
    h1 = nd.relu(nd.dot(X, W1) + b1)
    # Add a dropout layer after the first fully-connected layer.
    h1 = dropout(h1, drop_prob1)
    # Second fully-connected layer.
    h2 = nd.relu(nd.dot(h1, W2) + b2)
    # Add a dropout layer after the second fully-connected layer.
    h2 = dropout(h2, drop_prob2)
    return nd.dot(h2, W3) + b3
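# Several nets in this listing call a dropout(...) helper that is not shown here.
# The following is a minimal inverted-dropout sketch consistent with that usage;
# it is an assumption for illustration, not the original implementation.
from mxnet import nd

def dropout(X, drop_prob):
    assert 0 <= drop_prob <= 1
    keep_prob = 1 - drop_prob
    if keep_prob == 0:
        # drop everything
        return X.zeros_like()
    # keep each element with probability keep_prob, then rescale the survivors
    # so the expected value of the output equals X
    mask = nd.random_uniform(0, 1.0, X.shape, ctx=X.context) < keep_prob
    return mask * X / keep_prob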
def net(self, X):
    X = X.reshape(-1, self.num_inputs)
    H1 = (nd.dot(X, self.W1) + self.b1).relu()
    if autograd.is_training():
        H1 = dropout(H1, self.drop_prob1)
    H2 = (nd.dot(H1, self.W2) + self.b2).relu()
    if autograd.is_training():
        H2 = dropout(H2, self.drop_prob2)
    return nd.dot(H2, self.W3) + self.b3
def function_set(self):
    # relu = lambda x: nd.maximum(x, 0)
    def relu(x):
        return nd.maximum(x, 0)

    hidden_layer_before_act = nd.dot(
        self.__batch_X.reshape((-1, self.__num_input)), self.__w1) + self.__b1
    hidden_layer_after_act = relu(hidden_layer_before_act)
    output_layer_before_act = nd.dot(hidden_layer_after_act, self.__w2) + self.__b2
    return output_layer_before_act
def contrastive_divergence(self, input, lr=0.1, cdk=1, batch_size=None, shuffle=False):
    n_sample = input.shape[0]
    # default to full-batch updates when no batch size is given
    if batch_size is None or batch_size == 0:
        batch_size = n_sample
    labels = nd.ones([n_sample, 1], ctx=self.ctx)
    dataiter = mx.io.NDArrayIter(input, labels, batch_size, shuffle,
                                 last_batch_handle='discard')
    for batch in dataiter:
        sub = batch.data[0]
        ph_prob, ph_sample = self.sample_h_given_v(sub)
        chain_start = ph_sample
        for step in range(cdk):
            if step == 0:
                nv_prob, nv_sample, nh_prob, nh_sample = self.gibbs_hvh(chain_start)
            else:
                nv_prob, nv_sample, nh_prob, nh_sample = self.gibbs_hvh(nh_sample)
        if self.M_coeff > 0:
            self.dW *= self.M_coeff
            self.dv *= self.M_coeff
            self.dh *= self.M_coeff
            self.dW += (nd.dot(sub.T, ph_prob) - nd.dot(nv_sample.T, nh_prob)) * lr / batch_size
            self.dv += nd.mean(sub - nv_sample, axis=0) * lr
            self.dh += nd.mean(ph_prob - nh_prob, axis=0) * lr
        else:
            self.dW = (nd.dot(sub.T, ph_prob) - nd.dot(nv_sample.T, nh_prob)) * lr / batch_size
            self.dv = nd.mean(sub - nv_sample, axis=0) * lr
            self.dh = nd.mean(ph_prob - nh_prob, axis=0) * lr
        self.W += self.dW
        self.vb += self.dv
        self.hb += self.dh
        self.W_decay(lr)
    return
def lstm(x, h, c, Wxi, Wxf, Wxo, Whi, Whf, Who, Wxc, Whc, bi, bf, bo, bc):
    i = nd.sigmoid(nd.dot(x, Wxi) + nd.dot(h, Whi) + bi)       # input gate
    f = nd.sigmoid(nd.dot(x, Wxf) + nd.dot(h, Whf) + bf)       # forget gate
    o = nd.sigmoid(nd.dot(x, Wxo) + nd.dot(h, Who) + bo)       # output gate
    c_tilde = nd.tanh(nd.dot(x, Wxc) + nd.dot(h, Whc) + bc)    # candidate cell state
    c = f * c + i * c_tilde
    h = o * nd.tanh(c)
    return h, c
def bayes_forward(self, x, dense, loss, activation_fn=None, is_target=False):
    weight = self.get_sample(mu=dense.weight_mu.data(),
                             rho=dense.weight_rho.data(), is_target=is_target)
    bias = self.get_sample(mu=dense.bias_mu.data(),
                           rho=dense.bias_rho.data(), is_target=is_target)
    loss = loss + log_gaussian(x=weight, mu=dense.weight_mu.data(),
                               sigma=softplus(dense.weight_rho.data()))
    loss = loss + log_gaussian(x=bias, mu=dense.bias_mu.data(),
                               sigma=softplus(dense.bias_rho.data()))
    loss = loss - log_gaussian(x=weight, mu=0., sigma=self.sigma_prior)
    loss = loss - log_gaussian(x=bias, mu=0., sigma=self.sigma_prior)
    result = nd.dot(x, weight) + bias
    if activation_fn is None:
        return result
    elif activation_fn == 'relu':
        return nd.relu(result)
def forward(self, graph, ufeat, ifeat):
    """Forward function.

    Parameters
    ----------
    graph : DGLHeteroGraph
        "Flattened" user-movie graph with only one edge type.
    ufeat : mx.nd.NDArray
        User embeddings. Shape: (|V_u|, D)
    ifeat : mx.nd.NDArray
        Movie embeddings. Shape: (|V_m|, D)

    Returns
    -------
    mx.nd.NDArray
        Predicting scores for each user-movie edge.
    """
    graph = graph.local_var()
    ufeat = self.dropout(ufeat)
    ifeat = self.dropout(ifeat)
    graph.nodes['movie'].data['h'] = ifeat
    basis_out = []
    for i in range(self._num_basis_functions):
        graph.nodes['user'].data['h'] = F.dot(ufeat, self.Ps[i].data())
        graph.apply_edges(fn.u_dot_v('h', 'h', 'sr'))
        basis_out.append(graph.edata['sr'])
    out = F.concat(*basis_out, dim=1)
    out = self.rate_out(out)
    return out
def main():
    t1 = time.time()
    # generate the dataset
    num_features = 5
    total = 10000
    weights = [1.5, -3.4, -2.6, 7.2, -3.0]
    biases = 2.6
    X = nd.random_normal(shape=(total, num_features), ctx=ctx)
    Y = weights[0] * X[:, 0] + weights[1] * X[:, 1] + weights[2] * X[:, 2] \
        + weights[3] * X[:, 3] + weights[4] * X[:, 4] + biases
    # Y += nd.random_normal(shape=Y.shape)

    # initialize the parameters
    W_hat = nd.random_normal(shape=(num_features, 1), ctx=ctx)
    b_hat = nd.random_normal(shape=(1,), ctx=ctx)
    for i in [W_hat, b_hat]:
        i.attach_grad()

    # training
    epochs = 10
    lr = 0.001
    total_loss = 0
    for epoch in range(epochs):
        for x_, y_ in data_iter(X, Y):
            with ad.record():
                loss = compute_loss(nd.dot(x_, W_hat) + b_hat, y_)
            loss.backward()
            SGD([W_hat, b_hat], lr)
            total_loss += nd.sum(loss).asscalar()
        # print("Epoch %d, average loss: %f" % (epoch, total_loss / total))
    print(W_hat, b_hat)
    print(time.time() - t1)
def train_model(self):
    for param in self.__params:
        param.attach_grad()
    for e in range(self.__epochs):
        mean_train_loss = 0
        mean_test_loss = 0
        for self.__batch_X, self.__batch_y in self.train_iter():
            with autograd.record():
                self.__batch_y_hat = self.function_set()
                train_loss = self.goodness_of_function_loss_function()
            train_loss.backward()
            self.goodness_of_function_optimizer_function()
            mean_train_loss += nd.mean(train_loss).asscalar()

            test_y_hat = nd.dot(self.__X_test, self.__w) + self.__b
            test_loss = ((test_y_hat - self.__y_test) ** 2 / 2
                         + self.__lamda * ((self.__w ** 2).sum() + self.__b ** 2) / 2)
            mean_test_loss += nd.mean(test_loss).asscalar()
        print("Epoch %d, train average loss: %f" % (e, mean_train_loss / self.__num_train))
        print("Epoch %d, test average loss: %f" % (e, mean_test_loss / self.__num_test))
def getDate():
    true_w = nd.random_normal(shape=(num_input, 1)) * 0.01
    true_b = 0.05
    x = nd.zeros(shape=(num_train + num_test, num_input))
    y = nd.dot(x, true_w) + true_b
    y += nd.random_normal(shape=y.shape) * 0.01
    return x, y
def forward(self, inputs, is_target=False):
    result = None
    loss = 0.
    for _ in range(self.n_samples):
        tmp = inputs
        weights = []
        biases = []
        for i in range(len(self.weight_mus)):
            weights.append(self.get_sample(mu=self.weight_mus[i].data(),
                                           rho=self.weight_rhos[i].data(),
                                           is_target=is_target))
            biases.append(self.get_sample(mu=self.bias_mus[i].data(),
                                          rho=self.bias_rhos[i].data(),
                                          is_target=is_target))
            loss = loss + log_gaussian(x=weights[-1],
                                       mu=self.weight_mus[i].data(),
                                       sigma=softplus(self.weight_rhos[i].data()))
            loss = loss + log_gaussian(x=biases[-1],
                                       mu=self.bias_mus[i].data(),
                                       sigma=softplus(self.bias_rhos[i].data()))
            loss = loss - log_gaussian(x=weights[-1], mu=0., sigma=self.sigma_prior)
            # prior term for the biases (the original repeated the weight term here)
            loss = loss - log_gaussian(x=biases[-1], mu=0., sigma=self.sigma_prior)
        for i in range(len(weights)):
            tmp = nd.dot(tmp, weights[i]) + biases[i]
            if i != len(weights) - 1:
                tmp = nd.relu(tmp)
        if result is None:
            result = nd.zeros_like(tmp)
        result = result + tmp
    result = result / float(self.n_samples)
    loss = loss / float(self.n_samples)
    return result, loss
def bilinear(x, W, y, input_size, seq_len, batch_size, num_outputs=1,
             bias_x=False, bias_y=False):
    """
    Do xWy

    :param x: (input_size x seq_len) x batch_size
    :param W: (num_outputs x ny) x nx
    :param y: (input_size x seq_len) x batch_size
    :param input_size: input dimension
    :param seq_len: sequence length
    :param batch_size: batch size
    :param num_outputs: number of outputs
    :param bias_x: whether concat bias vector to input x
    :param bias_y: whether concat bias vector to input y
    :return: [seq_len_y x seq_len_x if output_size == 1
              else seq_len_y x num_outputs x seq_len_x] x batch_size
    """
    if bias_x:
        x = nd.concat(x, nd.ones((1, seq_len, batch_size)), dim=0)
    if bias_y:
        y = nd.concat(y, nd.ones((1, seq_len, batch_size)), dim=0)
    nx, ny = input_size + bias_x, input_size + bias_y
    # W: (num_outputs x ny) x nx
    lin = nd.dot(W, x)
    if num_outputs > 1:
        lin = reshape_fortran(lin, (ny, num_outputs * seq_len, batch_size))
    y = y.transpose([2, 1, 0])  # May cause performance issues
    lin = lin.transpose([2, 1, 0])
    blin = nd.batch_dot(lin, y, transpose_b=True)
    blin = blin.transpose([2, 1, 0])
    if num_outputs > 1:
        blin = reshape_fortran(blin, (seq_len, num_outputs, seq_len, batch_size))
    return blin
def __init__(self, true_w, true_b, num_inputs: int, num_examples: int, batch_size: int):
    self.features = nd.random.normal(scale=1, shape=(num_examples, num_inputs))
    self.labels = nd.dot(self.features, true_w) + true_b
    self.labels += nd.random.normal(scale=0.01, shape=self.labels.shape)
    self.batch_size = batch_size
def get_global_norm(arrays):
    ctx = arrays[0].context
    total_norm = nd.add_n(*[nd.dot(x, x).as_in_context(ctx)
                            for x in (arr.reshape((-1,)) for arr in arrays)])
    total_norm = nd.sqrt(total_norm).asscalar()
    return total_norm
def getData():
    true_w = nd.ones((num_input, 1)) * 0.01
    true_b = 0.05
    x = nd.random_normal(shape=(num_train + num_test, num_input))
    # y = nd.sum([0.01 * x[i] for i in range(num_train + num_test)])
    y = nd.dot(x, true_w) + true_b
    y += 0.01 * nd.random_normal(shape=y.shape)
    return x, y
def rnn(_inputs, initial_state, *parameters):
    # _inputs: a list with length num_steps,
    # corresponding element: batch_size * input_dim matrix
    H = initial_state
    W_xh, W_hh, b_h, W_hy, b_y = parameters
    _outputs = []
    for X in _inputs:
        # compute hidden state from input and last/initial hidden state
        H = nd.tanh(nd.dot(X, W_xh) + nd.dot(H, W_hh) + b_h)
        # compute output from hidden state
        Y = nd.dot(H, W_hy) + b_y
        _outputs.append(Y)
    return _outputs, H
def def_grads(reg, prims):
    def identity(x):
        return x
    # dot
    prims('dot').def_grad(lambda ans, a, b: lambda g: ndarray.dot(g, b.T))
    prims('dot').def_grad(lambda ans, a, b: lambda g: ndarray.dot(a.T, g),
                          argnum=1)
    # non-linear
    # prims.tanh.def_grad(lambda ans, x: lambda g: g / np.cosh(x) ** 2)
    prims('exp').def_grad(lambda ans, x: lambda g: g * ans)
    prims('log').def_grad(lambda ans, x: lambda g: g / x)
    # reduce
    prims('sum').def_grad(lambda ans, x, axis=None, keepdims=False:
                          gen_sum_grad(ans, x, axis, keepdims))
    # + - * /
    prims('multiply').def_grad(
        lambda ans, x, y: unbroadcast(ans, x, lambda g: g * y))
    prims('multiply').def_grad(
        lambda ans, x, y: unbroadcast(ans, y, lambda g: x * g), argnum=1)
    prims('add').def_grad(lambda ans, x, y: unbroadcast(ans, x, identity))
    prims('add').def_grad(lambda ans, x, y: unbroadcast(ans, y, identity),
                          argnum=1)
    prims('subtract').def_grad(lambda ans, x, y: unbroadcast(ans, x, identity))
    prims('subtract').def_grad(
        lambda ans, x, y: unbroadcast(ans, y, operator.neg), argnum=1)
    prims('divide').def_grad(
        lambda ans, x, y: unbroadcast(ans, x, lambda g: g / y))
    prims('divide').def_grad(
        lambda ans, x, y: unbroadcast(ans, y, lambda g: -g * x / (y * y)),
        argnum=1)
    prims('true_divide').def_grad(
        lambda ans, x, y: unbroadcast(ans, x, lambda g: g / y))
    prims('true_divide').def_grad(
        lambda ans, x, y: unbroadcast(ans, y, lambda g: -g * x / (y * y)),
        argnum=1)
    # mod
    # prims.mod.def_grad(lambda ans, x, y: unbroadcast(ans, x, identity))
    # prims.mod.def_grad(lambda ans, x, y: unbroadcast(ans, y, lambda g: -g * ndarray.floor(x / y)), argnum=1)
    # negate
    prims('negative').def_grad(lambda ans, x: operator.neg)
    prims('transpose').def_grad(lambda ans, x: mxnet.nd.transpose)
    prims('abs').def_grad(lambda ans, x: lambda g: mxnet.nd.sign(x) * g)
    prims('sign').def_grad_zero()
    prims('round').def_grad_zero()
    prims('ceil').def_grad_zero()
    prims('floor').def_grad_zero()
    prims('sqrt').def_grad(lambda ans, x: lambda g: g * 0.5 / mxnet.nd.sqrt(x))
    prims('sin').def_grad(lambda ans, x: lambda g: g * mxnet.nd.cos(x))
    prims('cos').def_grad(lambda ans, x: lambda g: -g * mxnet.nd.sin(x))
    prims('power').def_grad(
        lambda ans, x, y: unbroadcast(
            ans, x, lambda g: g * y * mxnet.nd.NDArray._power(x, y - 1)))
    prims('power').def_grad(
        lambda ans, x, y: unbroadcast(
            ans, y, lambda g: g * mxnet.nd.log(x) * ans),
        argnum=1)
    prims('reshape').def_grad(
        lambda _0, x, _1: lambda g: mxnet.nd.NDArray.reshape(g, x.shape))
def bilinear(x, W, y, input_size, seq_len, batch_size, num_outputs=1,
             bias_x=False, bias_y=False):
    """Do xWy

    Parameters
    ----------
    x : NDArray
        (input_size x seq_len) x batch_size
    W : NDArray
        (num_outputs x ny) x nx
    y : NDArray
        (input_size x seq_len) x batch_size
    input_size : int
        input dimension
    seq_len : int
        sequence length
    batch_size : int
        batch size
    num_outputs : int
        number of outputs
    bias_x : bool
        whether concat bias vector to input x
    bias_y : bool
        whether concat bias vector to input y

    Returns
    -------
    output : NDArray
        [seq_len_y x seq_len_x if output_size == 1
         else seq_len_y x num_outputs x seq_len_x] x batch_size
    """
    if bias_x:
        x = nd.concat(x, nd.ones((1, seq_len, batch_size)), dim=0)
    if bias_y:
        y = nd.concat(y, nd.ones((1, seq_len, batch_size)), dim=0)
    nx, ny = input_size + bias_x, input_size + bias_y
    # W: (num_outputs x ny) x nx
    lin = nd.dot(W, x)
    if num_outputs > 1:
        lin = reshape_fortran(lin, (ny, num_outputs * seq_len, batch_size))
    y = y.transpose([2, 1, 0])  # May cause performance issues
    lin = lin.transpose([2, 1, 0])
    blin = nd.batch_dot(lin, y, transpose_b=True)
    blin = blin.transpose([2, 1, 0])
    if num_outputs > 1:
        blin = reshape_fortran(blin, (seq_len, num_outputs, seq_len, batch_size))
    return blin
from mxnet import ndarray as nd
from mxnet import autograd
from mxnet import gluon

num_train = 20
num_test = 100
num_inputs = 200

true_w = nd.ones((num_inputs, 1)) * 0.01
true_b = 0.05

X = nd.random.normal(shape=(num_train + num_test, num_inputs))
y = nd.dot(X, true_w)
y += .01 * nd.random.normal(shape=y.shape)
X_train, X_test = X[:num_train, :], X[num_train:, :]
y_train, y_test = y[:num_train], y[num_train:]

import matplotlib as mpl
mpl.rcParams['figure.dpi'] = 120
import matplotlib.pyplot as plt

batch_size = 1
dataset_train = gluon.data.ArrayDataset(X_train, y_train)
data_iter_train = gluon.data.DataLoader(dataset_train, batch_size, shuffle=True)

square_loss = gluon.loss.L2Loss()

def test(net, X, y):
    return square_loss(net(X), y).mean().asscalar()
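# A minimal training sketch for the high-dimensional, 20-example setup above, using
# gluon's built-in weight decay ('wd'). The learning rate, epoch count and wd values
# below are assumptions for illustration, not taken from the original source.
def train(weight_decay):
    net = gluon.nn.Sequential()
    net.add(gluon.nn.Dense(1))
    net.initialize()
    trainer = gluon.Trainer(net.collect_params(), 'sgd',
                            {'learning_rate': 0.005, 'wd': weight_decay})
    for _ in range(10):  # epochs
        for data, label in data_iter_train:
            with autograd.record():
                loss = square_loss(net(data), label)
            loss.backward()
            trainer.step(batch_size)
    return test(net, X_train, y_train), test(net, X_test, y_test)

# train(0) tends to overfit the 20 training examples, while a positive weight decay,
# e.g. train(5), shrinks the weights and usually narrows the train/test gap.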
def net(X):
    return nd.dot(X, w) + b  # return the prediction value
class Config:
    def __init__(self):
        self.num_inputs = 3
        self.training_size = 1000
        self.batch_size = 10
        self.learning_rate = 1e-2
        self.num_epochs = 5

config = Config()

true_w = [2.5, 4.7, -3.2]
true_b = 2.9
X = nd.random_normal(shape=(config.training_size, config.num_inputs))
y = nd.dot(X, nd.array(true_w)) + true_b
y += 0.01 * nd.random_normal(shape=y.shape)

def data_generator(batch_size):
    index = list(range(config.training_size))
    random.shuffle(index)
    for i in range(0, config.training_size, batch_size):
        j = nd.array(index[i:min(i + batch_size, config.training_size)])
        yield nd.take(X, j), nd.take(y, j)

w = nd.random_normal(shape=(config.num_inputs, 1))
b = nd.zeros((1,))
parameters = [w, b]
from mxnet import ndarray as nd
from mxnet import autograd
from mxnet import gluon
import mxnet as mx

num_train = 20    # training-set size
num_test = 100    # test-set size
num_inputs = 200  # number of input features (the xi)

# true model parameters
true_w = nd.ones((num_inputs, 1)) * 0.01  # weights
true_b = 0.05                             # bias

# generate the dataset
X = nd.random.normal(shape=(num_train + num_test, num_inputs))  # inputs
y = nd.dot(X, true_w) + true_b              # y = 0.05 + sum(0.01 * xi)
y += .01 * nd.random.normal(shape=y.shape)  # noise: y = 0.05 + sum(0.01 * xi) + noise
X_train, X_test = X[:num_train, :], X[num_train:, :]  # rows 0-19 and 20-119
y_train, y_test = y[:num_train], y[num_train:]

# read the data in mini-batches
import random
batch_size = 1

def data_iter(num_examples):
    idx = list(range(num_examples))
    random.shuffle(idx)  # shuffle
    for i in range(0, num_examples, batch_size):
        j = nd.array(idx[i:min(i + batch_size, num_examples)])
        yield X.take(j), y.take(j)
def net(X):
    return nd.dot(X, w) + b
def net(X):
    X = X.reshape((-1, num_inputs))
    h1 = relu(nd.dot(X, W1) + b1)  # hidden-layer output after the nonlinear activation
    output = nd.dot(h1, W2) + b2
    return output
def net(X):
    X = X.reshape((-1, num_inputs))
    # note: the bias is added outside the dot product (the original added b1 to W1 inside it)
    h1 = relu(nd.dot(X, W1) + b1)
    output = nd.dot(h1, W2) + b2
    return output
def model(_input):
    return nd.dot(_input, w) + b
def net(X):
    return softmax(nd.dot(X.reshape((-1, num_inputs)), W) + b)  # y = softmax(X * W + b)
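# The softmax regression net above relies on a softmax(...) helper defined elsewhere.
# Below is a minimal (numerically naive) sketch consistent with that usage; it is an
# assumption for illustration, not the original implementation.
from mxnet import nd

def softmax(X):
    # exponentiate, then normalize each row so it sums to 1
    exp = nd.exp(X)
    partition = exp.sum(axis=1, keepdims=True)
    return exp / partition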