def visActProb(self):
    # negative phase
    super(DiscriminativeRBM, self).visActProb()
    self.v.apply_sigmoid()
    cm.dot(self.cW, self.h, target=self.c)
    self.c.add_col_vec(self.cb)
    softmax(self.c)
def test_softmax(self):
    np.testing.assert_array_almost_equal(
        softmax(np.array([[1001, 1002], [3, 4]])),
        np.array([[0.26894142, 0.73105858], [0.26894142, 0.73105858]]))
    np.testing.assert_array_almost_equal(
        softmax(np.array([[-1001, -1002]])),
        np.array([[0.73105858, 0.26894142]]))
    np.testing.assert_array_almost_equal(
        softmax(np.array([3, 4])),
        np.array([0.26894142, 0.73105858]))
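# A minimal sketch of a numerically stable softmax that would satisfy the tests
# above (an illustration, not any one project's actual implementation): shifting
# by the row-wise maximum keeps np.exp finite for inputs like 1001, and operating
# on the last axis handles both the 2-D and 1-D cases exercised by the assertions.
import numpy as np

def softmax(x):
    x = np.asarray(x, dtype=float)
    shifted = x - np.max(x, axis=-1, keepdims=True)  # stability shift, cancels in the ratio
    e = np.exp(shifted)
    return e / np.sum(e, axis=-1, keepdims=True)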
def getError(self, data, labels, batch):
    batchdata = data[:, batch*self.batchsize:(batch+1)*self.batchsize]
    batchlabels = labels[batch*self.batchsize:(batch+1)*self.batchsize]
    batchtargets = self.getTargets(batchlabels)
    results = self.forwardProp(batchdata)
    softmax(results)
    targets = cm.CUDAMatrix(batchtargets)
    results.subtract(targets)
    return results
def __call__(self, X):
    #A = T.exp(T.dot(T.exp(X[:-1]), self.w_trans)) + self.eps
    #B = T.sum(T.gammaln(A), axis=-1) - T.gammaln(T.sum(A, axis=-1))
    #L = T.sum((A-1)*X[1:], axis=-1) - B
    A = softmax(T.dot(T.exp(X[:-1]), self.w_trans))
    L = T.sum(A*X[1:], axis=-1)
    #A_init = T.exp(self.w_init) + self.eps
    #B_init = T.sum(T.gammaln(A_init)) - T.gammaln(T.sum(A_init))
    #L_init = T.sum((A_init-1)*X[0], axis=-1) - B_init
    A_init = softmax(self.w_init)
    L_init = T.sum(A_init*X[0], axis=-1)
    return T.concatenate([T.shape_padleft(L_init), L], axis=0)
def build(self):
    print 'building rnn cell...'
    hidden_layer = RNN(self.rng, self.n_input, self.n_hidden, self.n_batch,
                       self.x, self.Er, self.Ec, self.x_mask_r, self.x_mask_c,
                       is_train=self.is_train, p=self.p)
    print 'building softmax output layer...'
    [h_r, h_c] = hidden_layer.activation
    output_layer = softmax(self.n_hidden, self.cluster_num, self.in_cluster_num, h_r, h_c)
    cost_r = self.categorical_crossentropy(output_layer.activation_r, self.y[:, :, 0])
    cost_c = self.categorical_crossentropy(output_layer.activation_c, self.y[:, :, 1])
    cost = cost_r + cost_c
    self.params = [self.Er, self.Ec]
    self.params += hidden_layer.params
    self.params += output_layer.params
    lr = T.scalar('lr')
    gparams = [T.clip(T.grad(cost, p), -10, 10) for p in self.params]
    updates = self.optimizer(self.params, gparams, lr)
    self.train = theano.function(
        inputs=[self.x, self.x_mask_r, self.x_mask_c, self.y, self.y_mask,
                self.n_batch, lr],
        outputs=[cost],
        updates=updates,
        givens={self.is_train: np.cast['int32'](1)})
    self.getNLL = theano.function(
        inputs=[self.x, self.x_mask_r, self.x_mask_c, self.n_batch],
        outputs=[output_layer.activation_r, output_layer.activation_c],
        givens={self.is_train: np.cast['int32'](0)})
    self.predict = theano.function(
        inputs=[self.x, self.x_mask_r, self.x_mask_c, self.n_batch],
        outputs=[output_layer.predict_r, output_layer.predict_c],
        givens={self.is_train: np.cast['int32'](0)})
    self.test = theano.function(
        inputs=[self.x, self.x_mask_r, self.x_mask_c, self.y, self.y_mask,
                self.n_batch],
        outputs=cost,
        givens={self.is_train: np.cast['int32'](0)})
def forward(data, label, params, dimensions):
    """ runs a forward pass and returns the probability of the correct word for eval.
    label here is an integer for the index of the label.
    This function is used for model evaluation.
    """
    ### Unpack network parameters (do not modify)
    ofs = 0
    Dx, H, Dy = (dimensions[0], dimensions[1], dimensions[2])
    W1 = np.reshape(params[ofs:ofs + Dx * H], (Dx, H))
    ofs += Dx * H
    b1 = np.reshape(params[ofs:ofs + H], (1, H))
    ofs += H
    W2 = np.reshape(params[ofs:ofs + H * Dy], (H, Dy))
    ofs += H * Dy
    b2 = np.reshape(params[ofs:ofs + Dy], (1, Dy))

    # Compute the probability
    ### YOUR CODE HERE: forward propagation
    z1 = np.dot(data, W1) + b1
    h = sigmoid(z1)
    z2 = np.dot(h, W2) + b2
    y_hat = softmax(z2)
    # label is an integer class index, so the cross entropy reduces to the
    # negative log-probability of that class
    J = -np.log(y_hat[0, label])
    return {"z1": z1, "h": h, "z2": z2, "y_hat": y_hat, "J": J}
def predict(self, X):
    """
    Use the trained weights of this two-layer network to predict labels for
    data points. For each data point we predict scores for each of the C
    classes, and assign each data point to the class with the highest score.

    Inputs:
    - X: A numpy array of shape (N, D) giving N D-dimensional data points to
      classify.

    Returns:
    - y_pred: A numpy array of shape (N,) giving predicted labels for each of
      the elements of X. For all i, y_pred[i] = c means that X[i] is predicted
      to have class c, where 0 <= c < C.
    """
    ###########################################################################
    # TODO: Implement this function; it should be VERY simple!                #
    ###########################################################################
    # Compute the forward pass: ReLU hidden layer, then affine scores
    h1 = np.dot(X, self.params['W1']) + self.params['b1']
    scores_1 = np.maximum(0, h1)
    scores = np.dot(scores_1, self.params['W2']) + self.params['b2']
    ###########################################################################
    #                              END OF YOUR CODE                           #
    ###########################################################################
    return np.argmax(softmax(scores), axis=1)
def forward_backward_prop(data, labels, params, dimensions):
    ofs = 0
    Dx, H, Dy = dimensions
    W1 = np.reshape(params[ofs:ofs + Dx * H], (Dx, H))
    ofs += Dx * H
    b1 = np.reshape(params[ofs:ofs + H], (1, H))
    ofs += H
    W2 = np.reshape(params[ofs:ofs + H * Dy], (H, Dy))
    ofs += H * Dy
    b2 = np.reshape(params[ofs:ofs + Dy], (1, Dy))

    # Forward pass: by the broadcast rule, b1 extends across the M batch rows
    z1 = np.dot(data, W1) + b1
    h = sigmoid(z1)
    z2 = np.dot(h, W2) + b2
    y_hat = softmax(z2)
    cost = -np.sum(np.log(y_hat[labels == 1])) / data.shape[0]

    # Backward pass
    d3 = (y_hat - labels) / data.shape[0]
    gradW2 = np.dot(h.T, d3)
    gradb2 = np.sum(d3, 0, keepdims=True)
    dh = np.dot(d3, W2.T)
    grad_h = sigmoid_grad(h) * dh
    gradW1 = np.dot(data.T, grad_h)
    gradb1 = np.sum(grad_h, 0)

    grad = np.concatenate((gradW1.flatten(), gradb1.flatten(),
                           gradW2.flatten(), gradb2.flatten()))
    return y_hat, cost, grad
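# A hedged usage sketch for the forward_backward_prop variant above, assuming the
# CS224n-style setup: params is one flat vector, labels are one-hot rows, and a
# finite-difference checker (gradcheck_naive, assumed to exist with this signature)
# verifies the analytic gradient against the returned cost.
import numpy as np

N, dimensions = 20, (10, 5, 10)
data = np.random.randn(N, dimensions[0])
labels = np.zeros((N, dimensions[2]))
labels[np.arange(N), np.random.randint(0, dimensions[2], N)] = 1  # one-hot rows
params = np.random.randn((dimensions[0] + 1) * dimensions[1] +
                         (dimensions[1] + 1) * dimensions[2])
# this variant returns (y_hat, cost, grad), so slice off y_hat for the checker
gradcheck_naive(lambda p: forward_backward_prop(data, labels, p, dimensions)[1:], params)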
def predict(self, X):
    """
    Use the trained weights of this two-layer network to predict labels for
    data points. For each data point we predict scores for each of the C
    classes, and assign each data point to the class with the highest score.

    Inputs:
    - X: A numpy array of shape (N, D) giving N D-dimensional data points to
      classify.

    Returns:
    - y_pred: A numpy array of shape (N,) giving predicted labels for each of
      the elements of X. For all i, y_pred[i] = c means that X[i] is predicted
      to have class c, where 0 <= c < C.
    """
    ###########################################################################
    # TODO: Implement this function; it should be VERY simple!                #
    ###########################################################################
    temp1 = np.maximum(X.dot(self.params['W1']) + self.params['b1'], 0)
    # the bias must be added after the matrix product, not inside the dot
    temp2 = temp1.dot(self.params['W2']) + self.params['b2']
    y_pred = np.argmax(softmax.softmax(temp2), axis=1)
    ###########################################################################
    #                              END OF YOUR CODE                           #
    ###########################################################################
    return y_pred
def forward(data, label, params, dimensions):
    """ runs a forward pass and returns the probability of the correct word for eval.
    label here is an integer for the index of the label.
    This function is used for model evaluation.
    """
    # Unpack network parameters (do not modify)
    ofs = 0
    Dx, H, Dy = (dimensions[0], dimensions[1], dimensions[2])
    W1 = np.reshape(params[ofs:ofs + Dx * H], (Dx, H))
    ofs += Dx * H
    b1 = np.reshape(params[ofs:ofs + H], (1, H))
    ofs += H
    W2 = np.reshape(params[ofs:ofs + H * Dy], (H, Dy))
    ofs += H * Dy
    b2 = np.reshape(params[ofs:ofs + Dy], (1, Dy))

    # Compute the probability
    # YOUR CODE HERE: forward propagation
    z1 = np.dot(data, W1) + b1
    h = sigmoid(z1)
    z2 = np.dot(h, W2) + b2
    y_guess = softmax(z2)
    # END YOUR CODE

    return y_guess.T[label]
def forward(data, label, params, dimensions):
    """ runs a forward pass and returns the probability of the correct word for eval.
    label here is an integer for the index of the label.
    This function is used for model evaluation.
    """
    ### Unpack network parameters (do not modify)
    ofs = 0
    Dx, H, Dy = (dimensions[0], dimensions[1], dimensions[2])
    W1 = np.reshape(params[ofs:ofs + Dx * H], (Dx, H))
    ofs += Dx * H
    b1 = np.reshape(params[ofs:ofs + H], (1, H))
    ofs += H
    W2 = np.reshape(params[ofs:ofs + H * Dy], (H, Dy))
    ofs += H * Dy
    b2 = np.reshape(params[ofs:ofs + Dy], (1, Dy))

    # Compute the probability
    z = data.dot(W1) + b1
    h = sigmoid(z)
    theta = h.dot(W2) + b2
    y_hat = softmax(theta)
    return y_hat[0, label]
def SoftMaxLoss(x, y):
    """
    Computes the loss and gradient with respect to the input for a softmax classifier.

    Args:
      x: Input data.
      y: Labels of data.

    Returns:
      loss: Scalar softmax loss.
      dx: Gradient of the loss with respect to the input x.
    """
    ########################################################################################
    # TODO:                                                                                #
    # Compute softmax loss on input x and y and store it in loss variable. Compute gradient#
    # of the loss with respect to the input and store it in dx variable.                   #
    ########################################################################################
    logp = softmax.softmax(x)  # row-wise log-probabilities
    T = np.zeros(x.shape)
    T[np.arange(len(y)), y] = 1  # one-hot targets
    loss = -(T * logp).sum() / x.shape[0]
    # average the gradient over the batch so it matches the averaged loss
    dx = (np.exp(logp) - T) / x.shape[0]
    ########################################################################################
    #                               END OF YOUR CODE                                       #
    ########################################################################################
    return loss, dx
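# A small usage sketch for SoftMaxLoss, assuming softmax.softmax returns row-wise
# log-probabilities (which is why the loss multiplies the targets by logp and the
# gradient exponentiates it). The shapes below are illustrative.
import numpy as np

scores = np.random.randn(4, 3)   # 4 samples, 3 classes
y = np.array([0, 2, 1, 2])       # integer class labels
loss, dx = SoftMaxLoss(scores, y)
assert dx.shape == scores.shape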
def apply_cnn(images, labels):
    analyzed = 0
    detected = 0
    for idx, img in enumerate(images):
        conv_one_out = c_r.conv_relu_forward(img, c.conv1_weights, c.conv1_biases)
        mp_one_out = mp.maxpool_forward(conv_one_out)
        conv_two_out = c_r.conv_relu_forward(mp_one_out, c.conv2_weights, c.conv2_biases)
        mp_two_out = mp.maxpool_forward(conv_two_out)
        conv_three_out = c_r.conv_relu_forward(mp_two_out, c.conv3_weights, c.conv3_biases)
        mp_three_out = mp.maxpool_forward(conv_three_out)
        reshaped_out = mp_three_out.reshape(1, -1)
        fc_output = fc.fully_connected_forward(reshaped_out, c.local3_weights, c.local3_biases)
        output = s.softmax(fc_output)
        analyzed = analyzed + 1
        assumption = np.nanargmax(output)
        if assumption == labels[idx]:
            detected = detected + 1
    print('# Analyzed: ', analyzed)
    print('# Detected: ', detected)
    print('# Rate: ', (detected / analyzed) * 100, '%')
def forward_backward_prop(data, labels, params, dimensions):
    """ Forward and backward propagation for a two-layer sigmoidal network

    Forward propagation with a cross-entropy cost; backward propagation
    computes the gradients for all parameters.

    Arguments:
    data -- M x Dx matrix, where each row is a sample.
    labels -- M x Dy matrix, where each row is a one-hot vector.
    params -- model weights
    dimensions -- tuple of (input dimension, number of hidden units, output dimension)
    """
    ### Unpack the network weights
    ofs = 0  # offset used to slice out each weight block
    Dx, H, Dy = (dimensions[0], dimensions[1], dimensions[2])  # input dim, hidden units, output dim
    W1 = np.reshape(params[ofs:ofs + Dx * H], (Dx, H))  # input-layer weights W1, shape (Dx, H)
    ofs += Dx * H
    b1 = np.reshape(params[ofs:ofs + H], (1, H))  # input-layer bias b1, shape (1, H)
    ofs += H
    W2 = np.reshape(params[ofs:ofs + H * Dy], (H, Dy))  # hidden-layer weights W2, shape (H, Dy)
    ofs += H * Dy
    b2 = np.reshape(params[ofs:ofs + Dy], (1, Dy))  # hidden-layer bias b2, shape (1, Dy)

    ### Forward propagation
    M = np.shape(data)[0]
    z1 = np.dot(data, W1) + b1  # z1, shape (M, H)
    g1 = sigmoid(z1)            # g1, shape (M, H)
    z2 = np.dot(g1, W2) + b2    # z2, shape (M, Dy)
    g2 = softmax(z2)            # g2 is the network output, shape (M, Dy)
    cost = -np.sum(labels * np.log(g2))  # cross-entropy cost

    ### Backward propagation
    dW1 = data.T             # gradient of z1 w.r.t. W1
    db1 = np.ones([1, M])    # gradient of z1 w.r.t. b1
    dz1 = sigmoid_grad(g1)   # gradient of g1 w.r.t. z1
    dg1 = W2.T               # gradient of z2 w.r.t. g1
    dz2 = g2 - labels        # derivative of the cost w.r.t. z2
    dW2 = g1.T               # gradient of z2 w.r.t. W2
    db2 = np.ones([1, M])    # gradient of z2 w.r.t. b2
    gradW1 = np.dot(dW1, np.dot(dz2, dg1) * dz1)  # chain rule: cost w.r.t. W1
    gradb1 = np.dot(db1, np.dot(dz2, dg1) * dz1)  # chain rule: cost w.r.t. b1
    gradW2 = np.dot(dW2, dz2)                     # chain rule: cost w.r.t. W2
    gradb2 = np.dot(db2, dz2)                     # chain rule: cost w.r.t. b2

    ### Stack gradients
    grad = np.concatenate((gradW1.flatten(), gradb1.flatten(),
                           gradW2.flatten(), gradb2.flatten()))
    return cost, grad
def __init__(self, n_in, n_out, layers, decoder=linear.Linear, itype='int32',
             solver=solvers.RMSprop(0.01)):
    self.data = T.matrix(dtype=itype)
    self.x = self.data[:-1]
    self.y = self.data[1:]
    self.mask = T.matrix(dtype='int32')
    self.weights = []
    k, b = self.x.shape
    y_layer = self.x
    self.y_layers = []
    m = n_in
    for n in layers:
        layer = lstm.LSTM(m, n)
        self.weights.append(layer.weights)
        y0 = T.zeros((b, n))
        c0 = T.zeros((b, n))
        y_layer, _ = layer.scanl(y0, c0, y_layer)
        self.y_layers.append(y_layer)
        m = n
    decode = decoder(m, n_out)
    self.weights.append(decode.weights)
    yh = decode(y_layer)
    self.yh = softmax.softmax(yh)
    self.loss_t = T.sum(crossent.crossent(self.yh, self.y) * self.mask[1:])
    self.correct = T.sum(T.eq(T.argmax(self.yh, axis=2), self.y) * self.mask[1:])
    self.count = T.sum(self.mask[1:])
    self.solver = solver
    # compile theano functions
    self._loss = theano.function([self.data, self.mask],
                                 [self.loss_t, self.correct, self.count])
    self._activations = theano.function([self.data], self.y_layers + [self.yh],
                                        givens={self.x: self.data})
def forward_prop(X_in, W1, b1, W2, b2):
    X_in = activate(X_in, W1, b1)  # affine transform for data point 0 => 1*H
    X_in = sigmoid(X_in)           # output of the hidden nodes of the layer => 1*H
    X_in = activate(X_in, W2, b2)  # compute the next (here final) layer
    return softmax(X_in)
def evaluate(self):
    print("evaluating...")
    total = 0
    correct = 0
    num = 0
    for datas, labels in self.test_loader:
        num += 1
        datas = Variable(datas.type(dtype1))
        outputs = self.model(datas)
        outputs = outputs.type(torch.LongTensor)
        temp = outputs.data.numpy()
        s = softmax(temp)
        if abs(s[0][0] - s[0][1]) > 0.5:
            labels = labels.type(torch.LongTensor)
            _, predicted = torch.max(outputs.data, 1)
            total += labels.size(0)
            correct += (predicted == labels).sum()
        if num % 1000 == 0:
            print("%d test samples have been evaluated." % num)
    acc = 100 * correct / total
    print("total: %d" % total)
    self.writer.add_scalar('data/accuracy', acc, self.step)
    if self.save_log:
        with open(record_file, "a+") as f:
            print("Accuracy: %.4f %%" % acc, file=f)
    else:
        print("Accuracy: %.4f %%" % acc)
    self.save_model(self.interval)
def forward(self, x, t, batch_size):
    self.t = t
    self.y = Softmax.softmax(x, batch_size)
    self.loss = Min_batch_cross_entropy_error.cross_entropy_error(self.y, self.t)
    return self.loss  # this is the per-sample loss, not the loss over the whole batch
def forwardpass(X, W1, b1, W2, b2, activation="sigmoid"):
    if activation == "sigmoid":
        z1 = 1 / (1 + exp(-X.dot(W1) - b1))
    else:
        z1 = tanh(X.dot(W1) + b1)
    z2 = z1.dot(W2) + b2
    ret = softmax(z2)
    return ret, z1
def predict(self, x):
    w1, w2 = self.params['W1'], self.params['W2']
    b1, b2 = self.params['b1'], self.params['b2']
    a1 = np.dot(x, w1) + b1
    z1 = sigmoid(a1)
    a2 = np.dot(z1, w2) + b2
    y = softmax(a2)
    return y
def forward_backward_prop(data, labels, params, dimensions):
    """ Forward and backward propagation for a two-layer sigmoidal network

    Compute the forward propagation and the cross-entropy cost,
    and backward propagation for the gradients for all parameters.

    Arguments:
    data -- M x Dx matrix, where each row is a training example.
    labels -- M x Dy matrix, where each row is a one-hot vector.
    params -- Model parameters, these are unpacked for you.
    dimensions -- A tuple of input dimension, number of hidden units
                  and output dimension
    """
    ### Unpack network parameters (do not modify)
    ofs = 0
    Dx, H, Dy = (dimensions[0], dimensions[1], dimensions[2])

    W1 = np.reshape(params[ofs:ofs + Dx * H], (Dx, H))
    ofs += Dx * H
    b1 = np.reshape(params[ofs:ofs + H], (1, H))
    ofs += H
    W2 = np.reshape(params[ofs:ofs + H * Dy], (H, Dy))
    ofs += H * Dy
    b2 = np.reshape(params[ofs:ofs + Dy], (1, Dy))

    ### YOUR CODE HERE: forward propagation
    x = data
    y = labels
    h = sigmoid(np.matmul(x, W1) + b1)
    y_hat = softmax(np.matmul(h, W2) + b2)
    # natural log, so the cost is consistent with the y_hat - y gradient below
    cost = -np.sum(y * np.log(y_hat))
    ### END YOUR CODE

    ### YOUR CODE HERE: backward propagation
    y_diff = y_hat - y
    gradW2 = np.transpose(np.matmul(np.transpose(y_diff), h))
    gradb2 = np.expand_dims(np.sum(y_diff, axis=0), axis=0)
    gradW1 = np.matmul(np.transpose(x),
                       np.matmul(y_diff, np.transpose(W2)) * sigmoid_grad(h))
    gradb1 = np.expand_dims(np.sum(np.matmul(y_diff, np.transpose(W2)) * sigmoid_grad(h),
                                   axis=0), axis=0)
    ### END YOUR CODE

    ### Stack gradients (do not modify)
    grad = np.concatenate((gradW1.flatten(), gradb1.flatten(),
                           gradW2.flatten(), gradb2.flatten()))
    return cost, grad
def forward_backward_prop(data, labels, params, dimensions):
    """ Forward and backward propagation for a two-layer sigmoidal network

    Compute the forward propagation and the cross-entropy cost,
    and backward propagation for the gradients for all parameters.

    Arguments:
    data -- M x Dx matrix, where each row is a training example.
    labels -- M x Dy matrix, where each row is a one-hot vector.
    params -- Model parameters, these are unpacked for you.
    dimensions -- A tuple of input dimension, number of hidden units
                  and output dimension
    """
    ### Unpack network parameters (do not modify)
    ofs = 0
    Dx, H, Dy = (dimensions[0], dimensions[1], dimensions[2])

    W1 = np.reshape(params[ofs:ofs + Dx * H], (Dx, H))
    ofs += Dx * H
    b1 = np.reshape(params[ofs:ofs + H], (1, H))
    ofs += H
    W2 = np.reshape(params[ofs:ofs + H * Dy], (H, Dy))
    ofs += H * Dy
    b2 = np.reshape(params[ofs:ofs + Dy], (1, Dy))

    ### YOUR CODE HERE: forward propagation
    z1 = np.dot(data, W1) + b1  # z1 is an M x H matrix, one row per batch sample
    h = sigmoid(z1)             # h is also M x H; sigmoid applies elementwise
    z2 = np.dot(h, W2) + b2     # z2 is an M x Dy matrix
    y_hat = softmax(z2)         # y_hat is also an M x Dy matrix
    ### END YOUR CODE

    ### YOUR CODE HERE: backward propagation
    cost = -np.sum(labels * np.log(y_hat))
    delta_2 = y_hat - labels                           # an M x Dy matrix
    delta_1 = np.dot(delta_2, W2.T) * sigmoid_grad(h)  # an M x H matrix
    gradb1 = np.sum(delta_1, axis=0)  # 1 x H vector
    gradb2 = np.sum(delta_2, axis=0)  # 1 x Dy vector
    gradW1 = np.dot(data.T, delta_1)  # Dx x H matrix
    gradW2 = np.dot(h.T, delta_2)     # H x Dy matrix
    ### END YOUR CODE

    ### Stack gradients (do not modify)
    grad = np.concatenate((gradW1.flatten(), gradb1.flatten(),
                           gradW2.flatten(), gradb2.flatten()))
    return cost, grad
def predict(network, x):
    w1, w2, w3 = network['W1'], network['W2'], network['W3']
    b1, b2, b3 = network['b1'], network['b2'], network['b3']
    a1 = np.dot(x, w1) + b1
    z1 = sigmoid(a1)
    a2 = np.dot(z1, w2) + b2
    z2 = sigmoid(a2)
    a3 = np.dot(z2, w3) + b3
    y = softmax(a3)
    return y
def predict(self, x):
    W1, W2 = self.params['W1'], self.params['W2']
    b1, b2 = self.params['b1'], self.params['b2']
    # functions covered earlier
    a1 = np.dot(x, W1) + b1
    z1 = sigmoid(a1)
    a2 = np.dot(z1, W2) + b2
    y = softmax(a2)
    return y
def forward_backward_prop(data, labels, params, dimensions):
    """ Forward and backward propagation for a two-layer sigmoidal network

    Compute the forward propagation and the cross-entropy cost,
    and backward propagation for the gradients for all parameters.

    Arguments:
    data -- M x Dx matrix, where each row is a training example.
    labels -- M x Dy matrix, where each row is a one-hot vector.
    params -- Model parameters, these are unpacked for you.
    dimensions -- A tuple of input dimension, number of hidden units
                  and output dimension
    """
    ### Unpack network parameters (do not modify)
    ofs = 0
    Dx, H, Dy = (dimensions[0], dimensions[1], dimensions[2])

    W1 = np.reshape(params[ofs:ofs + Dx * H], (Dx, H))
    ofs += Dx * H
    b1 = np.reshape(params[ofs:ofs + H], (1, H))
    ofs += H
    W2 = np.reshape(params[ofs:ofs + H * Dy], (H, Dy))
    ofs += H * Dy
    b2 = np.reshape(params[ofs:ofs + Dy], (1, Dy))

    ### YOUR CODE HERE: forward propagation
    z = data.dot(W1) + b1
    h = sigmoid(z)
    theta = h.dot(W2) + b2
    y_hat = softmax(theta)
    # cross entropy in natural log, matching the y_hat - labels gradient below
    cost = -np.sum(labels * np.log(y_hat))
    ### END YOUR CODE

    ### YOUR CODE HERE: backward propagation
    delta_1 = y_hat - labels
    delta_2 = np.dot(delta_1, W2.transpose())
    delta_3 = np.multiply(delta_2, sigmoid_grad(h))
    gradW2 = np.dot(h.T, delta_1)
    gradb2 = np.sum(delta_1, axis=0)
    gradW1 = np.dot(data.T, delta_3)
    gradb1 = np.sum(delta_3, axis=0)
    ### END YOUR CODE

    ### Stack gradients (do not modify)
    grad = np.concatenate((gradW1.flatten(), gradb1.flatten(),
                           gradW2.flatten(), gradb2.flatten()))
    return cost, grad
def predict(network, x):
    W1, W2, W3 = network['W1'], network['W2'], network['W3']
    b1, b2, b3 = network['b1'], network['b2'], network['b3']
    a1 = np.dot(x, W1) + b1
    z1 = sig.sigmoid(a1)
    a2 = np.dot(z1, W2) + b2
    z2 = sig.sigmoid(a2)
    a3 = np.dot(z2, W3) + b3
    y = sm.softmax(a3)
    return y
def predict(network, X):
    W1, W2, W3 = network['W1'], network['W2'], network['W3']
    b1, b2, b3 = network['b1'], network['b2'], network['b3']
    A1 = np.dot(X, W1) + b1
    Z1 = sigmoid(A1)
    A2 = np.dot(Z1, W2) + b2
    Z2 = sigmoid(A2)
    A3 = np.dot(Z2, W3) + b3
    Y = softmax(A3)
    return Y
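# A hedged usage sketch for the three-layer predict pattern above and its variants
# elsewhere in this section; the layer sizes, weights, and inputs are illustrative
# assumptions, and softmax is assumed to act row-wise on 2-D input as in the tests
# earlier in this section.
import numpy as np

rng = np.random.RandomState(0)
network = {
    'W1': rng.randn(784, 50) * 0.01, 'b1': np.zeros(50),
    'W2': rng.randn(50, 100) * 0.01, 'b2': np.zeros(100),
    'W3': rng.randn(100, 10) * 0.01, 'b3': np.zeros(10),
}
X = rng.randn(5, 784)             # batch of 5 inputs
probs = predict(network, X)       # (5, 10) row-wise class probabilities
preds = np.argmax(probs, axis=1)  # predicted class per sample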
def softmaxCostAndGradient(predicted, target, outputVectors, dataset):
    """ Softmax cost function for word2vec models """

    # Implement the cost and gradients for one predicted word vector
    # and one target word vector as a building block for word2vec
    # models, assuming the softmax prediction function and cross
    # entropy loss.

    # Inputs:
    # - predicted: numpy ndarray, predicted word vector (\hat{v} in
    #   the written component or \hat{r} in an earlier version)
    # - target: integer, the index of the target word
    # - outputVectors: "output" vectors (as rows) for all tokens
    # - dataset: needed for negative sampling, unused here.

    # Outputs:
    # - cost: cross entropy cost for the softmax word prediction
    # - gradPred: the gradient with respect to the predicted word vector
    # - grad: the gradient with respect to all the other word vectors

    ### YOUR CODE HERE
    # The cost is the cross entropy of the softmax of the dot products;
    # see Assignment 1 3a.
    softmax_prob = softmax(predicted.dot(outputVectors.T))
    cost = -np.log(softmax_prob[target])  # cross entropy cost

    # In both gradients, the target entry of softmax_prob is reduced by 1
    # (expand the solution to see why).
    y_cap = softmax_prob
    y_cap[target] -= 1

    N = outputVectors.shape[0]  # number of output vectors
    D = outputVectors.shape[1]  # number of output dimensions

    # grad is the N x D matrix of gradients for the output vectors:
    #   (y_cap - 1) * v_c for the target row (the 1 was subtracted above),
    #   y_cap * v_c for every other row.
    grad = y_cap.reshape(N, 1) * predicted.reshape(1, D)

    # gradPred is the D-dimensional gradient for the predicted vector
    gradPred = (y_cap.reshape(1, N).dot(outputVectors)).flatten()
    ### END YOUR CODE

    return cost, gradPred, grad
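# A hedged usage sketch for softmaxCostAndGradient; the vocabulary size, embedding
# dimension, and vectors are illustrative assumptions. dataset is only needed for
# negative sampling, so None is passed here.
import numpy as np

N, D = 7, 4                       # vocabulary size, embedding dimension
outputVectors = np.random.randn(N, D)
predicted = np.random.randn(D)    # center-word vector v_c
cost, gradPred, grad = softmaxCostAndGradient(predicted, 2, outputVectors, None)
assert gradPred.shape == (D,) and grad.shape == (N, D)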
def click_softmax():
    # event handler for the softmax button
    print("softmax")
    learn, bat, epo = get_paramiters()
    X, Y, Y_one_hot, hypothesis, cost, optimizer = sf.softmax(learn)
    printbar.delete(1.0, END)
    sess = learning(epo, bat, mnist_train_x, mnist_train_y, cost, optimizer, X, Y, printbar)
    prediction = accuracy(hypothesis, Y_one_hot, mnist_test_x, mnist_test_y, sess, X, Y, printbar)
    details(sess, prediction, X, mnist_test_x, mnist_test_y, printbar)
def forward_backward_prop(data, labels, params, dimensions=[10, 5, 10]):
    """ Forward and backward propagation for a two-layer sigmoidal network """
    ###################################################################
    # Compute the forward propagation and for the cross entropy cost, #
    # and backward propagation for the gradients for all parameters.  #
    ###################################################################

    # Unpack network parameters (do not modify)
    t = 0
    W1 = np.reshape(params[t:t + dimensions[0] * dimensions[1]],
                    (dimensions[0], dimensions[1]))
    t += dimensions[0] * dimensions[1]
    b1 = np.reshape(params[t:t + dimensions[1]], (1, dimensions[1]))
    t += dimensions[1]
    W2 = np.reshape(params[t:t + dimensions[1] * dimensions[2]],
                    (dimensions[1], dimensions[2]))
    t += dimensions[1] * dimensions[2]
    b2 = np.reshape(params[t:t + dimensions[2]], (1, dimensions[2]))

    # YOUR CODE HERE: forward propagation
    Z1 = np.dot(data, W1) + b1  # broadcasting on b1
    A1 = sigmoid(Z1)
    Z2 = np.dot(A1, W2) + b2
    Yhat = softmax(Z2)  # network output
    index = labels == 1
    logYhat = np.log(Yhat)
    cost = -np.sum(logYhat[index])
    # END YOUR CODE

    # YOUR CODE HERE: backward propagation
    targets = np.zeros(np.shape(Yhat))
    targets[index] = 1
    dZ2 = Yhat - targets
    gradb2 = np.sum(dZ2, axis=0)
    gradW2 = np.dot(A1.T, dZ2)
    dA1 = dZ2.dot(W2.T)
    dZ1 = np.multiply(sigmoid_grad(A1), dA1)
    gradb1 = np.sum(dZ1, axis=0)
    gradW1 = np.dot(data.T, dZ1)
    # END YOUR CODE

    # Stack gradients (do not modify)
    grad = np.concatenate((gradW1.flatten(), gradb1.flatten(),
                           gradW2.flatten(), gradb2.flatten()))
    return cost, grad
def predict(NN, x):
    W1, W2, W3 = NN['W1'], NN['W2'], NN['W3']
    b1, b2, b3 = NN['b1'], NN['b2'], NN['b3']
    a1 = np.dot(x, W1) + b1
    z1 = sigmoid(a1)
    a2 = np.dot(z1, W2) + b2
    z2 = sigmoid(a2)
    a3 = np.dot(z2, W3) + b3
    y = softmax(a3)
    return y
def forward(network, X):
    W1, W2, W3 = network['W1'], network['W2'], network['W3']
    b1, b2, b3 = network['b1'], network['b2'], network['b3']
    a1 = transport_signal(X, W1, b1)
    z1 = sigmoid(a1)
    a2 = transport_signal(z1, W2, b2)
    z2 = sigmoid(a2)
    a3 = transport_signal(z2, W3, b3)
    return softmax(a3)
def predict(network, x):
    w1, w2, w3 = network["W1"], network["W2"], network["W3"]
    b1, b2, b3 = network["b1"], network["b2"], network["b3"]
    a1 = np.dot(x, w1) + b1
    z1 = sigmoid(a1)
    a2 = np.dot(z1, w2) + b2
    z2 = sigmoid(a2)
    a3 = np.dot(z2, w3) + b3
    y = softmax(a3)
    return y
def predict(network, X):
    W1, W2, W3 = network['W1'], network['W2'], network['W3']
    B1, B2, B3 = network['b1'], network['b2'], network['b3']
    A1 = np.dot(X, W1) + B1
    Z1 = sigmoid(A1)
    A2 = np.dot(Z1, W2) + B2
    Z2 = sigmoid(A2)
    A3 = np.dot(Z2, W3) + B3
    Y = softmax(A3)
    return Y
def postFeature(self):
    featurelist = os.listdir(self.featurefolder)
    for featurefile in featurelist:
        filename = os.path.join(self.featurefolder, featurefile)
        feature = scipy.io.loadmat(filename)
        featuremat = feature['data'].T  # each column is a feature
        pre_softmax = self.forwardProp(featuremat)
        postfeature_gpu = softmax(pre_softmax)
        postfeature_gpu.copy_to_host()
        postfeature = postfeature_gpu.numpy_array
        d = {}
        d['data'] = postfeature
        savefilename = os.path.join(self.postfeaturefolder, featurefile)
        scipy.io.savemat(savefilename, d)
def forward(self, states, ground_truth, final_label=False):
    err = 0.0
    outputs = []
    num = len(states)
    hidden_states = copy.copy(self.s)
    for idx, token in enumerate(states):
        hidden_states[0] = self.activation[0](
            np.dot(self.U[0], token) + np.dot(self.W[0], hidden_states[0]))
        for i in xrange(1, self.hidden_layers):
            hidden_states[i] = self.activation[0](
                np.dot(self.U[i], hidden_states[i-1]) + np.dot(self.W[i], hidden_states[i]))
        if idx == len(states) - 1 or not final_label:
            proj = np.dot(self.V, hidden_states[-1])
            soft = softmax.softmax(proj)
            logsoft = np.log(soft)
            err -= np.dot(ground_truth[0], logsoft) if final_label else np.dot(ground_truth[idx], logsoft)
            outputs.append(soft)
    err = err if final_label else err / num
    return err, outputs
def __init__(self, models, itype='int32', solver=solvers.RMSprop(0.01)):
    self.x = T.matrix(dtype=itype)
    self.mask = T.matrix(dtype='int32')
    self.y = T.vector(dtype=itype)
    self.weights = []
    self.logprobs = []
    self.labels = []
    for label, model in models:
        yh = theano.clone(model.yh, {model.x: self.x[:-1], model.y: self.x[1:]})
        logprob = -T.sum(crossent.crossent(yh, self.x[1:]) * self.mask[1:], axis=0)
        self.weights.extend(model.weights)
        self.logprobs.append(logprob)
        self.labels.append(label)
    self.logprobs = T.stack(self.logprobs, axis=1)
    self.yh = softmax.softmax(self.logprobs)
    self.loss_t = T.sum(crossent.crossent(self.yh, self.y))
    self.correct = T.sum(T.eq(T.argmax(self.yh, axis=1), self.y))
    self.count = self.y.size
    self.solver = solver
    # compile theano functions
    self._loss = theano.function([self.x, self.mask, self.y],
                                 [self.loss_t, self.correct, self.count])
def gradient(model, states, ground_truth, final_label=False):
    hidden_layers = model.hidden_layers
    neurons = model.size

    # Global states
    hidden_states = copy.copy(model.s)  # hidden states, should be deep copy
    lamb = copy.copy(model.s)           # lambda, same shape as the hidden states
    err = 0.0

    # gradients carried across the iteration
    dSdU = [[] for i in xrange(hidden_layers)]  # dSdU[n1][n2] = dS[n1]/dU[n2], n1 >= n2
    dSdW = [[] for i in xrange(hidden_layers)]  # dSdW[n1][n2] = dS[n1]/dW[n2], n1 >= n2
    dSds = [[] for i in xrange(hidden_layers)]  # dSds[n1][n2] = dS[n1]/ds[n2], n1 >= n2
    for n1 in xrange(hidden_layers):
        for n2 in xrange(n1 + 1):
            dSdU_n1n2 = np.zeros([neurons[n2+1], neurons[n2], neurons[n1+1]])    # dSdU[n1][n2][i,j,k] = dS[n1][k]/dU[n2][i,j]
            dSdW_n1n2 = np.zeros([neurons[n2+1], neurons[n2+1], neurons[n1+1]])  # dSdW[n1][n2][i,j,k] = dS[n1][k]/dW[n2][i,j]
            dSds_n1n2 = np.zeros([neurons[n1+1], neurons[n2+1]]) if n1 != n2 else np.eye(neurons[n1+1])  # dSds[n1][n2][i,j] = dS[n1][i]/ds[n2][j]
            dSdU[n1].append(dSdU_n1n2)
            dSdW[n1].append(dSdW_n1n2)
            dSds[n1].append(dSds_n1n2)

    weight = 1.0 if final_label else 1.0 / len(states)
    for idx, token in enumerate(states):
        # Save the old hidden states; they are needed to update the gradient
        hidden_states_old = copy.copy(hidden_states)

        # Forward propagation
        linear_comb = np.dot(model.U[0], token) + np.dot(model.W[0], hidden_states[0])
        hidden_states[0] = model.activation[0](linear_comb)
        lamb[0] = model.dactivation[0](linear_comb)
        for i in xrange(1, hidden_layers):
            linear_comb = np.dot(model.U[i], hidden_states[i-1]) + np.dot(model.W[i], hidden_states[i])
            hidden_states[i] = model.activation[i](linear_comb)
            lamb[i] = model.dactivation[i](linear_comb)

        # R[n][i,j] = dS[n][i]_t / dS[n-1][j]_t
        # S[n][i,j] = dS[n][i]_t / dS[n][j]_{t-1}
        R = []
        S = []
        for i in xrange(hidden_layers):
            Ri = np.dot(np.diag(lamb[i]), model.U[i])
            Si = np.dot(np.diag(lamb[i]), model.W[i])
            R.append(Ri)
            S.append(Si)
        for n1 in xrange(hidden_layers):
            for n2 in xrange(n1):
                dSdU[n1][n2] = batchProduct.nXone(dSdU[n1-1][n2], R[n1].T) + batchProduct.nXone(dSdU[n1][n2], S[n1].T)
                dSdW[n1][n2] = batchProduct.nXone(dSdW[n1-1][n2], R[n1].T) + batchProduct.nXone(dSdW[n1][n2], S[n1].T)
                dSds[n1][n2] = np.dot(dSds[n1-1][n2], R[n1].T) + np.dot(dSds[n1][n2], S[n1].T)
            dSdU[n1][n1] = batchProduct.nXone(dSdU[n1][n1], model.W[n1])
            dSdW[n1][n1] = batchProduct.nXone(dSdW[n1][n1], model.W[n1])
            for i in xrange(neurons[n1+1]):
                for j in xrange(neurons[n1+1]):
                    dSdW[n1][n1][i, j, i] += hidden_states_old[n1][j]
                for j in xrange(neurons[n1]):
                    if n1 > 0:
                        dSdU[n1][n1][i, j, i] += hidden_states[n1-1][j]
                    else:
                        dSdU[n1][n1][i, j, i] += token[j]
            dSdU[n1][n1] = batchProduct.nXone(dSdU[n1][n1], np.diag(lamb[n1]))
            dSdW[n1][n1] = batchProduct.nXone(dSdW[n1][n1], np.diag(lamb[n1]))
            dSds[n1][n1] = np.dot(np.diag(lamb[n1]), np.dot(model.W[n1], dSds[n1][n1]))

        # Supervised signal available -> update the gradient
        if idx == len(states) - 1 or not final_label:
            proj = np.dot(model.V, hidden_states[-1])
            soft = softmax.softmax(proj)
            logsoft = np.log(soft)
            token_truth = ground_truth[0] if final_label else ground_truth[idx]
            # !! accumulate term by term to avoid inf*0 = nan
            for dim_idx, token_truth_value in enumerate(token_truth):
                if token_truth_value > 1e-8:
                    err -= token_truth_value * logsoft[dim_idx] * weight
            # Update V
            dEdV = np.dot((soft - token_truth).reshape(neurons[-1], 1),
                          hidden_states[-1].reshape(1, neurons[-2]))
            model.gV += dEdV * weight
            # Update U, W, s
            dEdS = np.dot(model.V.T, (soft - token_truth).reshape(neurons[-1], 1))
            for n in xrange(hidden_layers):
                dEdUi = batchProduct.nXone(dSdU[-1][n], dEdS).squeeze()
                dEdWi = batchProduct.nXone(dSdW[-1][n], dEdS).squeeze()
                dEdsi = np.dot(dSds[-1][n].T, dEdS).squeeze()
                model.gU[n] += dEdUi * weight
                model.gW[n] += dEdWi * weight
                model.gs[n] += dEdsi * weight
    model.buffer += 1
    return err
# Evaluate model
correct_pred = tf.equal(tf.argmax(pred, 1), tf.argmax(y, 1))
accuracy = tf.reduce_mean(tf.cast(correct_pred, tf.float32))

# initializing
init = tf.initialize_all_variables()

with tf.Session() as sess:
    sess.run(init)
    step = 1
    i = 0
    while step * batch_size < training_iter:
        batch_x = train[i*batch_size*num_steps:((i+1)*batch_size*num_steps)]
        batch_x = batch_x.reshape((batch_size, num_steps, dim_input))
        batch_y = label[i*batch_size:((i+1)*batch_size)]
        if (i+1)*batch_size >= len(train):
            i = 0
        sess.run(optimizer, feed_dict={x: batch_x, y: batch_y})
        if step % display_step == 0:
            acc = sess.run(accuracy, feed_dict={x: batch_x, y: batch_y})
            loss = sess.run(cost, feed_dict={x: batch_x, y: batch_y})
            print "Iter " + str(step*batch_size) + ", Minibatch Loss= " + \
                  "{:.6f}".format(loss) + ", Training Accuracy= " + \
                  "{:.5f}".format(acc)
        step += 1

    test = test.reshape((1, num_steps, dim_input))
    test_pred = sess.run(pred, feed_dict={x: test})
    print softmax(test_pred)
def getPostProb(self, data):
    # get the posterior probabilities for a feature file
    results = self.forwardProp(data)  # pre-softmax results
    softmax(results)
    return results
def forward_backward_prop(dimensions, data, labels, params):
    """ Forward and backward propagation for a two-layer sigmoidal network """
    ###################################################################
    # Compute the forward propagation and for the cross entropy cost, #
    # and backward propagation for the gradients for all parameters.  #
    ###################################################################

    ### Unpack network parameters (do not modify)
    t = 0
    W1 = np.reshape(params[t:t + dimensions[0] * dimensions[1]],
                    (dimensions[0], dimensions[1]))
    t += dimensions[0] * dimensions[1]
    b1 = np.reshape(params[t:t + dimensions[1]], (1, dimensions[1]))
    t += dimensions[1]
    W2 = np.reshape(params[t:t + dimensions[1] * dimensions[2]],
                    (dimensions[1], dimensions[2]))
    t += dimensions[1] * dimensions[2]
    b2 = np.reshape(params[t:t + dimensions[2]], (1, dimensions[2]))

    ### YOUR CODE HERE: forward propagation
    # labels is (20, 10) (20 one-hot vectors) - this is y
    # data is (20, 10) - this is x
    # W1 is (10, 5), W2 is (5, 10), b1 is (1, 5), b2 is (1, 10)
    a = data.dot(W1) + b1
    h = sigmoid(a)                   # hidden layer
    y_hat = softmax(h.dot(W2) + b2)  # top classifier layer
    N, D = data.shape
    (Dx, H) = W1.shape
    cost_per_datapoint = -np.sum(labels * np.log(y_hat), axis=1).reshape((N, 1))  # sum over rows
    cost = np.sum(cost_per_datapoint)
    ### END YOUR CODE

    ### YOUR CODE HERE: backward propagation
    J_theta = y_hat - labels           # dJ/d(scores)
    h_a = h * (1.0 - h)                # sigmoid gradient
    y_hathw = J_theta.dot(W2.T) * h_a  # error propagated back to the hidden layer
    gradW2 = h.T.dot(J_theta)
    gradW1 = data.T.dot(y_hathw)
    gradb1 = np.sum(y_hathw, axis=0).reshape((1, H))
    gradb2 = np.sum(J_theta, axis=0).reshape((1, D))

    assert gradW1.shape == W1.shape
    assert gradb1.shape == b1.shape
    assert W2.shape == gradW2.shape
    assert gradb2.shape == b2.shape
    ### END YOUR CODE

    ### Stack gradients (do not modify)
    grad = np.concatenate((gradW1.flatten(), gradb1.flatten(),
                           gradW2.flatten(), gradb2.flatten()))
    return cost, grad
def hidActProb(self, vis, target):
    cm.dot(self.W.T, vis, target=target)
    target.add_col_vec(self.hb)
    softmax(target)
import sys
sys.path.insert(0, 'util/')
import softmax
import numpy as np

vectors = np.random.random([100, 100])
vector1 = vectors[0]
print softmax.softmax(vector1)
for i in xrange(100):
    vector = softmax.softmax(vectors[i])
    assert(np.sum(vector) > 0.9999 and np.sum(vector) < 1.0001)