def naiveSoftmaxLossAndGradient(centerWordVec, outsideWordIdx, outsideVectors, dataset): """ Naive Softmax loss & gradient function for word2vec models Implement the naive softmax loss and gradients between a center word's embedding and an outside word's embedding. This will be the building block for our word2vec models. Arguments: centerWordVec -- numpy ndarray, center word's embedding in shape (word vector length, ) (v_c in the pdf handout) outsideWordIdx -- integer, the index of the outside word (o of u_o in the pdf handout) outsideVectors -- outside vectors is in shape (num words in vocab, word vector length) for all words in vocab (U in the pdf handout) dataset -- needed for negative sampling, unused here. Return: loss -- naive softmax loss gradCenterVec -- the gradient with respect to the center word vector in shape (word vector length, ) (dJ / dv_c in the pdf handout) gradOutsideVecs -- the gradient with respect to all the outside word vectors in shape (num words in vocab, word vector length) (dJ / dU) """ ### YOUR CODE HERE (~6-8 Lines) dot_prods = outsideVectors.dot(centerWordVec) y_hat = softmax(dot_prods) loss = -np.log(y_hat)[outsideWordIdx] diff = y_hat diff[outsideWordIdx] -= 1 # y_hat-y gradCenterVec = diff.dot(outsideVectors) diff = diff.reshape(-1, 1) # Making y_hat-y a column vector vc = centerWordVec.reshape(1, -1) # Making centerWordVec a row vector gradOutsideVecs = diff.dot(vc) ### Please use the provided softmax function (imported earlier in this file) ### This numerically stable implementation helps you avoid issues pertaining ### to integer overflow. ### END YOUR CODE return loss, gradCenterVec, gradOutsideVecs
def naiveSoftmaxLossAndGradient(
    centerWordVec, outsideWordIdx, outsideVectors, dataset
):
    """ Naive Softmax loss & gradient function for word2vec models

    Implement the naive softmax loss and gradients between a center word's
    embedding and an outside word's embedding. This will be the building block
    for our word2vec models.

    Arguments:
    centerWordVec -- numpy ndarray, center word's embedding (v_c in the pdf handout)
    outsideWordIdx -- integer, the index of the outside word (o of u_o in the pdf handout)
    outsideVectors -- outside vectors (rows of matrix) for all words in vocab (U in the pdf handout)
    dataset -- needed for negative sampling, unused here.

    Return:
    loss -- naive softmax loss
    gradCenterVec -- the gradient with respect to the center word vector (dJ / dv_c in the pdf handout)
    gradOutsideVecs -- the gradient with respect to all the outside word vectors (dJ / dU)
    """

    ### YOUR CODE HERE

    ### Please use the provided softmax function (imported earlier in this file)
    ### This numerically stable implementation helps you avoid issues pertaining
    ### to integer overflow.
    y_hat = softmax(np.dot(outsideVectors, centerWordVec))
    loss = -np.log(y_hat[outsideWordIdx])

    diff = y_hat.copy()
    diff[outsideWordIdx] -= 1  # y_hat - y
    gradCenterVec = np.dot(diff, outsideVectors)
    gradOutsideVecs = np.dot(diff[:, np.newaxis], centerWordVec[:, np.newaxis].T)

    ### END YOUR CODE

    return loss, gradCenterVec, gradOutsideVecs
def naiveSoftmaxLossAndGradient(centerWordVec, outsideWordIdx, outsideVectors, dataset): """ Naive Softmax loss & gradient function for word2vec models Implement the naive softmax loss and gradients between a center word's embedding and an outside word's embedding. This will be the building block for our word2vec models. Arguments: centerWordVec -- numpy ndarray, center word's embedding (v_c in the pdf handout) outsideWordIdx -- integer, the index of the outside word (o of u_o in the pdf handout) outsideVectors -- outside vectors (rows of matrix) for all words in vocab (U in the pdf handout) dataset -- needed for negative sampling, unused here. Return: loss -- naive softmax loss gradCenterVec -- the gradient with respect to the center word vector (dJ / dv_c in the pdf handout) gradOutsideVecs -- the gradient with respect to all the outside word vectors (dJ / dU) """ ### YOUR CODE HERE N = np.shape(centerWordVec)[0] V = np.shape(outsideVectors)[0] y_pred = softmax(np.transpose(np.matmul(outsideVectors, centerWordVec))) y_label = np.zeros(shape=(V, )) y_label[outsideWordIdx] = 1 loss = -np.log(y_pred[outsideWordIdx, ]) gradCenterVec = np.matmul(np.transpose(outsideVectors), np.reshape(y_pred - y_label, (V, 1))) gradCenterVec = np.reshape(gradCenterVec, (N, )) gradOutsideVecs = np.matmul( np.repeat(np.reshape(centerWordVec, (N, 1)), V, axis=1), np.diag(y_pred - y_label)) gradOutsideVecs = np.transpose(gradOutsideVecs) ### Please use the provided softmax function (imported earlier in this file) ### This numerically stable implementation helps you avoid issues pertaining ### to integer overflow. ### END YOUR CODE return loss, gradCenterVec, gradOutsideVecs
def naiveSoftmaxLossAndGradient(
    centerWordVec, outsideWordIdx, outsideVectors, dataset
):
    """ Naive Softmax loss & gradient function for word2vec models

    Implement the naive softmax loss and gradients between a center word's
    embedding and an outside word's embedding. This will be the building block
    for our word2vec models.

    Arguments:
    centerWordVec -- numpy ndarray, center word's embedding (v_c in the pdf handout)
    outsideWordIdx -- integer, the index of the outside word (o of u_o in the pdf handout)
    outsideVectors -- outside vectors (rows of matrix) for all words in vocab (U in the pdf handout)
    dataset -- needed for negative sampling, unused here.

    Return:
    loss -- naive softmax loss
    gradCenterVec -- the gradient with respect to the center word vector (dJ / dv_c in the pdf handout)
    gradOutsideVecs -- the gradient with respect to all the outside word vectors (dJ / dU)
    """

    ### YOUR CODE HERE

    ### Please use the provided softmax function (imported earlier in this file)
    ### This numerically stable implementation helps you avoid issues pertaining
    ### to integer overflow.

    # the embedding is n-dimensional
    centerWordVec = centerWordVec.reshape(-1, 1)  # n * 1
    scores = np.dot(outsideVectors, centerWordVec).reshape(-1, )
    probs = softmax(scores).reshape(-1, 1)
    loss = -np.log(probs[outsideWordIdx, 0])

    gradCenterVec = -outsideVectors[outsideWordIdx, :] + np.dot(probs.T, outsideVectors)
    gradOutsideVecs = np.dot(probs, centerWordVec.T)
    gradOutsideVecs[outsideWordIdx, :] -= (centerWordVec).reshape(-1,)

    ### END YOUR CODE

    return loss, gradCenterVec, gradOutsideVecs
def naiveSoftmaxLossAndGradient(centerWordVec, outsideWordIdx, outsideVectors, dataset): """ Naive Softmax loss & gradient function for word2vec models Implement the naive softmax loss and gradients between a center word's embedding and an outside word's embedding. This will be the building block for our word2vec models. Arguments: centerWordVec -- numpy ndarray, center word's embedding (v_c in the pdf handout) outsideWordIdx -- integer, the index of the outside word (o of u_o in the pdf handout) outsideVectors -- outside vectors (rows of matrix) for all words in vocab (U in the pdf handout) dataset -- needed for negative sampling, unused here. Return: loss -- naive softmax loss gradCenterVec -- the gradient with respect to the center word vector (dJ / dv_c in the pdf handout) gradOutsideVecs -- the gradient with respect to all the outside word vectors (dJ / dU) """ ### YOUR CODE HERE ### Please use the provided softmax function (imported earlier in this file) ### This numerically stable implementation helps you avoid issues pertaining ### to integer overflow. probs = softmax(outsideVectors.dot(centerWordVec)) tj = (np.argmax(probs) == outsideWordIdx) yj = probs[outsideWordIdx] loss = -np.log(yj) Tj = np.zeros((probs.shape)) Tj[outsideWordIdx] = 1 gradOutsideVecs = (probs - Tj).reshape(-1, 1) * (centerWordVec).reshape( 1, -1) gradCenterVec = np.sum((probs - Tj).reshape(-1, 1) * outsideVectors, axis=0) ### END YOUR CODE return loss, gradCenterVec, gradOutsideVecs
def naiveSoftmaxLossAndGradient(centerWordVec, outsideWordIdx, outsideVectors, dataset): """ Naive Softmax loss & gradient function for word2vec models Implement the naive softmax loss and gradients between a center word's embedding and an outside word's embedding. This will be the building block for our word2vec models. Arguments: centerWordVec -- numpy ndarray, center word's embedding (v_c in the pdf handout) outsideWordIdx -- integer, the index of the outside word (o of u_o in the pdf handout) outsideVectors -- outside vectors (rows of matrix) for all words in vocab (U (|V| x n) in the pdf handout) dataset -- needed for negative sampling, unused here. Return: loss -- naive softmax loss gradCenterVec -- the gradient with respect to the center word vector (dJ / dv_c in the pdf handout) gradOutsideVecs -- the gradient with respect to all the outside word vectors (dJ / dU) """ # YOUR CODE HERE # Please use the provided softmax function (imported earlier in this file) # This numerically stable implementation helps you avoid issues pertaining # to integer overflow. scores = outsideVectors @ centerWordVec prob = softmax(scores)[:, np.newaxis] loss = float(-np.log(prob[outsideWordIdx])) trueOutsideVec = outsideVectors[outsideWordIdx] gradCenterVec = -trueOutsideVec + np.sum(outsideVectors * prob, axis=0) gradOutsideVecs = np.dot(prob, centerWordVec[:, np.newaxis].T) gradOutsideVecs[outsideWordIdx] -= centerWordVec # END YOUR CODE return loss, gradCenterVec, gradOutsideVecs
def naiveSoftmaxLossAndGradient(centerWordVec, outsideWordIdx, outsideVectors, dataset): """ Naive Softmax loss & gradient function for word2vec models Implement the naive softmax loss and gradients between a center word's embedding and an outside word's embedding. This will be the building block for our word2vec models. Arguments: centerWordVec -- numpy ndarray, center word's embedding (v_c in the pdf handout) outsideWordIdx -- integer, the index of the outside word (o of u_o in the pdf handout) outsideVectors -- outside vectors (rows of matrix) for all words in vocab (U in the pdf handout) dataset -- needed for negative sampling, unused here. Return: loss -- naive softmax loss gradCenterVec -- the gradient with respect to the center word vector (dJ / dv_c in the pdf handout) gradOutsideVecs -- the gradient with respect to all the outside word vectors (dJ / dU) """ ### YOUR CODE HERE ### Please use the provided softmax function (imported earlier in this file) ### This numerically stable implementation helps you avoid issues pertaining ### to integer overflow. temp = centerWordVec * outsideVectors temp = np.sum(temp, axis=1) soft = softmax(temp) loss = -1 * np.log(soft[outsideWordIdx]) gradCenterVec = np.sum(outsideVectors * soft.reshape(soft.shape[0], 1), axis=0) - outsideVectors[outsideWordIdx] gradOutsideVecs = soft.reshape(soft.shape[0], 1) * np.repeat( centerWordVec, repeats=outsideVectors.shape[0], axis=0) gradOutsideVecs[outsideWordIdx] -= centerWordVec.reshape( centerWordVec.shape[1]) ### END YOUR CODE return loss, gradCenterVec, gradOutsideVecs
def naiveSoftmaxLossAndGradient(
    centerWordVec, outsideWordIdx, outsideVectors, dataset
):
    """ Naive Softmax loss & gradient function for word2vec models

    Implement the naive softmax loss and gradients between a center word's
    embedding and an outside word's embedding. This will be the building block
    for our word2vec models.

    Arguments:
    centerWordVec -- numpy ndarray, center word's embedding (v_c in the pdf handout)
    outsideWordIdx -- integer, the index of the outside word (o of u_o in the pdf handout)
    outsideVectors -- outside vectors (rows of matrix) for all words in vocab (U in the pdf handout)
    dataset -- needed for negative sampling, unused here.

    Return:
    loss -- naive softmax loss
    gradCenterVec -- the gradient with respect to the center word vector (dJ / dv_c in the pdf handout)
    gradOutsideVecs -- the gradient with respect to all the outside word vectors (dJ / dU)
    """

    ### YOUR CODE HERE

    ### Please use the provided softmax function (imported earlier in this file)
    ### This numerically stable implementation helps you avoid issues pertaining
    ### to integer overflow.
    y_hat = softmax(centerWordVec @ outsideVectors.T)
    loss = -np.log(y_hat)[outsideWordIdx]

    ## compute derivative to center word
    ## referring to: https://courses.cs.ut.ee/MTAT.03.277/2015_fall/uploads/Main/word2vec.pdf
    Diff = y_hat.copy()
    Diff[outsideWordIdx] -= 1
    gradCenterVec = outsideVectors.T @ Diff
    gradOutsideVecs = np.expand_dims(Diff, axis=1) @ np.expand_dims(centerWordVec, axis=1).T

    ### END YOUR CODE

    return loss, gradCenterVec, gradOutsideVecs
def naiveSoftmaxLossAndGradient(centerWordVec, outsideWordIdx, outsideVectors, dataset): """ Naive Softmax loss & gradient function for word2vec models Implement the naive softmax loss and gradients between a center word's embedding and an outside word's embedding. This will be the building block for our word2vec models. Arguments: centerWordVec -- numpy ndarray, center word's embedding (v_c in the pdf handout) outsideWordIdx -- integer, the index of the outside word (o of u_o in the pdf handout) outsideVectors -- outside vectors (rows of matrix) for all words in vocab (U in the pdf handout) dataset -- needed for negative sampling, unused here. Return: loss -- naive softmax loss gradCenterVec -- the gradient with respect to the center word vector (dJ / dv_c in the pdf handout) gradOutsideVecs -- the gradient with respect to all the outside word vectors (dJ / dU) """ ### YOUR CODE HERE ### Please use the provided softmax function (imported earlier in this file) ### This numerically stable implementation helps you avoid issues pertaining ### to integer overflow. logits = np.matmul(np.transpose(outsideVectors), centerWordVec) y_bar = softmax(logits) yo_bar = y_bar[outsideWordIdx] loss = -np.log(yo_bar) N, V = outsideVectors.shape y = np.zeros(V) y[outsideWordIdx] = 1 gradCenterVec = np.matmul(np.transpose(outsideVectors), y_bar - y) gradOutsideVecs = np.outer(outsideVectors, y_bar - y) ### END YOUR CODE return loss, gradCenterVec, gradOutsideVecs
def naiveSoftmaxLossAndGradient(centerWordVec, outsideWordIdx, outsideVectors, dataset): """ Naive Softmax loss & gradient function for word2vec models Implement the naive softmax loss and gradients between a center word's embedding and an outside word's embedding. This will be the building block for our word2vec models. Arguments: centerWordVec -- numpy ndarray, center word's embedding (v_c in the pdf handout) outsideWordIdx -- integer, the index of the outside word (o of u_o in the pdf handout) outsideVectors -- outside vectors (rows of matrix) for all words in vocab (U in the pdf handout) dataset -- needed for negative sampling, unused here. Return: loss -- naive softmax loss gradCenterVec -- the gradient with respect to the center word vector (dJ / dv_c in the pdf handout) gradOutsideVecs -- the gradient with respect to all the outside word vectors (dJ / dU) """ # 정리 : 총 네가지를 인자로 받아 naive soft max loss를 계산하여 그 loss와 이를 바탕으로 gradient를 수정한 임베딩 벡터 두 개 반환 ### YOUR CODE HERE ### Please use the provided softmax function (imported earlier in this file) ### This numerically stable implementation helps you avoid issues pertaining ### to integer overflow. #softmax 취하고 이를 이용해 손실을 정의함. _softmax = softmax(np.dot(outsideVectors, centerWordVec)) #이 친구가 y_hat이다. loss = -np.log(_softmax[outsideWordIdx]) #handout에서 -log(y_hat_o) 라고 정의 #Gradient Descent _softmax[ outsideWordIdx] -= 1 # (y_hat - y)를 해 줌. 어차피 y는 원핫벡터이기 때문에 outsidewordidx가 있는 곳만 1이므로 그냥 y_hat에서 그 부분에서만 1빼주면 되는 것 gradCenterVec = np.dot( outsideVectors.T, _softmax ) # 원래는 U(y_hat-y)인데 위에서 y_hat-y 했으니까 그냥 U*y_hat 곱함. 차원 맞춰야해서 Transpose함. gradOutsideVecs = np.dot(np.expand_dims(_softmax, axis=1), np.expand_dims(centerWordVec, axis=0)) #리스트 형태를 행렬로 만들어주기 위해 expand_dims를 함으로써 각각 V*1과 1*N으로 만들고 둘을 곱함으로써 최종 결과로 outsidevector의 차원인 V*N이 나옴. ### END YOUR CODE return loss, gradCenterVec, gradOutsideVecs
def naiveSoftmaxLossAndGradient(centerWordVec, outsideWordIdx, outsideVectors, dataset): """ Naive Softmax loss & gradient function for word2vec models Implement the naive softmax loss and gradients between a center word's embedding and an outside word's embedding. This will be the building block for our word2vec models. Arguments: centerWordVec -- numpy ndarray, center word's embedding in shape (word vector length, ) (v_c in the pdf handout) outsideWordIdx -- integer, the index of the outside word (o of u_o in the pdf handout) outsideVectors -- outside vectors is in shape (num words in vocab, word vector length) for all words in vocab (U in the pdf handout) dataset -- needed for negative sampling, unused here. Return: loss -- naive softmax loss gradCenterVec -- the gradient with respect to the center word vector in shape (word vector length, ) (dJ / dv_c in the pdf handout) gradOutsideVecs -- the gradient with respect to all the outside word vectors in shape (num words in vocab, word vector length) (dJ / dU) """ ### YOUR CODE HERE (~6-8 Lines) ### Please use the provided softmax function (imported earlier in this file) ### This numerically stable implementation helps you avoid issues pertaining ### to integer overflow. logits = softmax(outsideVectors @ centerWordVec) loss = -np.log(logits[outsideWordIdx]) label = np.zeros_like(logits) label[outsideWordIdx] = 1 gradCenterVec = (logits - label) @ outsideVectors gradOutsideVecs = np.outer((logits - label), centerWordVec) ### END YOUR CODE return loss, gradCenterVec, gradOutsideVecs
def naiveSoftmaxLossAndGradient(centerWordVec, outsideWordIdx, outsideVectors, dataset): """ Naive Softmax loss & gradient function for word2vec models Implement the naive softmax loss and gradients between a center word's embedding and an outside word's embedding. This will be the building block for our word2vec models. Arguments: centerWordVec -- numpy ndarray, center word's embedding in shape (word vector length, ) (v_c in the pdf handout) outsideWordIdx -- integer, the index of the outside word (o of u_o in the pdf handout) outsideVectors -- outside vectors is in shape (num words in vocab, word vector length) for all words in vocab (U in the pdf handout) dataset -- needed for negative sampling, unused here. Return: loss -- naive softmax loss gradCenterVec -- the gradient with respect to the center word vector in shape (word vector length, ) (dJ / dv_c in the pdf handout) gradOutsideVecs -- the gradient with respect to all the outside word vectors in shape (num words in vocab, word vector length) (dJ / dU) """ # YOUR CODE HERE (~6-8 Lines) y_hat = softmax(outsideVectors @ centerWordVec.T) # (N,1) # print(y_hat.shape) loss = -np.log(y_hat[outsideWordIdx]) gradCenterVec = y_hat.T @ outsideVectors - outsideVectors[outsideWordIdx] y_hat_minus = y_hat.copy() y_hat_minus[outsideWordIdx] -= 1 gradOutsideVecs = y_hat_minus.reshape(-1, 1) @ centerWordVec.reshape(1, -1) # Please use the provided softmax function (imported earlier in this file) # This numerically stable implementation helps you avoid issues pertaining # to integer overflow. # END YOUR CODE return loss, gradCenterVec, gradOutsideVecs
def naiveSoftmaxLossAndGradient(centerWordVec, outsideWordIdx, outsideVectors, dataset): """ Naive Softmax loss & gradient function for word2vec models Implement the naive softmax loss and gradients between a center word's embedding and an outside word's embedding. This will be the building block for our word2vec models. Arguments: centerWordVec -- numpy ndarray, center word's embedding (v_c in the pdf handout) outsideWordIdx -- integer, the index of the outside word (o of u_o in the pdf handout) outsideVectors -- outside vectors (rows of matrix) for all words in vocab (U in the pdf handout) dataset -- needed for negative sampling, unused here. Return: loss -- naive softmax loss gradCenterVec -- the gradient with respect to the center word vector (dJ / dv_c in the pdf handout) gradOutsideVecs -- the gradient with respect to all the outside word vectors (dJ / dU) """ ### YOUR CODE HERE ### Please use the provided softmax function (imported earlier in this file) ### This numerically stable implementation helps you avoid issues pertaining ### to integer overflow. # centerWordVec: (embedding_dim,1) # outsideVectors: (vocab_size,embedding_dim) scores = np.matmul(outsideVectors, centerWordVec) # (vocab_size,1) probs = softmax(scores) # (vocab_size,1) y_hat loss = -np.log(probs[outsideWordIdx]) dscores = probs.copy() # (vocab_size,1) dscores[outsideWordIdx] = dscores[outsideWordIdx] - 1 # y_hat minus y gradCenterVec = np.matmul(outsideVectors.T, dscores) # (embedding_dim,1) gradOutsideVecs = np.outer(dscores, centerWordVec) # (vocab_size,embedding_dim) ### END YOUR CODE return loss, gradCenterVec, gradOutsideVecs
def naiveSoftmaxLossAndGradient(centerWordVec, outsideWordIdx, outsideVectors, dataset): """ Naive Softmax loss & gradient function for word2vec models Implement the naive softmax loss and gradients between a center word's embedding and an outside word's embedding. This will be the building block for our word2vec models. Arguments: centerWordVec -- numpy ndarray, center word's embedding (v_c in the pdf handout) outsideWordIdx -- integer, the index of the outside word (o of u_o in the pdf handout) outsideVectors -- outside vectors (rows of matrix) for all words in vocab (U in the pdf handout) dataset -- needed for negative sampling, unused here. Return: loss -- naive softmax loss gradCenterVec -- the gradient with respect to the center word vector (dJ / dv_c in the pdf handout) gradOutsideVecs -- the gradient with respect to all the outside word vectors (dJ / dU) """ ### YOUR CODE HERE ### Please use the provided softmax function (imported earlier in this file) ### This numerically stable implementation helps you avoid issues pertaining ### to integer overflow. value = np.dot(outsideVectors, centerWordVec) # N x 1 y_hat = softmax(value) loss = -np.log(y_hat[outsideWordIdx]) # Written assignment part a d_value = y_hat d_value[outsideWordIdx] -= 1 # y_hat - y, matrix shape (N, 1) gradCenterVec = outsideVectors.T.dot( d_value) # shape d x 1 Written Assignment part b gradOutsideVecs = d_value[:, np.newaxis].dot(np.array([ centerWordVec ])) # (N, 1) dot (1, d) -> (N, d) written assignment part c ### END YOUR CODE return loss, gradCenterVec, gradOutsideVecs
def naiveSoftmaxLossAndGradient(
    centerWordVec, outsideWordIdx, outsideVectors, dataset
):
    """ Naive Softmax loss & gradient function for word2vec models

    Implement the naive softmax loss and gradients between a center word's
    embedding and an outside word's embedding. This will be the building block
    for our word2vec models.

    Arguments:
    centerWordVec -- numpy ndarray, center word's embedding in shape (word vector length, )
                     (v_c in the pdf handout)
    outsideWordIdx -- integer, the index of the outside word (o of u_o in the pdf handout)
    outsideVectors -- outside vectors is in shape (num words in vocab, word vector length)
                      for all words in vocab (U in the pdf handout)
    dataset -- needed for negative sampling, unused here.

    Return:
    loss -- naive softmax loss
    gradCenterVec -- the gradient with respect to the center word vector
                     in shape (word vector length, ) (dJ / dv_c in the pdf handout)
    gradOutsideVecs -- the gradient with respect to all the outside word vectors
                       in shape (num words in vocab, word vector length) (dJ / dU)
    """

    ### YOUR CODE HERE (~6-8 Lines)
    softmax_result = softmax(np.dot(outsideVectors, centerWordVec))
    naive_softmax_loss = -np.log(softmax_result)[outsideWordIdx]

    softmax_result[outsideWordIdx] -= 1
    gradCenterVec = np.dot(outsideVectors.T, softmax_result)
    gradOutsideVecs = np.dot(softmax_result.reshape(softmax_result.shape[0], 1),
                             centerWordVec.reshape(centerWordVec.shape[0], 1).T)

    ### END YOUR CODE

    return naive_softmax_loss, gradCenterVec, gradOutsideVecs
def naiveSoftmaxLossAndGradient(
    centerWordVec, outsideWordIdx, outsideVectors, dataset
):
    """ Naive Softmax loss & gradient function for word2vec models

    Implement the naive softmax loss and gradients between a center word's
    embedding and an outside word's embedding. This will be the building block
    for our word2vec models.

    Arguments:
    centerWordVec -- numpy ndarray, center word's embedding (v_c in the pdf handout)
    outsideWordIdx -- integer, the index of the outside word (o of u_o in the pdf handout)
    outsideVectors -- outside vectors (rows of matrix) for all words in vocab (U in the pdf handout)
    dataset -- needed for negative sampling, unused here.

    Return:
    loss -- naive softmax loss
    gradCenterVec -- the gradient with respect to the center word vector (dJ / dv_c in the pdf handout)
    gradOutsideVecs -- the gradient with respect to all the outside word vectors (dJ / dU)
    """

    ### YOUR CODE HERE
    y = np.zeros(outsideVectors.shape[0])  # y: (N, )
    y[outsideWordIdx] = 1.
    y_hat = softmax(centerWordVec @ outsideVectors.T)  # y_hat: (N, )
    loss = -np.log(y_hat[outsideWordIdx])  # -log(y_o_hat)

    gradCenterVec = (y_hat - y) @ outsideVectors  # (D, )
    gradOutsideVecs = np.expand_dims(y_hat.T, axis=1) @ np.expand_dims(centerWordVec, axis=0)  # (N, D)
    gradOutsideVecs[outsideWordIdx] -= centerWordVec

    ### Please use the provided softmax function (imported earlier in this file)
    ### This numerically stable implementation helps you avoid issues pertaining
    ### to integer overflow.
    ### END YOUR CODE

    return loss, gradCenterVec, gradOutsideVecs
def make_prediction_knn_weighted(self, x, y, k, source_som, target_som, source, mode='none'):
    source_activation, pos_source_activation = source_som.get_activations(x)
    source_activation = np.array(source_activation).reshape((-1, 1))
    target_activation = self.propagate_activation(source_activation, source_som=source)

    # vote weighting alternatives
    if mode == 'softmax':
        # normalize using softmax. this brings some 0-valued votes to higher values
        vote_weights = softmax(target_activation)
    elif mode == 'none':
        # since hebbian weights and activations are normalized, the propagated
        # activation's values are already between 0 and 1
        vote_weights = target_activation
    elif mode == 'minmax':
        # minimum activation is mapped to 0 and maximum to 1
        min_ = min(target_activation)
        max_ = max(target_activation)
        vote_weights = (target_activation - min_) / float(max_ - min_)

    hebbian_bmu_index = np.argmax(target_activation)
    pos_activations = list(target_som._neuron_locations(target_som._m, target_som._n))
    closest_activations, closest_indexes = self.get_bmu_k_closest(
        target_som, target_activation, pos_activations, k)

    # perform a weighted majority vote
    class_count = [
        0 for i in set(
            [c[0] for c in target_som.bmu_class_dict.values() if c != []])
    ]
    for i in range(len(closest_indexes)):
        print(closest_indexes[i])
        bmu_class_list = target_som.bmu_class_dict[closest_indexes[i]]
        if bmu_class_list != []:
            class_count[bmu_class_list[0]] += 1 * vote_weights[closest_indexes[i]]
    print(class_count)

    return np.argmax(class_count)
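# --- Illustration (not part of the method above) ---
# A minimal sketch of the three vote-weighting modes used in
# make_prediction_knn_weighted, isolated from the SOM machinery. It assumes
# numpy and a `softmax` helper are in scope; `target_activation` below is a
# made-up toy array, not real SOM output.
target_activation = np.array([0.10, 0.40, 0.05, 0.45])

# mode == 'softmax': exponentiate and renormalize, so zero-valued votes still
# receive a small positive weight
vote_weights_softmax = softmax(target_activation)

# mode == 'none': use the propagated activations directly (already in [0, 1])
vote_weights_none = target_activation

# mode == 'minmax': rescale so the smallest activation maps to 0 and the largest to 1
vote_weights_minmax = (target_activation - target_activation.min()) / float(
    target_activation.max() - target_activation.min())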
def naiveSoftmaxLossAndGradient(centerWordVec, outsideWordIdx, outsideVectors, dataset): """ Naive Softmax loss & gradient function for word2vec models Implement the naive softmax loss and gradients between a center word's embedding and an outside word's embedding. This will be the building block for our word2vec models. Arguments: centerWordVec -- numpy ndarray, center word's embedding (v_c in the pdf handout) outsideWordIdx -- integer, the index of the outside word (o of u_o in the pdf handout) outsideVectors -- outside vectors (rows of matrix) for all words in vocab (U in the pdf handout) dataset -- needed for negative sampling, unused here. Return: loss -- naive softmax loss gradCenterVec -- the gradient with respect to the center word vector (dJ / dv_c in the pdf handout) gradOutsideVecs -- the gradient with respect to all the outside word vectors (dJ / dU) """ ### YOUR CODE HERE ### Please use the provided softmax function (imported earlier in this file) ### This numerically stable implementation helps you avoid issues pertaining ### to integer overflow. ## 注意:coding部分的矢量和矩阵与written部分互为转置 ## 比如词向量在coding部分就是行向量,在written部分就是列向量 y_hat = softmax(np.dot(centerWordVec.reshape(1, -1), outsideVectors.T)) #1行n列 loss = -np.log(y_hat[0][outsideWordIdx]) delta = y_hat.copy() # delta表示y_hat - y,是1行n列的矢量 delta[0][outsideWordIdx] = delta[0][outsideWordIdx] - 1 gradCenterVec = np.dot(delta, outsideVectors) #1行d列 gradOutsideVecs = np.dot(delta.T, centerWordVec.reshape(1, -1)) ### END YOUR CODE return loss, gradCenterVec, gradOutsideVecs
def naiveSoftmaxLossAndGradient(centerWordVec, outsideWordIdx, outsideVectors, dataset): """ Naive Softmax loss & gradient function for word2vec models Implement the naive softmax loss and gradients between a center word's embedding and an outside word's embedding. This will be the building block for our word2vec models. Arguments: centerWordVec -- numpy ndarray, center word's embedding (v_c in the pdf handout) outsideWordIdx -- integer, the index of the outside word (o of u_o in the pdf handout) outsideVectors -- outside vectors (rows of matrix) for all words in vocab (U in the pdf handout) dataset -- needed for negative sampling, unused here. Return: loss -- naive softmax loss gradCenterVec -- the gradient with respect to the center word vector (dJ / dv_c in the pdf handout) gradOutsideVecs -- the gradient with respect to all the outside word vectors (dJ / dU) """ ### YOUR CODE HERE ### Please use the provided softmax function (imported earlier in this file) ### This numerically stable implementation helps you avoid issues pertaining ### to integer overflow. inner_product = centerWordVec.dot( outsideVectors.T ) # centerWordVec has shape (1, d) and outsideVectors has shape (V, d) y_hat = softmax(inner_product) # should have shape (1, V), y = np.zeros(y_hat.shape) y[0, outsideWordIdx] = 1 loss = -np.log(y_hat[0, outsideWordIdx]) gradCenterVec = np.sum((y_hat - y).T * outsideVectors, axis=0, keepdims=True) # (1, d) gradOutsideVecs = (y_hat - y).T * centerWordVec # (V, d) ### END YOUR CODE return loss, gradCenterVec, gradOutsideVecs
def naiveSoftmaxLossAndGradient(centerWordVec, outsideWordIdx, outsideVectors, dataset): """ Naive Softmax loss & gradient function for word2vec models Implement the naive softmax loss and gradients between a center word's embedding and an outside word's embedding. This will be the building block for our word2vec models. Arguments: centerWordVec -- numpy ndarray, center word's embedding (v_c in the pdf handout) outsideWordIdx -- integer, the index of the outside word (o of u_o in the pdf handout) outsideVectors -- outside vectors (rows of matrix) for all words in vocab (U in the pdf handout) dataset -- needed for negative sampling, unused here. Return: loss -- naive softmax loss gradCenterVec -- the gradient with respect to the center word vector (dJ / dv_c in the pdf handout) gradOutsideVecs -- the gradient with respect to all the outside word vectors (dJ / dU) """ loss = 0 gradCenterVec = 0 gradOutsideVecs = 0 x = centerWordVec @ outsideVectors.T out_probs = softmax(x) grad1 = out_probs loss = np.sum(-np.log(grad1)) grad1[outsideWordIdx] -= 1 gradCenterVec = grad1 @ outsideVectors gradOutsideVecs = np.outer(grad1, centerWordVec) ### YOUR CODE HERE ### Please use the provided softmax function (imported earlier in this file) ### This numerically stable implementation helps you avoid issues pertaining ### to integer overflow. ### END YOUR CODE return loss, gradCenterVec, gradOutsideVecs
def naiveSoftmaxLossAndGradient(centerWordVec, outsideWordIdx, outsideVectors, dataset): """ Naive Softmax loss & gradient function for word2vec models Implement the naive softmax loss and gradients between a center word's embedding and an outside word's embedding. This will be the building block for our word2vec models. Arguments: centerWordVec -- numpy ndarray, center word's embedding (v_c in the pdf handout) outsideWordIdx -- integer, the index of the outside word (o of u_o in the pdf handout) outsideVectors -- outside vectors (rows of matrix) for all words in vocab (U in the pdf handout) dataset -- needed for negative sampling, unused here. Return: loss -- naive softmax loss gradCenterVec -- the gradient with respect to the center word vector (dJ / dv_c in the pdf handout) gradOutsideVecs -- the gradient with respect to all the outside word vectors (dJ / dU) """ # YOUR CODE HERE v_c, u_o, U = centerWordVec, outsideWordIdx, outsideVectors # print('v_c shape {}, U shape {}'.format(v_c.shape, U.shape)) v_c = v_c.reshape(-1, 1) # Transform v_c to a column vector (N, 1) prob = softmax(np.dot(U, v_c).reshape(-1)).reshape(-1, 1) # print("Shpae is ", prob.shape) loss = -np.log(prob[outsideWordIdx]) delta = prob.copy() delta[outsideWordIdx] -= 1 # the true y is a one-hot vector gradCenterVec = np.dot(U.T, delta).flatten() gradOutsideVecs = np.dot(delta, v_c.T) # Please use the provided softmax function (imported earlier in this file) # This numerically stable implementation helps you avoid issues pertaining # to integer overflow. # END YOUR CODE return loss, gradCenterVec, gradOutsideVecs
def naiveSoftmaxLossAndGradient(centerWordVec, outsideWordIdx, outsideVectors, dataset): """ Naive Softmax loss & gradient function for word2vec models Implement the naive softmax loss and gradients between a center word's embedding and an outside word's embedding. This will be the building block for our word2vec models. Arguments: centerWordVec -- numpy ndarray, center word's embedding (v_c in the pdf handout) outsideWordIdx -- integer, the index of the outside word (o of u_o in the pdf handout) outsideVectors -- outside vectors (rows of matrix) for all words in vocab (U in the pdf handout) dataset -- needed for negative sampling, unused here. Return: loss -- naive softmax loss gradCenterVec -- the gradient with respect to the center word vector (dJ / dv_c in the pdf handout) gradOutsideVecs -- the gradient with respect to all the outside word vectors (dJ / dU) Note: we usually use column vector convention (i.e., vectors are in column form) for vectors in matrix U and V (in the handout) but for ease of implementation/programming we usually use row vectors (representing vectors in row form). """ ### YOUR CODE HERE y_hat = softmax(np.dot(outsideVectors, centerWordVec)) # U*vc: (w x e) * (e x 1) = (w x 1) loss = -np.log(y_hat[outsideWordIdx]) y = np.zeros(len(outsideVectors)) y[outsideWordIdx] = 1 # one hot vector gradCenterVec = np.dot(outsideVectors.T, y_hat - y) # UT*(y_hat-y): (e x w) * (e x 1) = (e x 1) gradOutsideVecs = np.dot(np.expand_dims(centerWordVec, 1), (np.expand_dims( y_hat - y, 0))).T # vc(y_hat-y)T: (e x 1) * (1 x w) = (e x w) ### END YOUR CODE return loss, gradCenterVec, gradOutsideVecs
def naiveSoftmaxLossAndGradient(centerWordVec, outsideWordIdx, outsideVectors, dataset): """ Naive Softmax loss & gradient function for word2vec models Implement the naive softmax loss and gradients between a center word's embedding and an outside word's embedding. This will be the building block for our word2vec models. Arguments: centerWordVec -- numpy ndarray, center word's embedding (v_c in the pdf handout) outsideWordIdx -- integer, the index of the outside word (o of u_o in the pdf handout) outsideVectors -- outside vectors (rows of matrix) for all words in vocab (U in the pdf handout) dataset -- needed for negative sampling, unused here. Return: loss -- naive softmax loss gradCenterVec -- the gradient with respect to the center word vector (dJ / dv_c in the pdf handout) gradOutsideVecs -- the gradient with respect to all the outside word vectors (dJ / dU) """ ### YOUR CODE HERE scalar_all = np.dot(outsideVectors, centerWordVec) probs = softmax(scalar_all) loss = -np.log(probs[outsideWordIdx]) ### Please use the provided softmax function (imported earlier in this file) ### This numerically stable implementation helps you avoid issues pertaining ### to integer oveoutsideVectors exponents = np.exp(scalar_all) #print(outsideVectors[outsideWordIdx,:].shape,np.sum(outsideVectors*exponents.reshape((-1,1)), axis = 0).shape) gradCenterVec = -outsideVectors[outsideWordIdx, :] + np.sum( outsideVectors * exponents.reshape( (-1, 1)), axis=0) / np.sum(exponents) gradOutsideVecs = np.dot(exponents.reshape( (-1, 1)), centerWordVec.reshape((1, -1))) / np.sum(exponents) gradOutsideVecs[outsideWordIdx] -= centerWordVec ### END YOUR CODE return loss, gradCenterVec, gradOutsideVecs
def naiveSoftmaxLossAndGradient(centerWordVec, outsideWordIdx, outsideVectors, dataset): """ Naive Softmax loss & gradient function for word2vec models Implement the naive softmax loss and gradients between a center word's embedding and an outside word's embedding. This will be the building block for our word2vec models. For those unfamiliar with numpy notation, note that a numpy ndarray with a shape of (x, ) is a one-dimensional array, which you can effectively treat as a vector with length x. Arguments: centerWordVec -- numpy ndarray, center word's embedding in shape (word vector length, ) (v_c in the pdf handout) outsideWordIdx -- integer, the index of the outside word (o of u_o in the pdf handout) outsideVectors -- outside vectors is in shape (num words in vocab, word vector length) for all words in vocab (tranpose of U in the pdf handout) dataset -- needed for negative sampling, unused here. Return: loss -- naive softmax loss gradCenterVec -- the gradient with respect to the center word vector in shape (word vector length, ) (dJ / dv_c in the pdf handout) gradOutsideVecs -- the gradient with respect to all the outside word vectors in shape (num words in vocab, word vector length) (dJ / dU) """ ### YOUR CODE HERE (~6-8 Lines) print(centerWordVec, outsideWordIdx, outsideVectors, dataset) loss = softmax(centerWordVec, outsideVectors[outsideWordIdx]) ### Please use the provided softmax function (imported earlier in this file) ### This numerically stable implementation helps you avoid issues pertaining ### to integer overflow. ### END YOUR CODE return loss, gradCenterVec, gradOutsideVecs
def naiveSoftmaxLossAndGradient(centerWordVec, outsideWordIdx, outsideVectors, dataset): """ Naive Softmax loss & gradient function for word2vec models Implement the naive softmax loss and gradients between a center word's embedding and an outside word's embedding. This will be the building block for our word2vec models. Arguments: centerWordVec -- numpy ndarray, center word's embedding (v_c in the pdf handout) outsideWordIdx -- integer, the index of the outside word (o of u_o in the pdf handout) outsideVectors -- outside vectors (rows of matrix) for all words in vocab (U in the pdf handout) dataset -- needed for negative sampling, unused here. Return: loss -- naive softmax loss gradCenterVec -- the gradient with respect to the center word vector (dJ / dv_c in the pdf handout) gradOutsideVecs -- the gradient with respect to all the outside word vectors (dJ / dU) """ ### YOUR CODE HERE ### Please use the provided softmax function (imported earlier in this file) ### This numerically stable implementation helps you avoid issues pertaining ### to integer overflow. theta = outsideVectors.dot(centerWordVec) y_pred = softmax(theta) y_true = np.zeros_like(y_pred) y_true[outsideWordIdx] = 1 loss = -1 * np.log(y_pred[outsideWordIdx]) gradCenterVec = (y_pred - y_true).dot(outsideVectors) gradOutsideVecs = (y_pred - y_true).reshape(y_pred.size, 1).dot( centerWordVec.reshape(1, centerWordVec.size)) ### END YOUR CODE return loss, gradCenterVec, gradOutsideVecs
def naiveSoftmaxLossAndGradient(centerWordVec, outsideWordIdx, outsideVectors, dataset): """ Naive Softmax loss & gradient function for word2vec models Implement the naive softmax loss and gradients between a center word's embedding and an outside word's embedding. This will be the building block for our word2vec models. Arguments: centerWordVec -- numpy ndarray, center word's embedding (v_c in the pdf handout) outsideWordIdx -- integer, the index of the outside word (o of u_o in the pdf handout) outsideVectors -- outside vectors (rows of matrix) for all words in vocab (U in the pdf handout) dataset -- needed for negative sampling, unused here. Return: loss -- naive softmax loss gradCenterVec -- the gradient with respect to the center word vector (dJ / dv_c in the pdf handout) gradOutsideVecs -- the gradient with respect to all the outside word vectors (dJ / dU) """ # softmax applied to v_c * u_o y_hat = softmax(np.dot(centerWordVec, outsideVectors.T)) # loss calculated by -log(y_hat_o) loss = -np.log(y_hat)[outsideWordIdx] # update y_hat_o to get y_hat - y (this holds because y is just a one-hot encoded vector with a 1 for the true outside word) y_hat[outsideWordIdx] -= 1 # U(y_hat - y) # this equation is calculated by finding the partial derivative of J_naive-softmax(v_c, o, U) with respect to v_c gradCenterVec = np.dot(y_hat, outsideVectors) # v_c(y_hat - y)^T # this equation is calculated by finding the partial derivative of J_naive-softmax(v_c, o, U) with respect to U gradOutsideVecs = np.outer(y_hat, centerWordVec) return loss, gradCenterVec, gradOutsideVecs
def naiveSoftmaxLossAndGradient(centerWordVec, outsideWordIdx, outsideVectors, dataset): """ Naive Softmax loss & gradient function for word2vec models Implement the naive softmax loss and gradients between a center word's embedding and an outside word's embedding. This will be the building block for our word2vec models. Arguments: centerWordVec -- numpy ndarray, center word's embedding (v_c in the pdf handout) outsideWordIdx -- integer, the index of the outside word (o of u_o in the pdf handout) outsideVectors -- outside vectors (rows of matrix) for all words in vocab (U in the pdf handout) dataset -- needed for negative sampling, unused here. Return: loss -- naive softmax loss gradCenterVec -- the gradient with respect to the center word vector (dJ / dv_c in the pdf handout) gradOutsideVecs -- the gradient with respect to all the outside word vectors (dJ / dU) """ ### YOUR CODE HERE ### Please use the provided softmax function (imported earlier in this file) ### This numerically stable implementation helps you avoid issues pertaining ### to integer overflow. y_hat = np.dot(outsideVectors, centerWordVec) y_softmax = softmax(y_hat) y_ = np.zeros(outsideVectors.shape[0]) y_[outsideWordIdx] = 1 loss = -1 * np.log(y_softmax[outsideWordIdx]) u_o = outsideVectors[outsideWordIdx] #gradCenterVec= np.sum(np.multiply(np.transpose(outsideVectors),y_softmax), axis=1) - u_o gradCenterVec = np.dot(np.transpose(outsideVectors), y_softmax - y_) gradOutsideVecs = np.dot(centerWordVec.reshape(centerWordVec.shape[0], 1), (y_softmax - y_).reshape(1, y_softmax.shape[0])) ### END YOUR CODE return loss, gradCenterVec, gradOutsideVecs
def naiveSoftmaxLossAndGradient(centerWordVec, outsideWordIdx, outsideVectors, dataset): """ Naive Softmax loss & gradient function for word2vec models Implement the naive softmax loss and gradients between a center word's embedding and an outside word's embedding. This will be the building block for our word2vec models. Arguments: centerWordVec -- numpy ndarray, center word's embedding (v_c in the pdf handout) outsideWordIdx -- integer, the index of the outside word (o of u_o in the pdf handout) outsideVectors -- outside vectors (rows of matrix) for all words in vocab (U in the pdf handout) dataset -- needed for negative sampling, unused here. Return: loss -- naive softmax loss gradCenterVec -- the gradient with respect to the center word vector (dJ / dv_c in the pdf handout) gradOutsideVecs -- the gradient with respect to all the outside word vectors (dJ / dU) """ ### YOUR CODE HERE ### Please use the provided softmax function (imported earlier in this file) ### This numerically stable implementation helps you avoid issues pertaining ### to integer overflow. y_hat = softmax(np.dot(centerWordVec.reshape([1, -1]), outsideVectors.T)) # 1xD product DxC => 1xC loss = -np.log(y_hat[0][outsideWordIdx]) # scalar gt = np.zeros(y_hat.shape) gt[0][outsideWordIdx] = 1 # 1xC delta = (y_hat - gt) gradCenterVec = np.dot(delta, outsideVectors) # 1xC product CxD => 1xD gradOutsideVecs = np.dot(delta.T, centerWordVec.reshape( 1, -1)) # Cx1 product 1xD => CxD ### END YOUR CODE return loss, gradCenterVec, gradOutsideVecs
def naiveSoftmaxLossAndGradient(centerWordVec, outsideWordIdx, outsideVectors, dataset): """Naive Softmax loss & gradient function for word2vec models Implement the naive softmax loss and gradients between a center word's embedding and an outside word's embedding. This will be the building block for our word2vec models. Arguments: centerWordVec -- numpy ndarray, center word's embedding (v_c in the pdf handout) outsideWordIdx -- integer, the index of the outside word (o of u_o in the pdf handout) outsideVectors -- outside vectors (rows of matrix) for all words in vocab (U in the pdf handout) dataset -- needed for negative sampling, unused here. Return: loss -- naive softmax loss gradCenterVec -- the gradient with respect to the center word vector dJ / dv_c = U(y_hat - y) gradOutsideVecs -- the gradient with respect to all the outside word vectors dJ / dU = """ # YOUR CODE HERE # Please use the provided softmax function (imported earlier in this file) # This numerically stable implementation helps you avoid issues pertaining # to integer overflow. y_hat = outsideVectors @ centerWordVec # N y_hat = softmax(y_hat) loss = -np.log(y_hat[outsideWordIdx]) d_val = y_hat # N d_val[outsideWordIdx] -= 1 gradCenterVec = outsideVectors.T @ d_val # U: N * D, d_val: N -> D # d_val: N, vc: D -> N * D gradOutsideVecs = d_val[:, np.newaxis] * centerWordVec # END YOUR CODE return loss, gradCenterVec, gradOutsideVecs
def naiveSoftmaxLossAndGradient(centerWordVec, outsideWordIdx, outsideVectors, dataset): """ Naive Softmax loss & gradient function for word2vec models Implement the naive softmax loss and gradients between a center word's embedding and an outside word's embedding. This will be the building block for our word2vec models. Arguments: centerWordVec -- numpy ndarray, center word's embedding (v_c in the pdf handout) outsideWordIdx -- integer, the index of the outside word (o of u_o in the pdf handout) outsideVectors -- outside vectors (rows of matrix) for all words in vocab (U in the pdf handout) dataset -- needed for negative sampling, unused here. Return: loss -- naive softmax loss gradCenterVec -- the gradient with respect to the center word vector (dJ / dv_c in the pdf handout) gradOutsideVecs -- the gradient with respect to all the outside word vectors (dJ / dU) Note: we usually use column vector convention (i.e., vectors are in column form) for vectors in matrix U and V (in the handout) but for ease of implementation/programming we usually use row vectors (representing vectors in row form). """ # Please use the provided softmax function (imported earlier in this file) # This numerically stable implementation helps you avoid issues pertaining # to integer overflow. sm = softmax(outsideVectors.dot(centerWordVec)) loss = -np.log(sm[outsideWordIdx]) gradCenterVec = -outsideVectors[outsideWordIdx] + sm.dot(outsideVectors) gradOutsideVecs = np.zeros_like(outsideVectors) neg_mask = np.arange(gradOutsideVecs.shape[0]) != outsideWordIdx gradOutsideVecs[neg_mask, :] = sm[neg_mask].reshape(-1, 1) * np.tile( centerWordVec, (len(sm[neg_mask]), 1)) gradOutsideVecs[outsideWordIdx] = (sm[outsideWordIdx] - 1) * centerWordVec return loss, gradCenterVec, gradOutsideVecs