def negSamplingCostAndGradient(predicted, target, outputVectors, dataset,
                               K=10):
    """ Negative sampling cost function for word2vec models """

    # Implement the cost and gradients for one predicted word vector
    # and one target word vector as a building block for word2vec
    # models, using the negative sampling technique. K is the sample
    # size. You might want to use dataset.sampleTokenIdx() to sample
    # a random word index.
    #
    # Note: See test_word2vec below for dataset's initialization.
    #
    # Input/Output Specifications: same as softmaxCostAndGradient

    # Draw K negative samples (sampleTokenIdxNoTarget is a helper that
    # resamples until the drawn index differs from target).
    sampleIndices = [sampleTokenIdxNoTarget(target, dataset)
                     for _ in range(K)]

    grad = np.zeros(outputVectors.shape)
    gradPred = np.zeros(predicted.shape)
    cost = 0.0

    # Positive (target) term.
    sig_positive = sigmoid(np.dot(outputVectors[target], predicted))
    cost += -np.log(sig_positive)
    gradPred += (sig_positive - 1) * outputVectors[target]
    grad[target] += (sig_positive - 1) * predicted

    # Negative samples.
    for idx in sampleIndices:
        sig_neg = sigmoid(-np.dot(outputVectors[idx], predicted))
        cost -= np.log(sig_neg)
        gradPred += (1 - sig_neg) * outputVectors[idx]
        grad[idx] += (1 - sig_neg) * predicted

    return cost, gradPred, grad
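# For reference, the objective that every implementation in this file computes
# (standard word2vec negative-sampling formulation; the notation v_c, u_o, u_k
# below is mine, not part of the assignment stub): with v_c = predicted,
# u_o = outputVectors[target], and u_k the output vectors of the K sampled
# negatives,
#
#   J = -log sigmoid(u_o . v_c) - sum_{k=1..K} log sigmoid(-u_k . v_c)
#   dJ/dv_c = (sigmoid(u_o . v_c) - 1) * u_o + sum_k (1 - sigmoid(-u_k . v_c)) * u_k
#   dJ/du_o = (sigmoid(u_o . v_c) - 1) * v_c
#   dJ/du_k = (1 - sigmoid(-u_k . v_c)) * v_c      for each sampled k
#
# Every variant below is some rearrangement of these four expressions.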
def negSamplingCostAndGradient(predicted, target, outputVectors, dataset,
                               K=10):
    """ Negative sampling cost function for word2vec models """

    # Implement the cost and gradients for one predicted word vector
    # and one target word vector as a building block for word2vec
    # models, using the negative sampling technique. K is the sample
    # size. You might want to use dataset.sampleTokenIdx() to sample
    # a random word index.
    #
    # Note: See test_word2vec below for dataset's initialization.
    #
    # Input/Output Specifications: same as softmaxCostAndGradient

    ### YOUR CODE HERE
    # Positive (target) term.
    y = sigmoid(outputVectors[target].dot(predicted))
    cost = -np.log(y)
    gradPred = (y - 1) * outputVectors[target]
    grad = np.zeros(outputVectors.shape)
    grad[target] = (y - 1) * predicted

    # K negative samples, resampling whenever the target index is drawn.
    for i in range(K):
        index = dataset.sampleTokenIdx()
        while index == target:
            index = dataset.sampleTokenIdx()
        y_k = sigmoid(np.dot(-outputVectors[index], predicted))
        cost -= np.log(y_k)
        gradPred += (1 - y_k) * outputVectors[index]
        grad[index] += (1 - y_k) * predicted
    ### END YOUR CODE

    return cost, gradPred, grad
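# A minimal numerical gradient check, sketched here for convenience; it is not
# part of the assignment stub. Assumptions: negSamplingCostAndGradient and
# sigmoid are defined in this module, and DummyDataset /
# check_negative_sampling_grads are hypothetical names of mine. Re-creating the
# dataset with a fixed seed before every call makes the drawn negative samples
# identical across calls, so central differences are comparable.
import numpy as np


class DummyDataset(object):
    """Toy dataset exposing the sampleTokenIdx() interface used above."""

    def __init__(self, vocab_size, seed=7):
        self.vocab_size = vocab_size
        self.rng = np.random.RandomState(seed)

    def sampleTokenIdx(self):
        return self.rng.randint(0, self.vocab_size)


def check_negative_sampling_grads(eps=1e-6):
    rng = np.random.RandomState(1234)
    vocab_size, dim, target, K = 5, 3, 1, 10
    predicted = rng.randn(dim)
    outputVectors = rng.randn(vocab_size, dim)

    def run(pred, out_vecs):
        # a fresh dataset with the same seed draws the same negatives each call
        return negSamplingCostAndGradient(pred, target, out_vecs,
                                          DummyDataset(vocab_size), K)

    _, gradPred, grad = run(predicted, outputVectors)

    # central differences with respect to the predicted (center) vector
    for i in range(dim):
        e = np.zeros(dim)
        e[i] = eps
        c_plus, _, _ = run(predicted + e, outputVectors)
        c_minus, _, _ = run(predicted - e, outputVectors)
        numeric = (c_plus - c_minus) / (2 * eps)
        assert abs(numeric - gradPred[i]) < 1e-4, (i, numeric, gradPred[i])

    # central differences with respect to the target row of outputVectors
    for i in range(dim):
        E = np.zeros_like(outputVectors)
        E[target, i] = eps
        c_plus, _, _ = run(predicted, outputVectors + E)
        c_minus, _, _ = run(predicted, outputVectors - E)
        numeric = (c_plus - c_minus) / (2 * eps)
        assert abs(numeric - grad[target, i]) < 1e-4, (i, numeric, grad[target, i])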
def negSamplingCostAndGradient(predicted, target, outputVectors, dataset,
                               K=10):
    """ Negative sampling cost function for word2vec models """

    # Implement the cost and gradients for one predicted word vector
    # and one target word vector as a building block for word2vec
    # models, using the negative sampling technique. K is the sample
    # size. You might want to use dataset.sampleTokenIdx() to sample
    # a random word index.
    #
    # Note: See test_word2vec below for dataset's initialization.
    #
    # Input/Output Specifications: same as softmaxCostAndGradient

    ### YOUR CODE HERE
    # Draw the K negative-sample indices.
    index = [dataset.sampleTokenIdx() for k in range(K)]
    u_o = outputVectors[target, :]
    v_c = predicted
    u_k = outputVectors[index, :]

    # Replace the softmax with a sigmoid for each term.
    sigma1 = sigmoid(np.dot(u_o, v_c))
    sigma2 = sigmoid(-np.dot(u_k, v_c))

    grad = np.zeros(outputVectors.shape)
    cost = -np.log(sigma1) - np.sum(np.log(sigma2))
    gradPred = u_o * (sigma1 - 1) + np.dot((1 - sigma2).T, u_k)

    # Accumulate row by row because the sampled indices may repeat.
    temp = np.outer(1 - sigma2, v_c)
    for i in range(K):
        grad[index[i], :] += temp[i]

    grad[target, :] += v_c * (sigma1 - 1)
    ### END YOUR CODE

    return cost, gradPred, grad
def negSamplingCostAndGradient(predicted, target, outputVectors, dataset,
                               K=10):
    """ Negative sampling cost function for word2vec models """

    # Implement the cost and gradients for one predicted word vector
    # and one target word vector as a building block for word2vec
    # models, using the negative sampling technique. K is the sample
    # size. You might want to use dataset.sampleTokenIdx() to sample
    # a random word index.
    #
    # Note: See test_word2vec below for dataset's initialization.
    #
    # Input/Output Specifications: same as softmaxCostAndGradient

    ### YOUR CODE HERE
    # Initialize gradients for the output vectors and the predicted vector.
    grad = np.zeros(outputVectors.shape)
    gradPred = np.zeros(predicted.shape)

    # Sigmoid of the positive score and its complement.
    s = sigmoid(predicted.dot(outputVectors[target, :]))
    t = 1 - s

    # Cost and gradient contributions of the positive (target) sample:
    # cost = -y_i * log(s) with y_i = 1.
    cost = -np.log(s)
    gradPred -= t * outputVectors[target, :]
    grad[target, :] -= t * predicted

    # Sample K negative words.
    for k in range(K):
        neg = dataset.sampleTokenIdx()
        # note that 1 - sigmoid(x) = sigmoid(-x)
        s = sigmoid(-predicted.dot(outputVectors[neg, :]))
        t = 1 - s
        cost += -np.log(s)  # cost = -sum((1 - y_i) * log(s)), with y_i = 0
        gradPred += t * outputVectors[neg, :]
        grad[neg, :] += t * predicted  # only the sampled rows of grad are updated
    ### END YOUR CODE

    return cost, gradPred, grad
def negSamplingCostAndGradient(predicted, target, outputVectors, dataset,
                               K=10):
    """ Negative sampling cost function for word2vec models """

    # Implement the cost and gradients for one predicted word vector
    # and one target word vector as a building block for word2vec
    # models, using the negative sampling technique. K is the sample
    # size. You might want to use dataset.sampleTokenIdx() to sample
    # a random word index.
    #
    # Note: See test_word2vec below for dataset's initialization.
    #
    # Input/Output Specifications: same as softmaxCostAndGradient

    ### YOUR CODE HERE
    grad = np.zeros(outputVectors.shape)
    gradPred = np.zeros(predicted.shape)

    # Sample K negative indices, resampling whenever the target index is drawn.
    sample_indices = []
    for k in range(K):
        index = dataset.sampleTokenIdx()
        while index == target:
            index = dataset.sampleTokenIdx()
        sample_indices.append(index)

    y_predict = sigmoid(outputVectors[target, :].dot(predicted))
    outputVectors_k = outputVectors[sample_indices, :]
    y_predict_negative = sigmoid(-outputVectors_k.dot(predicted))

    cost = -np.log(y_predict) - np.sum(np.log(y_predict_negative))

    sum_k = (y_predict_negative - 1).dot(outputVectors_k)
    gradPred = (y_predict - 1) * outputVectors[target, :] - sum_k

    # Accumulate per sampled row because indices may repeat.
    grad_neg_out = -np.outer(y_predict_negative - 1, predicted)
    for k in range(K):
        grad[sample_indices[k]] += grad_neg_out[k]
    grad[target, :] += (y_predict - 1) * predicted
    ### END YOUR CODE

    return cost, gradPred, grad
def negSamplingCostAndGradient(predicted, target, outputVectors, dataset,
                               K=10):
    """ Negative sampling cost function for word2vec models """

    # Implement the cost and gradients for one predicted word vector
    # and one target word vector as a building block for word2vec
    # models, using the negative sampling technique. K is the sample
    # size. You might want to use dataset.sampleTokenIdx() to sample
    # a random word index.
    #
    # Note: See test_word2vec below for dataset's initialization.
    #
    # Input/Output Specifications: same as softmaxCostAndGradient

    ### YOUR CODE HERE
    expected = outputVectors[target, :]  # u_o
    grad = np.zeros(outputVectors.shape)

    # Positive (target) term.
    tmp = sigmoid(np.dot(expected, predicted))
    cost = -np.log(tmp)
    gradPred = -(1 - tmp) * expected
    grad[target, :] = -(1 - tmp) * predicted

    # Negative sampling: keep drawing until K samples different from the
    # target have been used.
    cnt = 0
    while True:
        idx = dataset.sampleTokenIdx()
        if idx == target:
            continue
        cnt += 1
        tmp = sigmoid(-np.dot(outputVectors[idx, :], predicted))
        cost -= np.log(tmp)
        gradPred += (1 - tmp) * outputVectors[idx, :]
        grad[idx, :] += (1 - tmp) * predicted
        if cnt == K:
            break
    ### END YOUR CODE

    return cost, gradPred, grad
def negSamplingCostAndGradient(predicted, target, outputVectors, dataset,
                               K=10):
    """ Negative sampling cost function for word2vec models """

    # Implement the cost and gradients for one predicted word vector
    # and one target word vector as a building block for word2vec
    # models, using the negative sampling technique. K is the sample
    # size. You might want to use dataset.sampleTokenIdx() to sample
    # a random word index.
    #
    # Note: See test_word2vec below for dataset's initialization.
    #
    # Input/Output Specifications: same as softmaxCostAndGradient

    ### YOUR CODE HERE
    grad = np.zeros(outputVectors.shape)

    # Collect the target index followed by K negative indices, resampling
    # whenever the target itself is drawn.
    indices = [target]
    for k in range(K):
        newindex = dataset.sampleTokenIdx()
        while newindex == target:
            newindex = dataset.sampleTokenIdx()
        indices += [newindex]

    # +1 label for the target row, -1 for each negative sample.
    labels = np.array([1] + [-1 for k in range(K)])
    vecs = outputVectors[indices, :]

    t = sigmoid(vecs.dot(predicted) * labels)
    cost = -np.sum(np.log(t))

    delta = labels * (t - 1)
    gradPred = delta.reshape((1, K + 1)).dot(vecs).flatten()
    gradtemp = delta.reshape((K + 1, 1)).dot(
        predicted.reshape((1, predicted.shape[0])))
    for k in range(K + 1):
        grad[indices[k]] += gradtemp[k, :]
    ### END YOUR CODE

    return cost, gradPred, grad
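# For reference, the trick used in the implementation above (and in the next
# one): give the target row a label of +1 and each negative row a label of -1;
# since sigmoid(-x) = 1 - sigmoid(x), the positive and negative terms collapse
# into
#
#   J = -sum_{k=0..K} log sigmoid(labels[k] * (outputVectors[indices[k]] . v_c))
#
# so one sigmoid over the stacked rows outputVectors[indices, :] produces every
# term at once, and delta = labels * (sigmoid(...) - 1) is the per-row weight
# that multiplies v_c (for grad) or the stacked rows (for gradPred).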
def negSamplingCostAndGradient(predicted, target, outputVectors, dataset,
                               K=10):
    """ Negative sampling cost function for word2vec models """

    # Implement the cost and gradients for one predicted word vector
    # and one target word vector as a building block for word2vec
    # models, using the negative sampling technique. K is the sample
    # size. You might want to use dataset.sampleTokenIdx() to sample
    # a random word index.
    #
    # Note: See test_word2vec below for dataset's initialization.
    #
    # Input/Output Specifications: same as softmaxCostAndGradient

    # YOUR CODE HERE
    gradPred = np.zeros(predicted.shape)
    grad = np.zeros(outputVectors.shape)

    # Collect the target index followed by K negative indices, resampling
    # whenever the target itself is drawn.
    indices = [target]
    for k in range(K):
        new_index = dataset.sampleTokenIdx()
        while new_index == target:
            new_index = dataset.sampleTokenIdx()
        indices += [new_index]

    # +1 label for the target row, -1 for each negative sample.
    sampling_labels = np.array([1] + [-1 for k in range(K)])
    vec_out = outputVectors[indices, :]

    likelihood = sigmoid(vec_out.dot(predicted) * sampling_labels)
    cost = -np.sum(np.log(likelihood))

    difference = sampling_labels * (likelihood - 1)
    gradPred = np.dot(difference.reshape((1, K + 1)), vec_out).flatten()

    tokens_len = predicted.shape[0]
    gradtemp = np.dot(difference.reshape((K + 1, 1)),
                      predicted.reshape((1, tokens_len)))
    for k in range(K + 1):
        grad[indices[k]] += gradtemp[k, :]
    # END YOUR CODE

    return cost, gradPred, grad
def negSamplingCostAndGradient(predicted, target, outputVectors, dataset,
                               K=10):
    """ Negative sampling cost function for word2vec models """

    # Implement the cost and gradients for one predicted word vector
    # and one target word vector as a building block for word2vec
    # models, using the negative sampling technique. K is the sample
    # size. You might want to use dataset.sampleTokenIdx() to sample
    # a random word index.
    #
    # Note: See test_word2vec below for dataset's initialization.
    #
    # Input/Output Specifications: same as softmaxCostAndGradient

    ### YOUR CODE HERE
    # Draw the K negative-sample indices.
    indices = [dataset.sampleTokenIdx() for k in range(K)]
    u_o = outputVectors[target, :]  # target plays the role of "o"; vectors are stored by row
    v_c = predicted

    ### Method 1: a slow, loop-based version
    ### average time: 0.000195302985752 s
    # sigma1 = sigmoid(np.dot(u_o, v_c))
    # cost = -np.log(sigma1)                 # negative-sampling cost
    # gradPred = u_o * (sigma1 - 1)          # gradient with respect to v_c
    # grad = np.zeros(outputVectors.shape)   # initialize grad
    #
    # for i in range(K):
    #     u_k = outputVectors[indices[i], :]
    #     sigma2 = sigmoid(-np.dot(u_k, v_c))
    #     cost = cost - np.log(sigma2)
    #     gradPred = gradPred + u_k * (1 - sigma2)   # gradient with respect to v_c
    #     grad[indices[i]] += v_c * (1 - sigma2)     # gradient with respect to u_k (k != o)
    #
    # grad[target, :] = grad[target, :] + v_c * (sigma1 - 1)  # do not forget the target word o

    u_k = outputVectors[indices, :]
    sigma1 = sigmoid(np.dot(u_o, v_c))
    sigma2 = sigmoid(-np.dot(u_k, v_c))

    grad = np.zeros(outputVectors.shape)
    cost = -np.log(sigma1) - np.sum(np.log(sigma2))  # negative-sampling cost
    gradPred = u_o * (sigma1 - 1) + np.dot((1 - sigma2).T, u_k)  # gradient with respect to v_c

    ### Method 2: replace part of the loop with matrix operations
    ### average time: 0.000179892113867 s
    ### Keep going! It only saved about 10% of the time.
    # for i in range(K):
    #     grad[indices[i], :] += v_c * (1 - sigma2)[i]

    ### Method 3: use np.outer
    ### average time: 0.000135194817627 s
    ### Amazing! It saved about 30% of the time.
    temp = np.outer(1 - sigma2, v_c)
    for i in range(K):
        grad[indices[i], :] += temp[i]

    ### Method 4: I tried to avoid the "for" loop entirely, but this fails
    ### because indices can contain repeated values, so fancy-indexed "+="
    ### updates each repeated row only once.
    # grad[indices, :] += np.tile(v_c, [len(sigma2), 1]) * (1 - sigma2)[:, None]

    grad[target, :] += v_c * (sigma1 - 1)  # do not forget the gradient of the target word o
    ### END YOUR CODE

    return cost, gradPred, grad
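# Method 4 above fails because fancy-indexed "+=" applies each duplicated index
# only once instead of accumulating. A sketch of one loop-free alternative (the
# helper name accumulate_negative_grads is mine, not part of the assignment):
# np.add.at performs unbuffered in-place addition, so repeated indices are
# accumulated correctly. Inside the function above, the Method 3 loop could be
# replaced by a single call, np.add.at(grad, indices, temp).
import numpy as np


def accumulate_negative_grads(outputVectors, indices, weights, v_c):
    """Return an array shaped like outputVectors with weights[i] * v_c added
    to row indices[i], accumulating over duplicate indices.

    weights corresponds to (1 - sigma2) in the vectorized code above."""
    grad = np.zeros(outputVectors.shape)
    np.add.at(grad, indices, np.outer(weights, v_c))
    return grad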