def initialize():
    # Reset the random seed to make sure that everyone gets the same results
    random.seed(314)
    dataset = StanfordSentiment()
    tokens = dataset.tokens()
    print(tokens)
    json.dump(tokens, open("tokens.json", "w"))
    nWords = len(tokens)
    print(nWords, "words")

    # We are going to train 50-dimensional vectors for this assignment
    dimVectors = 50
    EPOCH = 100

    # Context size
    C = 5

    # Reset the random seed to make sure that everyone gets the same results
    random.seed(31415)
    np.random.seed(9265)

    in_glove = 0
    tokenWords = list(tokens.keys())  # avoid rebuilding the key list on every iteration
    wordVectors = np.zeros((2 * nWords, dimVectors))
    # Center vectors: take the GloVe vector when the word is in its vocabulary,
    # otherwise fall back to small random initialization
    for i in range(nWords):
        if tokenWords[i] in wv_from_bin.vocab.keys():
            wordVectors[i] = np.array(wv_from_bin.word_vec(tokenWords[i]))
            in_glove += 1
        else:
            wordVectors[i] = (np.random.rand(1, dimVectors) - 0.5) / dimVectors
    # Outside vectors: GloVe when available, zeros otherwise
    for i in range(nWords, 2 * nWords):
        if tokenWords[i - nWords] in wv_from_bin.vocab.keys():
            wordVectors[i] = np.array(wv_from_bin.word_vec(tokenWords[i - nWords]))
    print(wordVectors)
    print(in_glove, "in GloVe")

    wordVectors = sgd(lambda vec: word2vec_sgd_wrapper(
        skipgram, tokens, vec, dataset, C, negSamplingLossAndGradient),
        wordVectors, 0.3, EPOCH, None, True, PRINT_EVERY=1)
    # Note that normalization is not called here. This is not a bug,
    # normalizing during training loses the notion of length.
    print("sanity check: cost at convergence should be around or below 10")

    # concatenate the input and output word vectors
    wordVectors = np.concatenate(
        (wordVectors[:nWords, :], wordVectors[nWords:, :]),
        axis=0)
    print(wordVectors.shape)

# %%
np.save("wordVectors", wordVectors)
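# The snippet above assumes a preloaded `wv_from_bin`. A minimal sketch of how
# it might be obtained, assuming the gensim downloader and 50-d GloVe vectors
# to match dimVectors (the model name is an assumption, not from the source):
import gensim.downloader as api

wv_from_bin = api.load("glove-wiki-gigaword-50")
# Note: `wv_from_bin.vocab` in the snippet above implies gensim < 4.0; in
# gensim 4.x the equivalent lookup is `wv_from_bin.key_to_index`.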
def train(dimVectors=10, C=5, lr=0.3):
    random.seed(31415)
    np.random.seed(9265)
    startTime = time.time()

    # init word vectors of size 2V x D: center vectors first, then outside vectors
    wordVectors = np.concatenate(
        ((np.random.rand(nWords, dimVectors) - 0.5) / dimVectors,
         np.zeros((nWords, dimVectors))),
        axis=0)

    # sgd(f, x0, step, iterations, postprocessing=None, useSaved=False, PRINT_EVERY=10)
    wordVectors = sgd(
        lambda vec: word2vec_sgd_wrapper(skipgram, word2Ind, vec, dataset, C,
                                         negSamplingLossAndGradient),
        wordVectors,
        step=lr,  # was hard-coded to 0.3, ignoring the lr argument
        iterations=40000,
        postprocessing=None,
        useSaved=True,
        PRINT_EVERY=10)
    print("sanity check: cost at convergence should be around or below 10")
    print("training took %d seconds" % (time.time() - startTime))
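# Every snippet here leans on an `sgd` helper. A minimal sketch matching the
# signature documented in the comment above; the annealing schedule and loss
# smoothing are assumptions, and checkpointing via useSaved is omitted.
def sgd(f, x0, step, iterations, postprocessing=None, useSaved=False,
        PRINT_EVERY=10):
    ANNEAL_EVERY = 20000  # halve the step size periodically (assumed schedule)
    x = x0.copy()
    exploss = None
    for it in range(1, iterations + 1):
        loss, grad = f(x)      # f returns (loss, gradient) at the current x
        x -= step * grad       # plain gradient step
        if postprocessing is not None:
            x = postprocessing(x)
        # exponentially smoothed loss, only for progress reporting
        exploss = loss if exploss is None else 0.95 * exploss + 0.05 * loss
        if it % PRINT_EVERY == 0:
            print("iter %d: %f" % (it, exploss))
        if it % ANNEAL_EVERY == 0:
            step *= 0.5
    return x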
# Reset the random seed to make sure that everyone gets the same results
random.seed(31415)
np.random.seed(9265)
startTime = time.time()
wordVectors = np.concatenate(
    ((np.random.rand(nWords, dimVectors) - 0.5) / dimVectors,
     np.zeros((nWords, dimVectors))),
    axis=0)
print(nWords)
print(wordVectors.shape)
wordVectors = sgd(lambda vec: word2vec_sgd_wrapper(
    skipgram, tokens, vec, dataset, C, negSamplingLossAndGradient),
    wordVectors, 0.3, 40000, None, True, PRINT_EVERY=10)
print(wordVectors.shape)
# Note that normalization is not called here. This is not a bug,
# normalizing during training loses the notion of length.
print("sanity check: cost at convergence should be around or below 10")
print("training took %d seconds" % (time.time() - startTime))

# concatenate the input and output word vectors
wordVectors = np.concatenate(
    (wordVectors[:nWords, :], wordVectors[nWords:, :]),
    axis=0)

visualizeWords = [
def train_wordvector(dimVectors, C=5, treebank=StanfordSentiment):
    '''
    Train word vectors on the Stanford Sentiment Treebank.
    :param dimVectors: dimension of the word vectors to train
    :param C: context size
    :return:
    '''
    # Reset the random seed to make sure that everyone gets the same results
    random.seed(314)
    dataset = treebank()
    tokens = dataset.tokens()
    nWords = len(tokens)

    random.seed(31415)
    np.random.seed(9265)
    startTime = time.time()
    wordVectors = np.concatenate(
        ((np.random.rand(nWords, dimVectors) - 0.5) / dimVectors,
         np.zeros((nWords, dimVectors))),
        axis=0)
    wordVectors = sgd(lambda vec: word2vec_sgd_wrapper(
        skipgram, tokens, vec, dataset, C, negSamplingCostAndGradient),
        wordVectors, 0.3, 40000, "saved_params", None, True, PRINT_EVERY=10)
    # Note that normalization is not called here. This is not a bug,
    # normalizing during training loses the notion of length.
    print("sanity check: cost at convergence should be around or below 10")
    print("training took %d seconds" % (time.time() - startTime))

    # concatenate the input and output word vectors
    wordVectors = np.concatenate(
        (wordVectors[:nWords, :], wordVectors[nWords:, :]),
        axis=0)
    # wordVectors = wordVectors[:nWords, :] + wordVectors[nWords:, :]

    visualizeWords = [
        "the", "a", "an", ",", ".", "?", "!", "``", "''", "--",
        "good", "great", "cool", "brilliant", "wonderful", "well", "amazing",
        "worth", "sweet", "enjoyable", "boring", "bad", "waste", "dumb",
        "annoying"]

    visualizeIdx = [tokens[word] for word in visualizeWords]
    visualizeVecs = wordVectors[visualizeIdx, :]
    # Project onto the top two principal components for a 2-D plot
    temp = (visualizeVecs - np.mean(visualizeVecs, axis=0))
    covariance = 1.0 / len(visualizeIdx) * temp.T.dot(temp)
    U, S, V = np.linalg.svd(covariance)
    coord = temp.dot(U[:, 0:2])

    for i in range(len(visualizeWords)):
        plt.text(coord[i, 0], coord[i, 1], visualizeWords[i],
                 bbox=dict(facecolor='green', alpha=0.1))

    plt.xlim((np.min(coord[:, 0]), np.max(coord[:, 0])))
    plt.ylim((np.min(coord[:, 1]), np.max(coord[:, 1])))

    plt.savefig('q3_word_vectors.png')
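# The SVD-on-covariance block above is principal component analysis done by
# hand. For reference, an equivalent projection with scikit-learn (an
# alternative, not the assignment's code); fit_transform centers the data and
# projects onto the top two components, matching `coord` up to sign:
import numpy as np
from sklearn.decomposition import PCA

vecs = np.random.rand(25, 10)  # stand-in for visualizeVecs
coord2 = PCA(n_components=2).fit_transform(vecs)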
dimensions = [input_dim, hidden_dim, output_dim]
params = np.random.randn(
    (input_dim + 1) * hidden_dim + (hidden_dim + 1) * output_dim, )
print("#params: " + str(len(params)))
print("#train examples: " + str(num_of_examples))

# run SGD
# This optimization was added here so that we do not convert
# our data structure on each iteration
num_to_word_embedding = np.array(num_to_word_embedding)
in_word_index = np.array(in_word_index)
out_word_index = np.array(out_word_index)
params = sgd(
    lambda vec: lm_wrapper(in_word_index, out_word_index,
                           num_to_word_embedding, dimensions, vec),
    params, LEARNING_RATE, NUM_OF_SGD_ITERATIONS, None, True, 1000)
print("training took %d seconds" % (time.time() - startTime))

# Evaluate perplexity with dev-data
perplexity = eval_neural_lm('data/lm/ptb-dev.txt')
print("dev perplexity : " + str(perplexity))

# Evaluate perplexity with test-data (only at test time!)
if os.path.exists('data/lm/ptb-test.txt'):
    perplexity = eval_neural_lm('data/lm/ptb-test.txt')
    print("test perplexity : " + str(perplexity))
else:
    print("test perplexity will be evaluated only at test time!")
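# `lm_wrapper` receives the parameters as one flat vector. A sketch of how it
# presumably unpacks them, given the sizing formula above: (input_dim + 1) *
# hidden_dim covers W1 plus its bias, (hidden_dim + 1) * output_dim covers W2
# plus its bias. Names and layout here are assumptions.
def unpack_params(params, dimensions):
    Di, H, Do = dimensions
    ofs = 0
    W1 = params[ofs:ofs + Di * H].reshape(Di, H)
    ofs += Di * H
    b1 = params[ofs:ofs + H].reshape(1, H)
    ofs += H
    W2 = params[ofs:ofs + H * Do].reshape(H, Do)
    ofs += H * Do
    b2 = params[ofs:ofs + Do].reshape(1, Do)
    return W1, b1, W2, b2

# e.g. dimensions = [10, 20, 5] gives (10 + 1) * 20 + (20 + 1) * 5 = 325 entries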
dimVectors = int(sys.argv[2])
random.seed(314)

config = configparser.ConfigParser(
    interpolation=configparser.ExtendedInterpolation())
configPath = 'bi-config.ini'
config.read(configPath)
for section in config.sections():
    if section != sys.argv[1]:
        continue
    dataset = Causal(configPath, section)
    causenWords = len(dataset.causeprior)
    effectnWords = len(dataset.effectprior)
    datasets_dir = config.get(section, "datasets_dir")

    # context size
    C = 5

    params_dir = datasets_dir + "/dim=" + str(dimVectors)
    if not os.path.exists(params_dir):
        os.makedirs(params_dir)

    # Reset the random seed to make sure that everyone gets the same results
    random.seed(31415)
    np.random.seed(9265)
    wordVectors = np.concatenate(
        ((np.random.rand(causenWords, dimVectors) - .5) / dimVectors,
         np.zeros((effectnWords, dimVectors))),
        axis=0)
    wordVectors0 = sgd(
        lambda vec: word2vec_sgd_wrapper(cskipgram, vec, dataset, C,
                                         negSamplingCostAndGradient),
        wordVectors, params_dir, 0.3, 100000, None, True, PRINT_EVERY=100)
    print("sanity check: cost at convergence should be around or below 10")
tokens = dataset.tokens()
nWords = len(tokens)

# We are going to train 10-dimensional vectors for this assignment
dimVectors = 10

# Context size
C = 5

# Train word vectors (this could take a while!)

# Reset the random seed to make sure that everyone gets the same results
random.seed(31415)
np.random.seed(9265)
wordVectors = np.concatenate(
    ((np.random.rand(nWords, dimVectors) - .5) / dimVectors,
     np.zeros((nWords, dimVectors))),
    axis=0)
wordVectors0 = sgd(
    lambda vec: word2vec_sgd_wrapper(skipgram, tokens, vec, dataset, C,
                                     negSamplingCostAndGradient),
    wordVectors, 0.3, 40000, None, False, PRINT_EVERY=10)
# sanity check: cost at convergence should be around or below 10

# sum the input and output word vectors
wordVectors = wordVectors0[:nWords, :] + wordVectors0[nWords:, :]

print("\n=== For autograder ===")
checkWords = ["the", "a", "an", "movie", "ordinary", "but", "and"]
checkIdx = [tokens[word] for word in checkWords]
checkVecs = wordVectors[checkIdx, :]
print(checkVecs)
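# `word2vec_sgd_wrapper` is the other helper assumed throughout these
# snippets. A sketch of the usual assignment version: sample a small batch of
# (center word, context) pairs and average the skip-gram loss and gradient
# over the batch (the batch size of 50 is an assumption).
import random
import numpy as np

def word2vec_sgd_wrapper(word2vecModel, word2Ind, wordVectors, dataset, C,
                         word2vecLossAndGradient):
    batchsize = 50
    loss = 0.0
    grad = np.zeros(wordVectors.shape)
    N = wordVectors.shape[0]
    centerWordVectors = wordVectors[:N // 2, :]  # first half: center vectors
    outsideVectors = wordVectors[N // 2:, :]     # second half: outside vectors
    for _ in range(batchsize):
        windowSize = random.randint(1, C)
        centerWord, context = dataset.getRandomContext(windowSize)
        c, gin, gout = word2vecModel(centerWord, windowSize, context,
                                     word2Ind, centerWordVectors,
                                     outsideVectors, dataset,
                                     word2vecLossAndGradient)
        loss += c / batchsize
        grad[:N // 2, :] += gin / batchsize
        grad[N // 2:, :] += gout / batchsize
    return loss, grad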
wordVectors = np.concatenate(
    ((np.random.rand(nWords, dimVectors) - 0.5) / dimVectors,
     np.zeros((nWords, dimVectors))),
    axis=0)

if cfg.normalization:
    # row-normalize the vectors to unit length after every update
    postprocess = lambda U: U / (np.sqrt(np.sum(U * U, axis=1)) + 1e-6
                                 ).reshape(-1, 1)
else:
    postprocess = None

if cfg.negSample:
    wordVectors = sgd(lambda vec: word2vec_sgd_wrapper(
        skipgram, tokens, vec, dataset, C, negSamplingLossAndGradient),
        wordVectors, 0.3, cfg.max_iteration, postprocess, True,
        PRINT_EVERY=10)
else:
    wordVectors = sgd(lambda vec: word2vec_sgd_wrapper(
        skipgram, tokens, vec, dataset, C,
        word2vecLossAndGradient=naiveSoftmaxLossAndGradient),
        wordVectors, 0.3, cfg.max_iteration, postprocess, True,
        PRINT_EVERY=10)
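# A quick, self-contained illustration of the normalization postprocess
# defined above: each row is rescaled to (approximately) unit L2 norm, so dot
# products between rows become cosine similarities.
import numpy as np

U = np.array([[3.0, 4.0], [1.0, 0.0]])
U_norm = U / (np.sqrt(np.sum(U * U, axis=1)) + 1e-6).reshape(-1, 1)
# rows of U_norm are approximately [0.6, 0.8] and [1.0, 0.0]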
random.seed(3141)
np.random.seed(59265)
weights = np.random.randn(dimVectors, 5)

trainset = dataset.getTrainSentences()
nTrain = len(trainset)
trainFeatures = np.zeros((nTrain, dimVectors))
trainLabels = np.zeros((nTrain,), dtype=np.int32)
for i in range(nTrain):
    words, trainLabels[i] = trainset[i]
    trainFeatures[i, :] = getSentenceFeature(tokens, wordVectors, words)
    # print(trainFeatures[i, :])

# We will do batch optimization
weights = sgd(lambda weights: softmax_wrapper(trainFeatures, trainLabels,
                                              weights, regularization),
              weights, 3.0, 10000, PRINT_EVERY=100)

# Prepare dev set features
devset = dataset.getDevSentences()
nDev = len(devset)
devFeatures = np.zeros((nDev, dimVectors))
devLabels = np.zeros((nDev,), dtype=np.int32)
for i in range(nDev):
    words, devLabels[i] = devset[i]
    devFeatures[i, :] = getSentenceFeature(tokens, wordVectors, words)

_, _, pred = softmaxRegression(devFeatures, devLabels, weights)
print("Dev precision (%%): %f" % precision(devLabels, pred))
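# `getSentenceFeature` is not shown in these snippets; the standard choice in
# this assignment is the average of the word vectors of the sentence's words.
# A minimal sketch under that assumption:
import numpy as np

def getSentenceFeature(tokens, wordVectors, words):
    # tokens maps word -> row index in wordVectors; average the rows
    sentVector = np.zeros((wordVectors.shape[1],))
    for word in words:
        sentVector += wordVectors[tokens[word], :]
    return sentVector / len(words)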
# Context size
C = 5

# Reset the random seed to make sure that everyone gets the same results
# random.seed(31415)
# np.random.seed(9265)
start_time = time.time()
word_vectors = np.concatenate(
    ((np.random.rand(n_words, dim_vectors) - 0.5) / dim_vectors,
     np.zeros((n_words, dim_vectors))),
    axis=0)
word_vectors = sgd(lambda vec: word2vec_sgd_wrapper(
    skipgram, tokens, vec, dataset, C, naive_softmax_loss_and_gradient),
    word_vectors, 0.3, 40000, None, False, PRINT_EVERY=10)
# Note that normalization is not called here. This is not a bug,
# normalizing during training loses the notion of length.
print("sanity check: cost at convergence should be around or below 10")
print("training took %d seconds" % (time.time() - start_time))

# concatenate the input and output word vectors
word_vectors = np.concatenate(
    (word_vectors[:n_words, :], word_vectors[n_words:, :]),
    axis=0)

visualize_words = [
    "great", "cool", "brilliant", "wonderful", "well", "amazing", "worth",
# np.random.seed(9265)
startTime = time.time()

# The idea: input (center) vectors start random, output vectors start as zeros
# randomStartVector = (np.random.rand(nWords, dimVectors) - 0.5)
# zerosVector = np.zeros((nWords, dimVectors))
# wordVectors = np.concatenate((randomStartVector / dimVectors, zerosVector), axis=0)
# print(wordVectors)

# Training -- the key step
wordVectors = sgd(
    lambda vec: word2vec_sgd_wrapper(skipgram, tokens, vec, dataset, C,
                                     negSamplingCostAndGradient),
    wordVectors,
    0.3,   # step size
    2000,  # iterations (originally 40000)
    None,
    True,
    PRINT_EVERY=10)

# What is going on here, concatenating the halves back together??? I flipped
# the order instead.
# concatenate the input and output word vectors
# print('wordVectors before')
# print(wordVectors)
# wordVectors = np.concatenate(
#     (wordVectors[:nWords, :], wordVectors[nWords:, :]),
#     axis=0)
wordVectors = np.concatenate(
    (wordVectors[nWords:, :], wordVectors[:nWords, :]),
    axis=0)
C = 5

# Reset the random seed to make sure that everyone gets the same results
random.seed(31415)
np.random.seed(9265)
startTime = time.time()
wordVectors = np.concatenate(
    ((np.random.rand(nWords, dimVectors) - 0.5) / dimVectors,
     np.zeros((nWords, dimVectors))),
    axis=0)
# f is the lambda below and x0 is wordVectors; the lambda calls
# word2vec_sgd_wrapper with vec bound to the current parameter vector
wordVectors = sgd(
    lambda vec: word2vec_sgd_wrapper(
        skipgram, tokens, vec, dataset, C, negSamplingLossAndGradient),
    wordVectors, 0.3, 40000, None, True, PRINT_EVERY=10)
# Note that normalization is not called here. This is not a bug,
# normalizing during training loses the notion of length.
print("sanity check: cost at convergence should be around or below 10")
print("training took %d seconds" % (time.time() - startTime))

# concatenate the input and output word vectors
wordVectors = np.concatenate(
    (wordVectors[:nWords, :], wordVectors[nWords:, :]),
    axis=0)

visualizeWords = [