Example #1
def initialize():
    # Reset the random seed to make sure that everyone gets the same results
    random.seed(314)
    dataset = StanfordSentiment()
    tokens = dataset.tokens()
    print(tokens)
    json.dump(tokens, open("tokens.json", "w"))

    nWords = len(tokens)
    print(nWords, "words")

    # We are going to train 50-dimensional vectors for this assignment
    dimVectors = 50
    EPOCH = 100

    # Context size
    C = 5
    # Reset the random seed to make sure that everyone gets the same results
    random.seed(31415)
    np.random.seed(9265)
    in_glove = 0
    wordVectors = np.zeros((2 * nWords, dimVectors))

    # Initialize center-word vectors from pretrained GloVe where available,
    # otherwise with small random values.
    tokenList = list(tokens.keys())
    for i in range(nWords):
        if tokenList[i] in wv_from_bin.vocab:
            wordVectors[i] = np.array(wv_from_bin.word_vec(tokenList[i]))
            in_glove += 1
        else:
            wordVectors[i] = (np.random.rand(1, dimVectors) - 0.5) / dimVectors

    # Initialize outside-word vectors the same way, leaving zeros for words not in GloVe.
    for i in range(nWords, 2 * nWords):
        if tokenList[i - nWords] in wv_from_bin.vocab:
            wordVectors[i] = np.array(wv_from_bin.word_vec(tokenList[i - nWords]))

    print(wordVectors)
    print(in_glove, "words found in GloVe")

    wordVectors = sgd(lambda vec: word2vec_sgd_wrapper(
        skipgram, tokens, vec, dataset, C, negSamplingLossAndGradient),
                      wordVectors,
                      0.3,
                      EPOCH,
                      None,
                      True,
                      PRINT_EVERY=1)
    # Note that normalization is not called here. This is not a bug,
    # normalizing during training loses the notion of length.

    print("sanity check: cost at convergence should be around or below 10")

    # concatenate the input and output word vectors
    wordVectors = np.concatenate(
        (wordVectors[:nWords, :], wordVectors[nWords:, :]), axis=0)
    print(wordVectors.shape)
    # %%
    np.save("wordVectors", wordVectors)
Example #2
def train(dimVectors=10, C=5, lr=0.3):
    random.seed(31415)
    np.random.seed(9265)
    startTime = time.time()

    # Initialize word vectors of size 2V x D: center vectors first, then outside vectors
    wordVectors = np.concatenate(((np.random.rand(nWords, dimVectors) - 0.5) / dimVectors,
                                  np.zeros((nWords, dimVectors))),
                                 axis=0)
    # sgd(f, x0, step, iterations, postprocessing=None, useSaved=False, PRINT_EVERY=10)
    wordVectors = sgd(
        lambda vec: word2vec_sgd_wrapper(skipgram, word2Ind, vec, dataset,
                                         C, negSamplingLossAndGradient),
        wordVectors,
        step=lr,
        iterations=40000,
        postprocessing=None,
        useSaved=True,
        PRINT_EVERY=10
    )

    print("sanity check: cost at convergence should be around or below 10")
    print("training took %d seconds" % (time.time() - startTime))
Example #3
# Reset the random seed to make sure that everyone gets the same results
random.seed(31415)
np.random.seed(9265)

startTime = time.time()
wordVectors = np.concatenate(
    ((np.random.rand(nWords, dimVectors) - 0.5) / dimVectors,
     np.zeros((nWords, dimVectors))),
    axis=0)
print(nWords)
print(wordVectors.shape)
wordVectors = sgd(lambda vec: word2vec_sgd_wrapper(
    skipgram, tokens, vec, dataset, C, negSamplingLossAndGradient),
                  wordVectors,
                  0.3,
                  40000,
                  None,
                  True,
                  PRINT_EVERY=10)
print(wordVectors.shape)
# Note that normalization is not called here. This is not a bug,
# normalizing during training loses the notion of length.

print("sanity check: cost at convergence should be around or below 10")
print("training took %d seconds" % (time.time() - startTime))

# concatenate the input and output word vectors
wordVectors = np.concatenate(
    (wordVectors[:nWords, :], wordVectors[nWords:, :]), axis=0)

visualizeWords = [
Example #4
def train_wordvector(dimVectors, C=5, treebank=StanfordSentiment):
    '''
    Train word vectors on the Stanford Sentiment Treebank.
    :param dimVectors: dimensionality of the word vectors to train
    :param C: context window size
    :param treebank: dataset class to instantiate (defaults to StanfordSentiment)
    :return: None; saves a 2-D visualization of selected word vectors to q3_word_vectors.png
    '''

    # Reset the random seed to make sure that everyone gets the same results
    random.seed(314)
    dataset = treebank()
    tokens = dataset.tokens()
    nWords = len(tokens)

    random.seed(31415)
    np.random.seed(9265)

    startTime = time.time()
    wordVectors = np.concatenate(
        ((np.random.rand(nWords, dimVectors) - 0.5) / dimVectors,
         np.zeros((nWords, dimVectors))),
        axis=0)

    wordVectors = sgd(lambda vec: word2vec_sgd_wrapper(
        skipgram, tokens, vec, dataset, C, negSamplingCostAndGradient),
                      wordVectors,
                      0.3,
                      40000,
                      None,
                      True,
                      PRINT_EVERY=10)

    # Note that normalization is not called here. This is not a bug,
    # normalizing during training loses the notion of length.

    print("sanity check: cost at convergence should be around or below 10")
    print("training took %d seconds" % (time.time() - startTime))

    # concatenate the input and output word vectors
    wordVectors = np.concatenate(
        (wordVectors[:nWords, :], wordVectors[nWords:, :]), axis=0)
    # wordVectors = wordVectors[:nWords,:] + wordVectors[nWords:,:]

    visualizeWords = [
        "the", "a", "an", ",", ".", "?", "!", "``", "''", "--", "good",
        "great", "cool", "brilliant", "wonderful", "well", "amazing", "worth",
        "sweet", "enjoyable", "boring", "bad", "waste", "dumb", "annoying"
    ]

    visualizeIdx = [tokens[word] for word in visualizeWords]
    visualizeVecs = wordVectors[visualizeIdx, :]
    temp = (visualizeVecs - np.mean(visualizeVecs, axis=0))
    covariance = 1.0 / len(visualizeIdx) * temp.T.dot(temp)
    U, S, V = np.linalg.svd(covariance)
    coord = temp.dot(U[:, 0:2])

    for i in range(len(visualizeWords)):
        plt.text(coord[i, 0],
                 coord[i, 1],
                 visualizeWords[i],
                 bbox=dict(facecolor='green', alpha=0.1))

    plt.xlim((np.min(coord[:, 0]), np.max(coord[:, 0])))
    plt.ylim((np.min(coord[:, 1]), np.max(coord[:, 1])))

    plt.savefig('q3_word_vectors.png')
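The visualization above is PCA via an SVD of the covariance matrix: center the selected vectors, estimate their covariance, take the top two principal directions, and project onto them. The same computation factored into a standalone helper for clarity (project_2d is a name introduced here, not part of the assignment code):

import numpy as np

def project_2d(vecs):
    # Center the rows, estimate their covariance, then project onto the top two
    # principal directions obtained from an SVD of the covariance matrix.
    centered = vecs - np.mean(vecs, axis=0)
    covariance = centered.T.dot(centered) / len(vecs)
    U, S, V = np.linalg.svd(covariance)
    return centered.dot(U[:, 0:2])

# coord = project_2d(wordVectors[visualizeIdx, :]) reproduces the coordinates plotted above.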
Example #5
File: neural_lm.py  Project: oriyor/NLP
    dimensions = [input_dim, hidden_dim, output_dim]
    params = np.random.randn(
        (input_dim + 1) * hidden_dim + (hidden_dim + 1) * output_dim, )
    print "#params: " + str(len(params))
    print "#train examples: " + str(num_of_examples)

    # run SGD

    # This optimization was added here so that we do not convert
    # our data structure on each iteration
    num_to_word_embedding = np.array(num_to_word_embedding)
    in_word_index = np.array(in_word_index)
    out_word_index = np.array(out_word_index)

    params = sgd(
        lambda vec: lm_wrapper(in_word_index, out_word_index,
                               num_to_word_embedding, dimensions, vec), params,
        LEARNING_RATE, NUM_OF_SGD_ITERATIONS, None, True, 1000)

    print "training took %d seconds" % (time.time() - startTime)

    # Evaluate perplexity with dev-data
    perplexity = eval_neural_lm('data/lm/ptb-dev.txt')
    print "dev perplexity : " + str(perplexity)

    # Evaluate perplexity with test-data (only at test time!)
    if os.path.exists('data/lm/ptb-test.txt'):
        perplexity = eval_neural_lm('data/lm/ptb-test.txt')
        print "test perplexity : " + str(perplexity)
    else:
        print "test perplexity will be evaluated only at test time!"
Example #6
dimVectors = int(sys.argv[2])

random.seed(314)
config = configparser.ConfigParser(interpolation=configparser.ExtendedInterpolation())
configPath = 'bi-config.ini'
config.read(configPath)

for section in config.sections():
    if section != sys.argv[1]: continue

    dataset = Causal(configPath, section)
    causenWords = len(dataset.causeprior)
    effectnWords = len(dataset.effectprior)

    datasets_dir = config.get(section, "datasets_dir")

    # context size
    C = 5
    params_dir = datasets_dir + "/dim=" + str(dimVectors)
    if not os.path.exists(params_dir):
        os.makedirs(params_dir)
    # Reset the random seed to make sure that everyone gets the same results
    random.seed(31415)
    np.random.seed(9265)
    wordVectors = np.concatenate(((np.random.rand(causenWords, dimVectors) - 0.5) / dimVectors,
                                  np.zeros((effectnWords, dimVectors))), axis=0)
    wordVectors0 = sgd(
        lambda vec: word2vec_sgd_wrapper(cskipgram, vec, dataset, C,
                                         negSamplingCostAndGradient),
        wordVectors, params_dir, 0.3, 100000, None, True, PRINT_EVERY=100)
    print("sanity check: cost at convergence should be around or below 10")
Example #7
File: train.py  Project: 124399839/DLNLP
tokens = dataset.tokens()
nWords = len(tokens)

# We are going to train 10-dimensional vectors for this assignment
dimVectors = 10

# Context size
C = 5

# Train word vectors (this could take a while!)

# Reset the random seed to make sure that everyone gets the same results
random.seed(31415)
np.random.seed(9265)
wordVectors = np.concatenate(((np.random.rand(nWords, dimVectors) - .5) / dimVectors, 
                              np.zeros((nWords, dimVectors))), axis=0)
wordVectors0 = sgd(lambda vec: word2vec_sgd_wrapper(skipgram, tokens, vec, dataset, C, negSamplingCostAndGradient), 
                   wordVectors, 0.3, 40000, None, False, PRINT_EVERY=10)
# sanity check: cost at convergence should be around or below 10

# sum the input and output word vectors
wordVectors = (wordVectors0[:nWords,:] + wordVectors0[nWords:,:])

print "\n=== For autograder ==="
checkWords = ["the", "a", "an", "movie", "ordinary", "but", "and"]
checkIdx = [tokens[word] for word in checkWords]
checkVecs = wordVectors[checkIdx, :]
print(checkVecs)


Example #8
wordVectors = np.concatenate(
    ((np.random.rand(nWords, dimVectors) - 0.5) / dimVectors,
     np.zeros((nWords, dimVectors))),
    axis=0)

if cfg.normalization:
    postprocess = lambda U: U / (np.sqrt(np.sum(U * U, axis=1)) + 1e-6
                                 ).reshape(-1, 1)
else:
    postprocess = None

if cfg.negSample:
    wordVectors = sgd(lambda vec: word2vec_sgd_wrapper(
        skipgram, tokens, vec, dataset, C, negSamplingLossAndGradient),
                      wordVectors,
                      0.3,
                      cfg.max_iteration,
                      postprocess,
                      True,
                      PRINT_EVERY=10)
else:
    wordVectors = sgd(lambda vec: word2vec_sgd_wrapper(
        skipgram,
        tokens,
        vec,
        dataset,
        C,
        word2vecLossAndGradient=naiveSoftmaxLossAndGradient),
                      wordVectors,
                      0.3,
                      cfg.max_iteration,
                      postprocess,
                      True,
                      PRINT_EVERY=10)
Example #9
    random.seed(3141)
    np.random.seed(59265)
    weights = np.random.randn(dimVectors, 5)

    trainset = dataset.getTrainSentences()
    nTrain = len(trainset)
    trainFeatures = np.zeros((nTrain, dimVectors))
    trainLabels = np.zeros((nTrain,), dtype=np.int32)

    for i in range(nTrain):
        words, trainLabels[i] = trainset[i]
        trainFeatures[i, :] = getSentenceFeature(tokens, wordVectors, words)
        # print(trainFeatures[i, :])

    # We will do batch optimization
    weights = sgd(lambda weights: softmax_wrapper(trainFeatures, trainLabels,
                                                  weights, regularization),
                  weights, 3.0, 10000, PRINT_EVERY=100)

    # Prepare dev set features
    devset = dataset.getDevSentences()
    nDev = len(devset)
    devFeatures = np.zeros((nDev, dimVectors))
    devLabels = np.zeros((nDev,), dtype=np.int32)

    for i in range(nDev):
        words, devLabels[i] = devset[i]
        devFeatures[i, :] = getSentenceFeature(tokens, wordVectors, words)

    _, _, pred = softmaxRegression(devFeatures, devLabels, weights)
    print("Dev precision (%%): %f" % precision(devLabels, pred))

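Examples #9 and #11 build their classifier inputs with getSentenceFeature before running softmax regression. In the original assignment this feature is simply the average of the word vectors of the words in the sentence; a minimal sketch under that assumption:

import numpy as np

def getSentenceFeature(tokens, wordVectors, sentence):
    # Bag-of-words feature: average the vectors of the words in the sentence.
    sentVector = np.zeros((wordVectors.shape[1],))
    for word in sentence:
        sentVector += wordVectors[tokens[word], :]
    return sentVector / len(sentence)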
Example #10
# Context size
C = 5

# Reset the random seed to make sure that everyone gets the same results
# random.seed(31415)
# np.random.seed(9265)

start_time = time.time()
word_vectors = np.concatenate(
    ((np.random.rand(n_words, dim_vectors) - 0.5) / dim_vectors,
     np.zeros((n_words, dim_vectors))),
    axis=0)
word_vectors = sgd(lambda vec: word2vec_sgd_wrapper(
    skipgram, tokens, vec, dataset, C, naive_softmax_loss_and_gradient),
                   word_vectors,
                   0.3,
                   40000,
                   None,
                   False,
                   PRINT_EVERY=10)
# Note that normalization is not called here. This is not a bug,
# normalizing during training loses the notion of length.

print("sanity check: cost at convergence should be around or below 10")
print("training took %d seconds" % (time.time() - start_time))

# concatenate the input and output word vectors
word_vectors = np.concatenate(
    (word_vectors[:n_words, :], word_vectors[n_words:, :]), axis=0)

visualize_words = [
    "great", "cool", "brilliant", "wonderful", "well", "amazing", "worth",
Example #11
    np.random.seed(59265)
    weights = np.random.randn(dimVectors, 5)

    trainset = dataset.getTrainSentences()
    nTrain = len(trainset)
    trainFeatures = np.zeros((nTrain, dimVectors))
    trainLabels = np.zeros((nTrain, ), dtype=np.int32)

    for i in range(nTrain):
        words, trainLabels[i] = trainset[i]
        trainFeatures[i, :] = getSentenceFeature(tokens, wordVectors, words)
        # print(trainFeatures[i, :])
    # We will do batch optimization
    weights = sgd(lambda weights: softmax_wrapper(trainFeatures, trainLabels,
                                                  weights, regularization),
                  weights,
                  3.0,
                  10000,
                  PRINT_EVERY=100)

    # Prepare dev set features
    devset = dataset.getDevSentences()
    nDev = len(devset)
    devFeatures = np.zeros((nDev, dimVectors))
    devLabels = np.zeros((nDev, ), dtype=np.int32)

    for i in range(nDev):
        words, devLabels[i] = devset[i]
        devFeatures[i, :] = getSentenceFeature(tokens, wordVectors, words)

    _, _, pred = softmaxRegression(devFeatures, devLabels, weights)
    print "Dev precision (%%): %f" % precision(devLabels, pred)
Example #12
# np.random.seed(9265)

startTime = time.time()

# Meaning: the input (center) vectors start out random, the output (outside) vectors start as zeros
# randomStartVector = (np.random.rand(nWords, dimVectors) - 0.5)
# zerosVector = np.zeros((nWords, dimVectors))
# wordVectors = np.concatenate((randomStartVector/dimVectors, zerosVector),axis=0)
# print(wordVectors)

# Training: this is the key step
wordVectors = sgd(
    lambda vec: word2vec_sgd_wrapper(skipgram, tokens, vec, dataset, C,
                                     negSamplingCostAndGradient),
    wordVectors,
    0.3,  # step size
    2000,  # iterations: 2000 here (the assignment default is 40000)
    None,
    True,
    PRINT_EVERY=10)

# What is this doing, concatenating them back together??? I reversed the order here
# concatenate the input and output word vectors
# print('wordVectors before')
# print(wordVectors)
# wordVectors = np.concatenate(
#     (wordVectors[:nWords,:], wordVectors[nWords:,:]),
#     axis=0)
wordVectors = np.concatenate(
    (wordVectors[nWords:, :], wordVectors[:nWords, :]), axis=0)
Example #13
File: run.py  Project: 7n42Oaq1/cs224n
C = 5

# Reset the random seed to make sure that everyone gets the same results
random.seed(31415)
np.random.seed(9265)

startTime = time.time()
wordVectors = np.concatenate(
    ((np.random.rand(nWords, dimVectors) - 0.5) / dimVectors,
     np.zeros((nWords, dimVectors))),
    axis=0)
wordVectors = sgd(
    lambda vec: word2vec_sgd_wrapper(
        skipgram, tokens, vec, dataset, C, negSamplingLossAndGradient
    ),  # f is this lambda and x is wordVectors; the lambda calls word2vec_sgd_wrapper with vec bound to the current parameters
    wordVectors,
    0.3,
    40000,
    None,
    True,
    PRINT_EVERY=10)
# Note that normalization is not called here. This is not a bug,
# normalizing during training loses the notion of length.

print("sanity check: cost at convergence should be around or below 10")
print("training took %d seconds" % (time.time() - startTime))

# concatenate the input and output word vectors
wordVectors = np.concatenate(
    (wordVectors[:nWords, :], wordVectors[nWords:, :]), axis=0)

visualizeWords = [