# Imports assumed by the code below. WordEmbedding, GrCNNConfiger and
# GrCNNBagger are project classes; their import paths depend on the
# repository layout and are not spelled out here. floatX and logger are
# likewise set up here only to keep the fragment self-contained.
import csv
import logging
import sys
import time
import traceback

import numpy as np
from scipy.sparse import csc_matrix, lil_matrix
from theano import config

floatX = config.floatX
logger = logging.getLogger(__name__)

# Shuffle the MR dataset, keeping texts and labels aligned
random_index = np.arange(data_size)
np.random.shuffle(random_index)
mr_txt = list(np.asarray(mr_txt)[random_index])
mr_label = list(np.asarray(mr_label)[random_index])
end_time = time.time()
# Record timing
logger.info('Time used to load and shuffle MR dataset: %f seconds.' % (end_time - start_time))
# Load the wiki word-embeddings
embedding_filename = './wiki_embeddings.txt.zip'
word_embedding = WordEmbedding(embedding_filename)
embed_dim = word_embedding.embedding_dim()
start_time = time.time()
blank_index = word_embedding.word2index('</s>')
logger.info('Blank token at index {}: {}'.format(blank_index, word_embedding.index2word(blank_index)))
# Word-vector representation: zero-pad all the sentences to the maximum length
max_len = 52
mr_insts = np.zeros((data_size, max_len, embed_dim), dtype=np.float32)
mr_label = np.asarray(mr_label)[:, np.newaxis]
for i, sent in enumerate(mr_txt):
    words = [word.lower() for word in sent.split()]
    # Truncate to leave room for the two '</s>' boundary tokens
    l = min(len(words), max_len - 2)
    mr_insts[i, 1:l+1, :] = np.asarray([word_embedding.wordvec(word) for word in words[:l]])
    mr_insts[i, 0, :] = mr_insts[i, l+1, :] = word_embedding.wordvec('</s>')
end_time = time.time()
logger.info('Time used to build the instance matrix: %f seconds.' % (end_time - start_time))
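# Layout of each padded instance built above (a sketch, assuming the
# max_len = 52 used here): row 0 and row l+1 hold the '</s>' boundary
# vector, rows 1..l hold the vectors of the first
# l = min(len(words), max_len - 2) words, and rows l+2..max_len-1 remain
# zero padding.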
def testSentimentFineTune(self):
    '''
    Build a small model and use it on the sentiment analysis task,
    fine-tuning the word-embedding matrix.
    '''
    np.random.seed(1991)
    fname = './grCNN.conf'
    configer = GrCNNConfiger(fname)
    senti_train_filename = '../data/sentiment-train.txt'
    # senti_train_filename = '../data/sentiment-train-phrases.txt'
    senti_test_filename = '../data/sentiment-test.txt'
    senti_train_txt, senti_train_label = [], []
    senti_test_txt, senti_test_label = [], []
    start_time = time.time()
    # Read the training data set
    with open(senti_train_filename, 'r') as fin:
        reader = csv.reader(fin, delimiter='|')
        for txt, label in reader:
            senti_train_txt.append(txt)
            senti_train_label.append(int(label))
    # Read the test data set
    with open(senti_test_filename, 'r') as fin:
        reader = csv.reader(fin, delimiter='|')
        for txt, label in reader:
            senti_test_txt.append(txt)
            senti_test_label.append(int(label))
    end_time = time.time()
    logger.debug('Time used to load training and test data set: %f seconds.' % (end_time - start_time))
    # Load the wiki word-embeddings
    embedding_filename = '../data/wiki_embeddings.txt'
    word_embedding = WordEmbedding(embedding_filename)
    embed_dim = word_embedding.embedding_dim()
    start_time = time.time()
    blank_index = word_embedding.word2index('</s>')
    logger.debug('Blank token at index {}: {}'.format(blank_index, word_embedding.index2word(blank_index)))
    # Convert labels to NumPy arrays
    senti_train_label = np.asarray(senti_train_label, dtype=np.int32)
    senti_test_label = np.asarray(senti_test_label, dtype=np.int32)
    train_size = len(senti_train_txt)
    test_size = len(senti_test_txt)
    # Check sizes
    logger.debug('Training size: %d' % train_size)
    logger.debug('Test size: %d' % test_size)
    # Shuffle all the instances
    start_time = time.time()
    rindex = np.arange(train_size)
    tindex = np.arange(test_size)
    np.random.shuffle(rindex)
    np.random.shuffle(tindex)
    # Shuffle labels
    senti_train_label = senti_train_label[rindex]
    senti_test_label = senti_test_label[tindex]
    # Shuffle texts
    senti_train_txt = list(np.asarray(senti_train_txt)[rindex])
    senti_test_txt = list(np.asarray(senti_test_txt)[tindex])
    end_time = time.time()
    logger.debug('Time used to shuffle all the data: %f seconds.' % (end_time - start_time))
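    # The shuffle above keeps texts and labels aligned by indexing both
    # with the same permutation, e.g. (illustrative values only):
    #
    #   perm = np.random.permutation(3)      # say, array([2, 0, 1])
    #   np.asarray(['a', 'b', 'c'])[perm]    # -> array(['c', 'a', 'b'])
    #   np.asarray([0, 1, 1])[perm]          # -> array([1, 0, 1])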
    # Compute word-embedding representations
    start_time = time.time()
    senti_train_set = []
    senti_test_set = []
    # Record the vocabulary index of each word in each sentence, only once
    senti_train_word_index = []
    senti_test_word_index = []
    # Record the sparse input indicator matrix of each sentence, only once,
    # for fast computation
    senti_train_sparse_select = []
    senti_test_sparse_select = []
    # Embedding of the training set
    for i, sent in enumerate(senti_train_txt):
        words = [word.lower() for word in sent.split()]
        vectors = np.zeros((len(words)+2, embed_dim), dtype=np.float32)
        vectors[1:-1] = np.asarray([word_embedding.wordvec(word) for word in words])
        indices = [blank_index]
        indices += [word_embedding.word2index(word) for word in words]
        indices += [blank_index]
        # One-hot selection matrix: row t has a single 1 at the vocabulary
        # index of the t-th token, so sparse_select.dot(E) recovers the
        # sentence matrix from the embedding matrix E
        sparse_select = lil_matrix((len(words)+2, word_embedding.dict_size()), dtype=floatX)
        sparse_select[np.arange(len(words)+2), indices] = 1.0
        sparse_select = csc_matrix(sparse_select)
        senti_train_set.append(vectors)
        senti_train_word_index.append(indices)
        senti_train_sparse_select.append(sparse_select)
    # Embedding of the test set
    for i, sent in enumerate(senti_test_txt):
        words = [word.lower() for word in sent.split()]
        vectors = np.zeros((len(words)+2, embed_dim), dtype=np.float32)
        vectors[1:-1] = np.asarray([word_embedding.wordvec(word) for word in words])
        indices = [blank_index]
        indices += [word_embedding.word2index(word) for word in words]
        indices += [blank_index]
        sparse_select = lil_matrix((len(words)+2, word_embedding.dict_size()), dtype=floatX)
        sparse_select[np.arange(len(words)+2), indices] = 1.0
        sparse_select = csc_matrix(sparse_select)
        senti_test_set.append(vectors)
        senti_test_word_index.append(indices)
        senti_test_sparse_select.append(sparse_select)
    end_time = time.time()
    logger.debug('Time used to build initial matrices: %f seconds.' % (end_time - start_time))
    # Label distribution of both data sets
    p_count = np.sum(senti_train_label)
    logger.debug('Default positive percentage in Train: %f' % (float(p_count) / train_size))
    logger.debug('Default negative percentage in Train: %f' % (float(train_size - p_count) / train_size))
    p_count = np.sum(senti_test_label)
    logger.debug('Default positive percentage in Test: %f' % (float(p_count) / test_size))
    logger.debug('Default negative percentage in Test: %f' % (float(test_size - p_count) / test_size))
    # Now, build the model
    start_time = time.time()
    grbagger = GrCNNBagger(configer, verbose=True)
    end_time = time.time()
    logger.debug('Time used to build the model: %f seconds.' % (end_time - start_time))
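    # A sketch of the scheme implemented by the training loop below: for
    # each mini-batch the summed gradient g and the summed squared gradient
    # h are accumulated per parameter theta, and the update is the
    # AdaGrad-style step
    #
    #     theta <- theta - rate * (g / batch_size) / (fudge_factor + sqrt(h)),
    #
    # with g and h reset at every batch. After training_threshold_epoch
    # epochs the word-embedding matrix E is fine-tuned the same way: since
    # each input is S.dot(E) for a sparse selection matrix S, the gradient
    # with respect to E is S.T.dot(input_grad), which only touches the rows
    # of E for words that actually occur in the sentence.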
    learn_rate = 0.02
    epoch = 200
    batch_size = 20
    # Training with mini-batch stochastic gradient descent
    start_time = time.time()
    highest_train_accuracy, highest_test_accuracy = 0.0, 0.0
    track_training_acc, track_training_cost = [], []
    track_test_acc, track_test_cost = [], []
    training_threshold_epoch = 30
    try:
        # sample_size controls the diagnostic sampling at the end of each
        # epoch; it is disabled by default
        sample_size = 0
        fudge_factor = 1e-6
        # Accumulated gradient and squared-gradient history for the
        # word-embedding matrix
        accumu_embedding = np.zeros((word_embedding.dict_size(), configer.num_input), dtype=floatX)
        hist_embedding = np.zeros((word_embedding.dict_size(), configer.num_input), dtype=floatX)
        for i in range(epoch):
            costs = 0.0
            correct_count = 0
            logger.debug('=' * 50)
            # rate = learn_rate / (1+i)
            rate = learn_rate
            # Training
            num_batch = train_size // batch_size
            for k in range(num_batch):
                # Clear all the cached gradients
                accumu_grads = [np.zeros(param.get_value().shape, dtype=np.float32)
                                for param in grbagger.params]
                hist_grads = [np.zeros(param.get_value().shape, dtype=np.float32)
                              for param in grbagger.params]
                if i > training_threshold_epoch:
                    accumu_embedding[:] = 0.0
                    hist_embedding[:] = 0.0
                for j in range(k * batch_size, (k+1) * batch_size):
                    train_sent_rep = senti_train_sparse_select[j].dot(word_embedding.embedding)
                    results = grbagger.compute_gradient_and_cost(train_sent_rep, senti_train_label[j])
                    input_grad = grbagger.compute_input_gradient(train_sent_rep, senti_train_label[j])
                    grads, cost, pred = results[:-2], results[-2], results[-1]
                    if pred == senti_train_label[j]:
                        correct_count += 1
                    costs += cost
                    for accumu_grad, hist_grad, grad in zip(accumu_grads, hist_grads, grads):
                        accumu_grad += grad
                        hist_grad += np.square(grad)
                    # Accumulate gradients for the word-embedding matrix;
                    # fine-tuning only starts after training_threshold_epoch
                    if i > training_threshold_epoch:
                        tmp = senti_train_sparse_select[j].T.dot(input_grad)
                        accumu_embedding += tmp
                        hist_embedding += np.square(tmp)
                # Update model parameters
                for accumu_grad, hist_grad in zip(accumu_grads, hist_grads):
                    accumu_grad /= batch_size
                    accumu_grad /= fudge_factor + np.sqrt(hist_grad)
                # Update the word-embedding matrix
                if i > training_threshold_epoch:
                    accumu_embedding /= batch_size
                    accumu_embedding /= fudge_factor + np.sqrt(hist_embedding)
                    word_embedding._embedding -= rate * accumu_embedding
                grbagger.update_params(accumu_grads, rate)
            # Clear all the caches again for the last, incomplete batch
            accumu_grads = [np.zeros(param.get_value().shape, dtype=np.float32)
                            for param in grbagger.params]
            hist_grads = [np.zeros(param.get_value().shape, dtype=np.float32)
                          for param in grbagger.params]
            if i > training_threshold_epoch:
                accumu_embedding[:] = 0.0
                hist_embedding[:] = 0.0
            if num_batch * batch_size < train_size:
                for j in range(num_batch * batch_size, train_size):
                    train_sent_rep = senti_train_sparse_select[j].dot(word_embedding.embedding)
                    results = grbagger.compute_gradient_and_cost(train_sent_rep, senti_train_label[j])
                    input_grad = grbagger.compute_input_gradient(train_sent_rep, senti_train_label[j])
                    grads, cost, pred = results[:-2], results[-2], results[-1]
                    if pred == senti_train_label[j]:
                        correct_count += 1
                    costs += cost
                    for accumu_grad, hist_grad, grad in zip(accumu_grads, hist_grads, grads):
                        accumu_grad += grad
                        hist_grad += np.square(grad)
                    # Accumulate gradients for the word-embedding matrix
                    if i > training_threshold_epoch:
                        tmp = senti_train_sparse_select[j].T.dot(input_grad)
                        accumu_embedding += tmp
                        hist_embedding += np.square(tmp)
                # Normalize model-parameter gradients over the residual batch
                for accumu_grad, hist_grad in zip(accumu_grads, hist_grads):
                    accumu_grad /= train_size - num_batch * batch_size
                    accumu_grad /= fudge_factor + np.sqrt(hist_grad)
                # Normalize the word-embedding gradient over the residual batch
                if i > training_threshold_epoch:
                    accumu_embedding /= train_size - num_batch * batch_size
                    accumu_embedding /= fudge_factor + np.sqrt(hist_embedding)
                    word_embedding._embedding -= rate * accumu_embedding
                # Update all the parameters
                grbagger.update_params(accumu_grads, rate)
            train_accuracy = float(correct_count) / train_size
            logger.debug('Training epoch: %d, total cost: %f, accuracy = %f' % (i, costs, train_accuracy))
            # Track the training statistics
            track_training_cost.append(costs)
            track_training_acc.append(train_accuracy)
            if train_accuracy > highest_train_accuracy:
                highest_train_accuracy = train_accuracy
            # Testing
            correct_count = 0
            costs = 0.0
            for j in range(test_size):
                test_sent_rep = senti_test_sparse_select[j].dot(word_embedding.embedding)
                pred = grbagger.predict(test_sent_rep)
                cost = grbagger.show_cost(test_sent_rep, senti_test_label[j])
                if pred == senti_test_label[j]:
                    correct_count += 1
                costs += cost
            test_accuracy = float(correct_count) / test_size
            logger.debug('Test accuracy: %f' % test_accuracy)
            # Track the test statistics
            track_test_cost.append(costs)
            track_test_acc.append(test_accuracy)
            if test_accuracy > highest_test_accuracy:
                highest_test_accuracy = test_accuracy
            # Sample some training and test instances to show their weights,
            # scores and probabilities (skipped while sample_size == 0)
            logger.debug('Training Sampling: ')
            for j in range(sample_size):
                idx = np.random.randint(train_size)
                weights = grbagger.show_weights(senti_train_set[idx])
                scores = grbagger.show_scores(senti_train_set[idx])
                prob = grbagger.show_prob(senti_train_set[idx])
                label = senti_train_label[idx]
                logger.debug('Training idx: {}'.format(idx))
                logger.debug('Training scores: {}'.format(scores))
                logger.debug('Training weights: {}'.format(weights))
                logger.debug('Training probability: {}'.format(prob))
                logger.debug('Training label: {}'.format(label))
                logger.debug('-' * 50)
            logger.debug('Test Sampling: ')
            for j in range(sample_size):
                idx = np.random.randint(test_size)
                weights = grbagger.show_weights(senti_test_set[idx])
                scores = grbagger.show_scores(senti_test_set[idx])
                prob = grbagger.show_prob(senti_test_set[idx])
                label = senti_test_label[idx]
                logger.debug('Test idx: {}'.format(idx))
                logger.debug('Test scores: {}'.format(scores))
                logger.debug('Test weights: {}'.format(weights))
                logger.debug('Test probability: {}'.format(prob))
                logger.debug('Test label: {}'.format(label))
                logger.debug('-' * 50)
            # Check the L2-norms of the model parameters
            for param in grbagger.params:
                val = param.get_value(borrow=True)
                norm = np.sqrt(np.sum(np.square(val)))
                logger.debug('Parameter: {}, L2-norm: {}'.format(param.name, norm))
            wnorm = np.sqrt(np.sum(np.square(word_embedding._embedding)))
            logger.debug('Parameter: {}, L2-norm: {}'.format('Word-Embedding', wnorm))
    except Exception:
        logger.debug('Error appeared!')
        traceback.print_exc(file=sys.stdout)
        logger.debug('-' * 50)
    finally:
        end_time = time.time()
        logger.debug('Time used for training: %f seconds.' % (end_time - start_time))
        logger.debug('Highest training accuracy: %f' % highest_train_accuracy)
        logger.debug('Highest test accuracy: %f' % highest_test_accuracy)
        GrCNNBagger.save('fine-grbagger.model', grbagger)
        # Save all the tracked statistics; np.save is called four times on
        # the same handle, so the arrays are stored back to back
        track_training_acc = np.asarray(track_training_acc)
        track_training_cost = np.asarray(track_training_cost)
        track_test_acc = np.asarray(track_test_acc)
        track_test_cost = np.asarray(track_test_cost)
        with open('fine-senti-records.npy', 'wb') as fout:
            np.save(fout, track_training_acc)
            np.save(fout, track_training_cost)
            np.save(fout, track_test_acc)
            np.save(fout, track_test_cost)
        logger.debug('Training and test records saved to fine-senti-records.npy...')
        logger.debug('Finished...')
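# A minimal sketch of reading the saved records back: the four arrays must
# be loaded in the same order they were saved, from a single file handle.
#
#   with open('fine-senti-records.npy', 'rb') as fin:
#       track_training_acc = np.load(fin)
#       track_training_cost = np.load(fin)
#       track_test_acc = np.load(fin)
#       track_test_cost = np.load(fin)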