Python WordEmbedding.word2index примеры использования

Язык программирования: Python

Пространство имен/Пакет: wordvec

Класс/Тип: WordEmbedding

Метод/Функция: word2index

Примеров на hotexamples.com: 4

Python WordEmbedding.word2index — найдено 4 примера. Это лучшие примеры кода на Python для wordvec.WordEmbedding.word2index, взятые из проектов с открытым исходным кодом. Вы можете оценить каждый пример, чтобы помочь нам улучшить качество подборки.

Основные методы

Показать Скрыть

wordvec(15)

embedding_dim(14)

WordEmbedding(11)

word2index(4)

dict_size(3)

index2word(2)

save(1)

Пример #1

Показать файл

Файл: rnn_sentiment.py Проект: alphafan/NLP-DeepLearning-Experiments

logger.info('Size of the data sets: %d' % data_size)
# Shuffle texts and labels with one shared permutation so each text keeps
# its own label.
random_index = np.arange(data_size)
np.random.shuffle(random_index)
mr_txt = list(np.asarray(mr_txt)[random_index])
mr_label = list(np.asarray(mr_label)[random_index])
end_time = time.time()
# Record timing
logger.info('Time used to load and shuffle MR dataset: %f seconds.' %
            (end_time - start_time))
# Load word-embedding
embedding_filename = './wiki_embeddings.txt.zip'
word_embedding = WordEmbedding(embedding_filename)
embed_dim = word_embedding.embedding_dim()
start_time = time.time()
blank_index = word_embedding.word2index('</s>')
logger.info('Blank index: {}'.format(word_embedding.index2word(blank_index)))
# Word-vector representation: one (max_len, embed_dim) matrix per sentence,
# zero-padded up to max_len rows.
max_len = 52
mr_insts = np.zeros((data_size, max_len, embed_dim), dtype=np.float32)
# Turn the label list into a column vector of shape (data_size, 1).
mr_label = np.asarray(mr_label)[:, np.newaxis]
for i, sent in enumerate(mr_txt):
    words = [word.lower() for word in sent.split()]
    # Keep at most max_len - 2 words: two rows are reserved for the '</s>'
    # markers placed before and after the sentence.
    l = min(len(words), max_len - 2)
    # An empty word list would become a shape-(0,) array, which numpy refuses
    # to broadcast into the (0, embed_dim) slice, so skip the assignment.
    if l:
        mr_insts[i, 1:l + 1, :] = np.asarray(
            [word_embedding.wordvec(word) for word in words[:l]])
    mr_insts[i, 0, :] = mr_insts[i, l + 1, :] = word_embedding.wordvec("</s>")
end_time = time.time()

Пример #2

Показать файл

 def testSentimentFineTune(self):
     '''
     Train a small GrCNNBagger on the sentiment analysis task while fine-tuning
     the word-embedding matrix.

     Reads the '|'-delimited training and test files, shuffles both sets,
     encodes every sentence and then runs 200 epochs of mini-batch training
     (batch size 20). Once the epoch index exceeds training_threshold_epoch,
     the gradient with respect to the input is also applied to the embedding
     matrix. Whether or not training fails, the model is saved to
     'fine-grbagger.model' and the per-epoch cost/accuracy records to
     'fine-senti-records.npy'.
     '''
     np.random.seed(1991)
     fname = './grCNN.conf'
     configer = GrCNNConfiger(fname)
     senti_train_filename = '../data/sentiment-train.txt'
     # senti_train_filename = '../data/sentiment-train-phrases.txt'
     senti_test_filename = '../data/sentiment-test.txt'
     senti_train_txt, senti_train_label = [], []
     senti_test_txt, senti_test_label = [], []
     start_time = time.time()
     # Read training data set
     with open(senti_train_filename, 'r') as fin:
         reader = csv.reader(fin, delimiter='|')
         for txt, label in reader:
             senti_train_txt.append(txt)
             senti_train_label.append(int(label))
     # Read test data set
     with open(senti_test_filename, 'r') as fin:
         reader = csv.reader(fin, delimiter='|')
         for txt, label in reader:
             senti_test_txt.append(txt)
             senti_test_label.append(int(label))
     end_time = time.time()
     logger.debug('Time used to load training and test data set: %f seconds.' % (end_time-start_time))
     embedding_filename = '../data/wiki_embeddings.txt'
     # Load wiki-embeddings
     word_embedding = WordEmbedding(embedding_filename)
     embed_dim = word_embedding.embedding_dim()
     # Index of the '</s>' token, used as sentence start/end marker below.
     blank_index = word_embedding.word2index('</s>')
     logger.debug('Blank index: {}'.format(word_embedding.index2word(blank_index)))
     # Labels as int32 arrays
     senti_train_label = np.asarray(senti_train_label, dtype=np.int32)
     senti_test_label = np.asarray(senti_test_label, dtype=np.int32)
     train_size = len(senti_train_txt)
     test_size = len(senti_test_txt)
     # Check size
     logger.debug('Training size: %d' % train_size)
     logger.debug('Test size: %d' % test_size)
     # Shuffling for all the instances
     start_time = time.time()
     rindex = np.arange(train_size)
     tindex = np.arange(test_size)
     np.random.shuffle(rindex)
     np.random.shuffle(tindex)
     # Shuffle label
     senti_train_label = senti_train_label[rindex]
     senti_test_label = senti_test_label[tindex]
     # Shuffle text with the same permutations so texts and labels stay aligned
     senti_train_txt = list(np.asarray(senti_train_txt)[rindex])
     senti_test_txt = list(np.asarray(senti_test_txt)[tindex])
     end_time = time.time()
     logger.debug('Time used to shuffle all the data: %f seconds.' % (end_time-start_time))

     def encode_sentences(sentences):
         '''
         Encode each sentence once and return three parallel lists:
         dense word-vector matrices, word-index lists, and sparse
         (len(words)+2, dict_size) indicator matrices in CSC format.
         '''
         dense_set, index_set, select_set = [], [], []
         dict_size = word_embedding.dict_size()
         for sent in sentences:
             words = [word.lower() for word in sent.split()]
             num_rows = len(words) + 2
             # First and last rows of the dense matrix are left as zeros.
             vectors = np.zeros((num_rows, embed_dim), dtype=np.float32)
             # An empty word list would become a shape-(0,) array that cannot
             # be broadcast into the (0, embed_dim) slice, so skip it.
             if words:
                 vectors[1:-1] = np.asarray([word_embedding.wordvec(word) for word in words])
             # Word indices, wrapped in the '</s>' index on both sides.
             indices = [blank_index]
             indices += [word_embedding.word2index(word) for word in words]
             indices += [blank_index]
             # Row r has a single 1.0 in column indices[r], so multiplying by
             # the embedding matrix selects the rows for this sentence.
             sparse_select = lil_matrix((num_rows, dict_size), dtype=floatX)
             sparse_select[range(num_rows), indices] = 1.0
             dense_set.append(vectors)
             index_set.append(indices)
             select_set.append(csc_matrix(sparse_select))
         return dense_set, index_set, select_set

     # Restart the timer so the message below measures only the encoding step.
     start_time = time.time()
     # Embedding for training set
     senti_train_set, senti_train_word_index, senti_train_sparse_select = encode_sentences(senti_train_txt)
     # Embedding for test set
     senti_test_set, senti_test_word_index, senti_test_sparse_select = encode_sentences(senti_test_txt)
     end_time = time.time()
     logger.debug('Time used to build initial matrices: %f seconds.' % (end_time-start_time))
     # The sum of the labels is used as the positive count, i.e. labels are
     # treated as 0/1 here.
     p_count = np.sum(senti_train_label)
     logger.debug('Default positive percentage in Train: %f' % (float(p_count) / train_size))
     logger.debug('Default negative percentage in Train: %f' % (float(train_size-p_count) / train_size))
     p_count = np.sum(senti_test_label)
     logger.debug('Default positive percentage in Test: %f' % (float(p_count) / test_size))
     logger.debug('Default negative percentage in Test: %f' % (float(test_size-p_count) / test_size))
     # Now, start training
     start_time = time.time()
     grbagger = GrCNNBagger(configer, verbose=True)
     end_time = time.time()
     logger.debug('Time used to build the model: %f seconds.' % (end_time-start_time))
     learn_rate = 0.02
     # Training using stochastic gradient descent algorithm
     epoch = 200
     batch_size = 20
     start_time = time.time()
     highest_train_accuracy, highest_test_accuracy = 0.0, 0.0
     track_training_acc, track_training_cost = [], []
     track_test_acc, track_test_cost = [], []
     # The embedding matrix is only updated when the epoch index exceeds this.
     training_threshold_epoch = 30
     try:
         # Number of random instances to inspect per epoch; 0 disables the
         # sampling loops further down.
         sample_size = 0
         # Small constant keeping the gradient normalisation away from a
         # division by zero.
         fuedge_factor = 1e-6
         # accumu matrix for word-embedding matrix
         # hist matrix for word-embedding matrix
         accumu_embedding = np.zeros((word_embedding.dict_size(), configer.num_input), dtype=floatX)
         hist_embedding = np.zeros((word_embedding.dict_size(), configer.num_input), dtype=floatX)
         for i in xrange(epoch):
             costs = 0.0
             correct_count = 0
             logger.debug('=' * 50)
             # rate = learn_rate / (1+i)
             rate = learn_rate
             # Training
             # Floor division keeps num_batch an integer for xrange.
             num_batch = train_size // batch_size
             for k in xrange(num_batch):
                 # Clear all the cache
                 accumu_grads = [np.zeros(param.get_value().shape, dtype=np.float32) for param in grbagger.params]
                 hist_grads = [np.zeros(param.get_value().shape, dtype=np.float32) for param in grbagger.params]
                 if i > training_threshold_epoch:
                     accumu_embedding[:] = 0.0
                     hist_embedding[:] = 0.0
                 for j in xrange(k*batch_size, (k+1)*batch_size):
                     # Select this sentence's rows from the current embedding.
                     train_sent_rep = senti_train_sparse_select[j].dot(word_embedding.embedding)
                     results = grbagger.compute_gradient_and_cost(train_sent_rep, senti_train_label[j])
                     input_grad = grbagger.compute_input_gradient(train_sent_rep, senti_train_label[j])
                     # results = parameter gradients, then cost, then prediction
                     grads, cost, pred = results[:-2], results[-2], results[-1]
                     if pred == senti_train_label[j]: correct_count += 1
                     costs += cost
                     # Accumulate the gradient sum and the sum of squared gradients.
                     for accumu_grad, hist_grad, grad in zip(accumu_grads, hist_grads, grads):
                         accumu_grad += grad
                         hist_grad += np.square(grad)
                     ## Update the word-embedding matrix
                     if i > training_threshold_epoch:
                         # Scatter the input gradient back to dictionary rows.
                         tmp = senti_train_sparse_select[j].T.dot(input_grad)
                         accumu_embedding += tmp
                         hist_embedding += np.square(tmp)
                 # Updating model parameters: mean gradient over the batch,
                 # scaled by the root of the summed squared gradients.
                 for accumu_grad, hist_grad in zip(accumu_grads, hist_grads):
                     accumu_grad /= batch_size
                     accumu_grad /= fuedge_factor + np.sqrt(hist_grad)
                 # Updating word-embedding matrix
                 if i > training_threshold_epoch:
                     accumu_embedding /= batch_size
                     accumu_embedding /= fuedge_factor + np.sqrt(hist_embedding)
                     word_embedding._embedding -= rate * accumu_embedding
                 grbagger.update_params(accumu_grads, rate)
             # Clear all the cache again
             accumu_grads = [np.zeros(param.get_value().shape, dtype=np.float32) for param in grbagger.params]
             hist_grads = [np.zeros(param.get_value().shape, dtype=np.float32) for param in grbagger.params]
             if i > training_threshold_epoch:
                 accumu_embedding[:] = 0.0
                 hist_embedding[:] = 0.0
             # Instances left over after the last full batch form one final,
             # smaller batch.
             if num_batch * batch_size < train_size:
                 for j in xrange(num_batch*batch_size, train_size):
                     train_sent_rep = senti_train_sparse_select[j].dot(word_embedding.embedding)
                     results = grbagger.compute_gradient_and_cost(train_sent_rep, senti_train_label[j])
                     input_grad = grbagger.compute_input_gradient(train_sent_rep, senti_train_label[j])
                     grads, cost, pred = results[:-2], results[-2], results[-1]
                     if pred == senti_train_label[j]: correct_count += 1
                     costs += cost
                     for accumu_grad, hist_grad, grad in zip(accumu_grads, hist_grads, grads):
                         accumu_grad += grad
                         hist_grad += np.square(grad)
                     ## Update the word-embedding matrix
                     if i > training_threshold_epoch:
                         tmp = senti_train_sparse_select[j].T.dot(input_grad)
                         accumu_embedding += tmp
                         hist_embedding += np.square(tmp)
                 # Normalizing model parameters by the size of the short batch
                 for accumu_grad, hist_grad in zip(accumu_grads, hist_grads):
                     accumu_grad /= train_size-num_batch*batch_size
                     accumu_grad /= fuedge_factor + np.sqrt(hist_grad)
                 # Normalizing word-embedding matrix
                 if i > training_threshold_epoch:
                     accumu_embedding /= train_size-num_batch*batch_size
                     accumu_embedding /= fuedge_factor + np.sqrt(hist_embedding)
                     word_embedding._embedding -= rate * accumu_embedding
                 # Updating all the parameters
                 grbagger.update_params(accumu_grads, rate)
             train_accuracy = float(correct_count) / train_size
             logger.debug('Training epoch: %d, total cost: %f, accuracy = %f'
                         % (i, costs, train_accuracy))
             # Append all the numbers
             track_training_cost.append(costs)
             track_training_acc.append(train_accuracy)
             if train_accuracy > highest_train_accuracy: highest_train_accuracy = train_accuracy
             # Testing
             correct_count = 0
             costs = 0.0
             for j in xrange(test_size):
                 test_sent_rep = senti_test_sparse_select[j].dot(word_embedding.embedding)
                 pred = grbagger.predict(test_sent_rep)
                 cost = grbagger.show_cost(test_sent_rep, senti_test_label[j])
                 if pred == senti_test_label[j]: correct_count += 1
                 costs += cost
             test_accuracy = float(correct_count) / test_size
             logger.debug('Test accuracy: %f' % test_accuracy)
             # Append all the numbers
             track_test_cost.append(costs)
             track_test_acc.append(test_accuracy)
             if test_accuracy > highest_test_accuracy: highest_test_accuracy = test_accuracy
             # Sampling to show the weights and experts of training and test instances
             logger.debug('Training Sampling: ')
             for j in xrange(sample_size):
                 idx = np.random.randint(train_size)
                 weights = grbagger.show_weights(senti_train_set[idx])
                 scores = grbagger.show_scores(senti_train_set[idx])
                 prob = grbagger.show_prob(senti_train_set[idx])
                 label = senti_train_label[idx]
                 logger.debug('Training idx: {}'.format(idx))
                 logger.debug('Training scores: {}'.format(scores))
                 logger.debug('Training weights: {}'.format(weights))
                 logger.debug('Training probability: {}'.format(prob))
                 logger.debug('Training label: {}'.format(label))
                 logger.debug('-' * 50)
             logger.debug('Test Sampling: ')
             for j in xrange(sample_size):
                 idx = np.random.randint(test_size)
                 weights = grbagger.show_weights(senti_test_set[idx])
                 scores = grbagger.show_scores(senti_test_set[idx])
                 prob = grbagger.show_prob(senti_test_set[idx])
                 label = senti_test_label[idx]
                 logger.debug('Test idx: {}'.format(idx))
                 logger.debug('Test scores: {}'.format(scores))
                 logger.debug('Test weights: {}'.format(weights))
                 logger.debug('Test probability: {}'.format(prob))
                 logger.debug('Test label: {}'.format(label))
                 logger.debug('-' * 50)
             # Check norms of the model parameter
             for param in grbagger.params:
                 val = param.get_value(borrow=True)
                 norm = np.sqrt(np.sum(np.square(val)))
                 logger.debug('Parameter: {}, L2-norm: {}'.format(param.name, norm))
             wnorm = np.sqrt(np.sum(np.square(word_embedding._embedding)))
             logger.debug('Parameter: {}, L2-norm: {}'.format('Word-Embedding', wnorm))
     # NOTE(review): the bare except also traps KeyboardInterrupt/SystemExit,
     # so an interrupted run is logged here instead of being re-raised --
     # confirm this is intended before narrowing it.
     except:
         logger.debug('Error appeared!')
         traceback.print_exc(file=sys.stdout)
         logger.debug('-' * 50)
     finally:
         end_time = time.time()
         logger.debug('Time used for training: %f seconds.' % (end_time-start_time))
         logger.debug('Highest training accuracy: %f' % highest_train_accuracy)
         logger.debug('Highest test accuracy: %f' % highest_test_accuracy)
         GrCNNBagger.save('fine-grbagger.model', grbagger)
         # Save all the tracking numbers
         track_training_acc = np.asarray(track_training_acc)
         track_training_cost = np.asarray(track_training_cost)
         track_test_acc = np.asarray(track_test_acc)
         track_test_cost = np.asarray(track_test_cost)
         # np.save emits binary data, so the handle must be opened in binary
         # mode. The four arrays are written back to back into one file.
         with open('fine-senti-records.npy', 'wb') as fout:
             np.save(fout, track_training_acc)
             np.save(fout, track_training_cost)
             np.save(fout, track_test_acc)
             np.save(fout, track_test_cost)
         logger.debug('Training and test records saved to fine-senti-records.npy...')
         logger.debug('Finished...')

Пример #3

Показать файл

Файл: movie_sentiment.py Проект: appscluster/sentiment-CNN

        senti_train_label.append(int(label))
# Read test data set: one '|'-delimited "text|label" record per row.
with open(senti_test_filename, 'r') as fin:
    reader = csv.reader(fin, delimiter='|')
    for txt, label in reader:
        senti_test_txt.append(txt)
        senti_test_label.append(int(label))
end_time = time.time()
logger.debug('Time used to load training and test data set: %f seconds.' % (end_time-start_time))
# Load word-embedding
embedding_filename = '../data/wiki_embeddings.txt'
word_embedding = WordEmbedding(embedding_filename)
embed_dim = word_embedding.embedding_dim()
# Index of the '</s>' token; logged back as a word as a sanity check.
blank_index = word_embedding.word2index('</s>')
logger.debug('Blank index: {}'.format(word_embedding.index2word(blank_index)))
# Labels as int32 arrays
senti_train_label = np.asarray(senti_train_label, dtype=np.int32)
senti_test_label = np.asarray(senti_test_label, dtype=np.int32)
train_size = len(senti_train_txt)
test_size = len(senti_test_txt)
# Check size
logger.debug('Training size: %d' % train_size)
logger.debug('Test size: %d' % test_size)
# Shuffling for all the instances: draw one random permutation per data set.
start_time = time.time()
rindex = np.arange(train_size)
tindex = np.arange(test_size)
np.random.shuffle(rindex)
np.random.shuffle(tindex)

Пример #4

Показать файл

	def setUp(self):
		'''
		Load and preprocess the sentiment analysis training and test data.

		Reads the '|'-delimited "text|label" files, converts every sentence
		into an int32 array of word indices wrapped in the '</s>' index, and
		into a (len(words)+2, embedding_dim) matrix of word vectors whose
		first and last rows are left as zeros. Texts, labels, index arrays,
		vector matrices, data set sizes and the WordEmbedding instance are
		stored on self for the tests.
		'''
		np.random.seed(42)
		senti_train_filename = '../data/sentiment-train.txt'
		senti_test_filename = '../data/sentiment-test.txt'
		start_time = time.time()
		# Load Word-Embedding
		embedding_filename = '../data/wiki_embeddings.txt'
		word_embedding = WordEmbedding(embedding_filename)
		embed_dim = word_embedding.embedding_dim()
		# Starting and Ending token for each sentence
		self.blank_token = word_embedding.wordvec('</s>')
		self.blank_index = word_embedding.word2index('</s>')
		blank_index = self.blank_index

		def read_split(filename):
			'''Return (texts, labels, word-index arrays) read from filename.'''
			texts, labels, words_label = [], [], []
			with open(filename, 'r') as fin:
				reader = csv.reader(fin, delimiter='|')
				for txt, label in reader:
					texts.append(txt)
					labels.append(int(label))
					words = [word.lower() for word in txt.split()]
					# Word ids for fine-tuning, with the blank index at both ends.
					tmp_indices = np.zeros(len(words)+2, dtype=np.int32)
					tmp_indices[0] = blank_index
					tmp_indices[1:-1] = np.asarray([word_embedding.word2index(word) for word in words])
					tmp_indices[-1] = blank_index
					words_label.append(tmp_indices)
			return texts, labels, words_label

		def embed_sentences(sentences):
			'''Return (word-vector matrices, row counts) for the sentences.'''
			vector_set, lengths = [], []
			for sent in sentences:
				words = [word.lower() for word in sent.split()]
				vectors = np.zeros((len(words)+2, embed_dim), dtype=floatX)
				# An empty word list would become a shape-(0,) array that
				# cannot be broadcast into the (0, embed_dim) slice.
				if words:
					vectors[1:-1, :] = np.asarray([word_embedding.wordvec(word) for word in words])
				lengths.append(len(words)+2)
				vector_set.append(vectors)
			return vector_set, lengths

		# Read training data set
		senti_train_txt, senti_train_label, senti_train_words_label = read_split(senti_train_filename)
		# Read test data set
		senti_test_txt, senti_test_label, senti_test_words_label = read_split(senti_test_filename)
		end_time = time.time()
		logger.debug('Time used to load training and test data set: %f seconds.' % (end_time-start_time))
		start_time = time.time()
		# Store original word index representation
		self.senti_train_words_label = senti_train_words_label
		self.senti_test_words_label = senti_test_words_label
		# Store original text representation
		self.senti_train_txt = senti_train_txt
		self.senti_test_txt = senti_test_txt
		# Labels as int32 arrays
		self.senti_train_label = np.asarray(senti_train_label, dtype=np.int32)
		self.senti_test_label = np.asarray(senti_test_label, dtype=np.int32)
		train_size = len(senti_train_txt)
		test_size = len(senti_test_txt)
		# Check size
		assert train_size == self.senti_train_label.shape[0]
		assert test_size == self.senti_test_label.shape[0]
		logger.debug('Training size: %d' % train_size)
		logger.debug('Test size: %d' % test_size)
		# Sequential modeling for each sentence
		# Embedding for training set
		self.senti_train_set, senti_train_len = embed_sentences(senti_train_txt)
		# Embedding for test set
		self.senti_test_set, senti_test_len = embed_sentences(senti_test_txt)
		assert senti_train_len == [seq.shape[0] for seq in self.senti_train_set]
		assert senti_test_len == [seq.shape[0] for seq in self.senti_test_set]
		end_time = time.time()
		logger.debug('Time used to build initial training and test matrix: %f seconds.' % (end_time-start_time))
		# Store data
		self.train_size = train_size
		self.test_size = test_size
		self.word_embedding = word_embedding