# Shuffle the MR data set and turn every sentence into a fixed-size,
# zero-padded matrix of word vectors.
logger.info('Size of the data sets: %d' % data_size)
# One random permutation is applied to both the texts and the labels so that
# each sentence stays aligned with its label.
random_index = np.arange(data_size)
np.random.shuffle(random_index)
mr_txt = list(np.asarray(mr_txt)[random_index])
mr_label = list(np.asarray(mr_label)[random_index])
end_time = time.time()
# Record timing
logger.info('Time used to load and shuffle MR dataset: %f seconds.' %
            (end_time - start_time))
# Load word-embedding
embedding_filename = './wiki_embeddings.txt.zip'
# Load training/test data sets and wiki-embeddings.
word_embedding = WordEmbedding(embedding_filename)
embed_dim = word_embedding.embedding_dim()
start_time = time.time()
# '</s>' is the token placed before and after every sentence below.
blank_index = word_embedding.word2index('</s>')
logger.info('Blank index: {}'.format(word_embedding.index2word(blank_index)))
# Word-vector representation, zero-padding all the sentences to the maximum length.
max_len = 52
mr_insts = np.zeros((data_size, max_len, word_embedding.embedding_dim()),
                    dtype=np.float32)
# Append a trailing axis so the labels form a column: one row per instance.
mr_label = np.asarray(mr_label)[:, np.newaxis]
for i, sent in enumerate(mr_txt):
    words = sent.split()
    words = [word.lower() for word in words]
    # Number of words kept: two of the max_len rows are reserved for the
    # '</s>' markers, so any words beyond max_len - 2 are dropped.
    l = min(len(words), max_len - 2)
    # vectors = np.zeros((len(words)+2, embed_dim), dtype=np.float32)
    # Rows 1..l hold the word vectors of the kept words.
    mr_insts[i, 1:l + 1, :] = np.asarray(
        [word_embedding.wordvec(word) for word in words[:l]])
    # Row 0 and row l + 1 hold the '</s>' vector; all later rows stay zero.
    mr_insts[i, 0, :] = mr_insts[i, l + 1, :] = word_embedding.wordvec("</s>")
end_time = time.time()
Пример #2
0
 def testSentimentFineTune(self):
     '''
     Train a small GrCNNBagger on the sentiment analysis task while
     fine-tuning the word-embedding matrix.

     Both data sets are read from '|'-delimited text files and shuffled.
     Every sentence is stored as a sparse one-hot selector matrix, so its
     representation is recomputed from the current embedding matrix each
     time it is used. Model parameters are updated once per batch with the
     averaged gradient divided by the root of the accumulated squared
     gradients. Once the epoch index exceeds training_threshold_epoch the
     embedding matrix is updated the same way.

     The model and the per-epoch cost/accuracy tracks are written to disk
     when training stops, whether it finished, failed or was interrupted.
     '''
     np.random.seed(1991)
     fname = './grCNN.conf'
     configer = GrCNNConfiger(fname)
     senti_train_filename = '../data/sentiment-train.txt'
     # senti_train_filename = '../data/sentiment-train-phrases.txt'
     senti_test_filename = '../data/sentiment-test.txt'

     def load_dataset(filename):
         # Read one '|'-delimited file into parallel lists of texts and
         # integer labels.
         texts, labels = [], []
         with open(filename, 'r') as fin:
             for txt, label in csv.reader(fin, delimiter='|'):
                 texts.append(txt)
                 labels.append(int(label))
         return texts, labels

     start_time = time.time()
     senti_train_txt, senti_train_label = load_dataset(senti_train_filename)
     senti_test_txt, senti_test_label = load_dataset(senti_test_filename)
     end_time = time.time()
     logger.debug('Time used to load training and test data set: %f seconds.' % (end_time-start_time))
     # Load wiki-embeddings
     embedding_filename = '../data/wiki_embeddings.txt'
     word_embedding = WordEmbedding(embedding_filename)
     embed_dim = word_embedding.embedding_dim()
     dict_size = word_embedding.dict_size()
     # '</s>' marks the beginning and the end of every sentence
     blank_index = word_embedding.word2index('</s>')
     logger.debug('Blank index: {}'.format(word_embedding.index2word(blank_index)))
     senti_train_label = np.asarray(senti_train_label, dtype=np.int32)
     senti_test_label = np.asarray(senti_test_label, dtype=np.int32)
     train_size = len(senti_train_txt)
     test_size = len(senti_test_txt)
     # Check size
     logger.debug('Training size: %d' % train_size)
     logger.debug('Test size: %d' % test_size)
     # Shuffling for all the instances; labels and texts share one
     # permutation per data set so they stay aligned.
     start_time = time.time()
     rindex = np.arange(train_size)
     tindex = np.arange(test_size)
     np.random.shuffle(rindex)
     np.random.shuffle(tindex)
     senti_train_label = senti_train_label[rindex]
     senti_test_label = senti_test_label[tindex]
     senti_train_txt = list(np.asarray(senti_train_txt)[rindex])
     senti_test_txt = list(np.asarray(senti_test_txt)[tindex])
     end_time = time.time()
     logger.debug('Time used to shuffle all the data: %f seconds.' % (end_time-start_time))

     def build_representations(sentences):
         # For every sentence build the dense word-vector matrix and the
         # sparse one-hot selector matrix. Both have one extra row at each
         # end: the selector picks '</s>' there, the dense rows stay zero.
         dense_set, sparse_set = [], []
         for sent in sentences:
             words = [word.lower() for word in sent.split()]
             num_rows = len(words) + 2
             vectors = np.zeros((num_rows, embed_dim), dtype=np.float32)
             if words:
                 # Guarded: an empty list cannot be broadcast into the
                 # (0, embed_dim) slice of an empty sentence.
                 vectors[1:-1] = np.asarray([word_embedding.wordvec(word) for word in words])
             indices = [blank_index]
             indices += [word_embedding.word2index(word) for word in words]
             indices += [blank_index]
             # Built once per sentence so the representation can later be
             # obtained with a single sparse-dense product.
             sparse_select = lil_matrix((num_rows, dict_size), dtype=floatX)
             sparse_select[range(num_rows), indices] = 1.0
             dense_set.append(vectors)
             sparse_set.append(csc_matrix(sparse_select))
         return dense_set, sparse_set

     def log_samples(prefix, dataset, labels, count):
         # Log scores, weights, probability and label of `count` randomly
         # drawn instances of `dataset`.
         logger.debug('{} Sampling: '.format(prefix))
         for _ in xrange(count):
             idx = np.random.randint(len(dataset))
             weights = grbagger.show_weights(dataset[idx])
             scores = grbagger.show_scores(dataset[idx])
             prob = grbagger.show_prob(dataset[idx])
             logger.debug('{} idx: {}'.format(prefix, idx))
             logger.debug('{} scores: {}'.format(prefix, scores))
             logger.debug('{} weights: {}'.format(prefix, weights))
             logger.debug('{} probability: {}'.format(prefix, prob))
             logger.debug('{} label: {}'.format(prefix, labels[idx]))
             logger.debug('-' * 50)

     # Restart the timer so the figure below covers only the matrix
     # construction and not the shuffling measured above.
     start_time = time.time()
     senti_train_set, senti_train_sparse_select = build_representations(senti_train_txt)
     senti_test_set, senti_test_sparse_select = build_representations(senti_test_txt)
     end_time = time.time()
     logger.debug('Time used to build initial matrices: %f seconds.' % (end_time-start_time))
     p_count = np.sum(senti_train_label)
     logger.debug('Default positive percentage in Train: %f' % (float(p_count) / train_size))
     logger.debug('Default negative percentage in Train: %f' % (float(train_size-p_count) / train_size))
     p_count = np.sum(senti_test_label)
     logger.debug('Default positive percentage in Test: %f' % (float(p_count) / test_size))
     logger.debug('Default negative percentage in Test: %f' % (float(test_size-p_count) / test_size))
     # Now, start training
     start_time = time.time()
     grbagger = GrCNNBagger(configer, verbose=True)
     end_time = time.time()
     logger.debug('Time used to build the model: %f seconds.' % (end_time-start_time))
     learn_rate = 0.02
     # Training using stochastic gradient descent algorithm
     epoch = 200
     batch_size = 20
     # Number of random instances logged per epoch; 0 disables the sampling.
     sample_size = 0
     # Keeps the division below finite where no gradient has been accumulated.
     fudge_factor = 1e-6
     # The embedding matrix is only updated after this epoch index.
     training_threshold_epoch = 30
     start_time = time.time()
     highest_train_accuracy, highest_test_accuracy = 0.0, 0.0
     track_training_acc, track_training_cost = [], []
     track_test_acc, track_test_cost = [], []
     try:
         # Gradient sum and squared-gradient sum for the word-embedding
         # matrix; allocated once and zeroed at the start of every batch.
         accumu_embedding = np.zeros((dict_size, configer.num_input), dtype=floatX)
         hist_embedding = np.zeros((dict_size, configer.num_input), dtype=floatX)
         for i in xrange(epoch):
             costs = 0.0
             correct_count = 0
             logger.debug('=' * 50)
             # rate = learn_rate / (1+i)
             rate = learn_rate
             tune_embedding = i > training_threshold_epoch
             # Training. Stepping by batch_size also covers the trailing
             # partial batch, which is normalized by its own size.
             for batch_start in xrange(0, train_size, batch_size):
                 batch_stop = min(batch_start + batch_size, train_size)
                 num_in_batch = batch_stop - batch_start
                 # Clear all the cache
                 accumu_grads = [np.zeros(param.get_value().shape, dtype=np.float32) for param in grbagger.params]
                 hist_grads = [np.zeros(param.get_value().shape, dtype=np.float32) for param in grbagger.params]
                 if tune_embedding:
                     accumu_embedding[:] = 0.0
                     hist_embedding[:] = 0.0
                 for j in xrange(batch_start, batch_stop):
                     train_sent_rep = senti_train_sparse_select[j].dot(word_embedding.embedding)
                     results = grbagger.compute_gradient_and_cost(train_sent_rep, senti_train_label[j])
                     # NOTE(review): the input gradient is only consumed once
                     # the embedding is being tuned; the call could be skipped
                     # before that if it has no side effects - confirm.
                     input_grad = grbagger.compute_input_gradient(train_sent_rep, senti_train_label[j])
                     grads, cost, pred = results[:-2], results[-2], results[-1]
                     if pred == senti_train_label[j]:
                         correct_count += 1
                     costs += cost
                     for accumu_grad, hist_grad, grad in zip(accumu_grads, hist_grads, grads):
                         accumu_grad += grad
                         hist_grad += np.square(grad)
                     if tune_embedding:
                         # Map the gradient w.r.t. the sentence rows back onto
                         # the rows of the embedding matrix.
                         tmp = senti_train_sparse_select[j].T.dot(input_grad)
                         accumu_embedding += tmp
                         hist_embedding += np.square(tmp)
                 # Normalizing model gradients
                 for accumu_grad, hist_grad in zip(accumu_grads, hist_grads):
                     accumu_grad /= num_in_batch
                     accumu_grad /= fudge_factor + np.sqrt(hist_grad)
                 # Updating word-embedding matrix
                 if tune_embedding:
                     accumu_embedding /= num_in_batch
                     accumu_embedding /= fudge_factor + np.sqrt(hist_embedding)
                     word_embedding._embedding -= rate * accumu_embedding
                 # Updating model parameters
                 grbagger.update_params(accumu_grads, rate)
             train_accuracy = float(correct_count) / train_size
             logger.debug('Training epoch: %d, total cost: %f, accuracy = %f'
                         % (i, costs, train_accuracy))
             # Append all the numbers
             track_training_cost.append(costs)
             track_training_acc.append(train_accuracy)
             if train_accuracy > highest_train_accuracy:
                 highest_train_accuracy = train_accuracy
             # Testing
             correct_count = 0
             costs = 0.0
             for j in xrange(test_size):
                 test_sent_rep = senti_test_sparse_select[j].dot(word_embedding.embedding)
                 pred = grbagger.predict(test_sent_rep)
                 cost = grbagger.show_cost(test_sent_rep, senti_test_label[j])
                 if pred == senti_test_label[j]:
                     correct_count += 1
                 costs += cost
             test_accuracy = float(correct_count) / test_size
             logger.debug('Test accuracy: %f' % test_accuracy)
             # Append all the numbers
             track_test_cost.append(costs)
             track_test_acc.append(test_accuracy)
             if test_accuracy > highest_test_accuracy:
                 highest_test_accuracy = test_accuracy
             # Sampling to show the weights and experts of training and test instances
             log_samples('Training', senti_train_set, senti_train_label, sample_size)
             log_samples('Test', senti_test_set, senti_test_label, sample_size)
             # Check norms of the model parameter
             for param in grbagger.params:
                 val = param.get_value(borrow=True)
                 norm = np.sqrt(np.sum(np.square(val)))
                 logger.debug('Parameter: {}, L2-norm: {}'.format(param.name, norm))
             wnorm = np.sqrt(np.sum(np.square(word_embedding._embedding)))
             logger.debug('Parameter: {}, L2-norm: {}'.format('Word-Embedding', wnorm))
     except (Exception, KeyboardInterrupt):
         # Best effort: a failure or a Ctrl-C ends the training but still
         # falls through to the saving code below. SystemExit is no longer
         # swallowed.
         logger.debug('Error appeared!')
         traceback.print_exc(file=sys.stdout)
         logger.debug('-' * 50)
     finally:
         end_time = time.time()
         logger.debug('Time used for training: %f seconds.' % (end_time-start_time))
         logger.debug('Highest training accuracy: %f' % highest_train_accuracy)
         logger.debug('Highest test accuracy: %f' % highest_test_accuracy)
         GrCNNBagger.save('fine-grbagger.model', grbagger)
         # Save all the tracking numbers
         track_training_acc = np.asarray(track_training_acc)
         track_training_cost = np.asarray(track_training_cost)
         track_test_acc = np.asarray(track_test_acc)
         track_test_cost = np.asarray(track_test_cost)
         # np.save emits binary data, so the file must be opened in binary
         # mode; the four arrays are written one after another.
         with open('fine-senti-records.npy', 'wb') as fout:
             np.save(fout, track_training_acc)
             np.save(fout, track_training_cost)
             np.save(fout, track_test_acc)
             np.save(fout, track_test_cost)
         logger.debug('Training and test records saved to fine-senti-records.npy...')
         logger.debug('Finished...')
Пример #3
0
        senti_train_label.append(int(label))
# Read test data set: one '|'-delimited (text, label) pair per row.
# open() is used instead of the Python 2-only file() constructor.
with open(senti_test_filename, 'r') as fin:
    reader = csv.reader(fin, delimiter='|')
    for txt, label in reader:
        senti_test_txt.append(txt)
        senti_test_label.append(int(label))
end_time = time.time()
logger.debug('Time used to load training and test data set: %f seconds.' % (end_time-start_time))
# Load word-embedding
embedding_filename = '../data/wiki_embeddings.txt'
# Load training/test data sets and wiki-embeddings
word_embedding = WordEmbedding(embedding_filename)
embed_dim = word_embedding.embedding_dim()
start_time = time.time()
# Index of the '</s>' token; logged back through index2word as a sanity check.
blank_index = word_embedding.word2index('</s>')
logger.debug('Blank index: {}'.format(word_embedding.index2word(blank_index)))
# Word-vector representation
senti_train_label = np.asarray(senti_train_label, dtype=np.int32)
senti_test_label = np.asarray(senti_test_label, dtype=np.int32)
train_size = len(senti_train_txt)
test_size = len(senti_test_txt)
# Check size
logger.debug('Training size: %d' % train_size)
logger.debug('Test size: %d' % test_size)
# Shuffling for all the instances: one independent random permutation of the
# instance indices per data set.
start_time = time.time()
rindex = np.arange(train_size)
tindex = np.arange(test_size)
np.random.shuffle(rindex)
np.random.shuffle(tindex)
Пример #4
0
	def setUp(self):
		'''
		Load the training and test texts and labels of the sentiment
		analysis task and preprocess them.

		Stored on the instance:
		  blank_token / blank_index: vector and index of the '</s>' token.
		  senti_{train,test}_txt: the raw sentences.
		  senti_{train,test}_label: int32 label arrays.
		  senti_{train,test}_words_label: per sentence, an int32 array of
		    word indices with blank_index at both ends.
		  senti_{train,test}_set: per sentence, a (len(words)+2, dim) matrix
		    of word vectors whose first and last rows are zero.
		  train_size / test_size / word_embedding.
		'''
		np.random.seed(42)
		senti_train_filename = '../data/sentiment-train.txt'
		senti_test_filename = '../data/sentiment-test.txt'
		start_time = time.time()
		# Load Word-Embedding
		embedding_filename = '../data/wiki_embeddings.txt'
		word_embedding = WordEmbedding(embedding_filename)
		embed_dim = word_embedding.embedding_dim()
		# Starting and Ending token for each sentence
		self.blank_token = word_embedding.wordvec('</s>')
		self.blank_index = word_embedding.word2index('</s>')

		def read_dataset(filename):
			# Read one '|'-delimited file and return its texts, its integer
			# labels and, for fine-tuning, the word ids of every sentence.
			texts, labels, words_label = [], [], []
			with open(filename, 'r') as fin:
				for txt, label in csv.reader(fin, delimiter='|'):
					texts.append(txt)
					labels.append(int(label))
					words = [word.lower() for word in txt.split()]
					tmp_indices = np.zeros(len(words)+2, dtype=np.int32)
					tmp_indices[0] = self.blank_index
					tmp_indices[1:-1] = np.asarray([word_embedding.word2index(word) for word in words])
					tmp_indices[-1] = self.blank_index
					words_label.append(tmp_indices)
			return texts, labels, words_label

		def embed(sentences):
			# Build the word-vector matrix of every sentence, with one extra
			# zero row before and after the words.
			embedded = []
			for sent in sentences:
				words = [word.lower() for word in sent.split()]
				vectors = np.zeros((len(words)+2, embed_dim), dtype=floatX)
				if words:
					# Guarded: an empty list cannot be broadcast into the
					# (0, embed_dim) slice of an empty sentence.
					vectors[1:-1, :] = np.asarray([word_embedding.wordvec(word) for word in words])
				embedded.append(vectors)
			return embedded

		# Read training and test data sets
		senti_train_txt, senti_train_label, senti_train_words_label = read_dataset(senti_train_filename)
		senti_test_txt, senti_test_label, senti_test_words_label = read_dataset(senti_test_filename)
		end_time = time.time()
		logger.debug('Time used to load training and test data set: %f seconds.' % (end_time-start_time))
		start_time = time.time()
		# Store original word index representation
		self.senti_train_words_label = senti_train_words_label
		self.senti_test_words_label = senti_test_words_label
		# Store original text representation
		self.senti_train_txt = senti_train_txt
		self.senti_test_txt = senti_test_txt
		# Labels as int32 arrays
		self.senti_train_label = np.asarray(senti_train_label, dtype=np.int32)
		self.senti_test_label = np.asarray(senti_test_label, dtype=np.int32)
		train_size = len(senti_train_txt)
		test_size = len(senti_test_txt)
		# Check size
		assert train_size == self.senti_train_label.shape[0]
		assert test_size == self.senti_test_label.shape[0]
		logger.debug('Training size: %d' % train_size)
		logger.debug('Test size: %d' % test_size)
		# Sequential modeling for each sentence
		self.senti_train_set = embed(senti_train_txt)
		self.senti_test_set = embed(senti_test_txt)
		# Every matrix must have one row per word plus the two boundary rows.
		senti_train_len = [len(sent.split())+2 for sent in senti_train_txt]
		senti_test_len = [len(sent.split())+2 for sent in senti_test_txt]
		assert senti_train_len == [seq.shape[0] for seq in self.senti_train_set]
		assert senti_test_len == [seq.shape[0] for seq in self.senti_test_set]
		end_time = time.time()
		logger.debug('Time used to build initial training and test matrix: %f seconds.' % (end_time-start_time))
		# Store data
		self.train_size = train_size
		self.test_size = test_size
		self.word_embedding = word_embedding