# Imports assumed by the code below. WordEmbedding, GrCNNConfiger and
# GrCNNBagger are project classes; their import paths depend on the
# repository layout and are not spelled out here. floatX and logger are
# likewise set up here only to keep the fragment self-contained.
import csv
import logging
import sys
import time
import traceback

import numpy as np
from scipy.sparse import csc_matrix, lil_matrix
from theano import config

floatX = config.floatX
logger = logging.getLogger(__name__)

# Shuffle the MR dataset, keeping texts and labels aligned
random_index = np.arange(data_size)
np.random.shuffle(random_index)
mr_txt = list(np.asarray(mr_txt)[random_index])
mr_label = list(np.asarray(mr_label)[random_index])
end_time = time.time()
# Record timing
logger.info('Time used to load and shuffle MR dataset: %f seconds.' % (end_time - start_time))
# Load the wiki word-embeddings
embedding_filename = './wiki_embeddings.txt.zip'
word_embedding = WordEmbedding(embedding_filename)
embed_dim = word_embedding.embedding_dim()
start_time = time.time()
blank_index = word_embedding.word2index('</s>')
logger.info('Blank token at index {}: {}'.format(blank_index, word_embedding.index2word(blank_index)))
# Word-vector representation: zero-pad all the sentences to the maximum length
max_len = 52
mr_insts = np.zeros((data_size, max_len, embed_dim), dtype=np.float32)
mr_label = np.asarray(mr_label)[:, np.newaxis]
for i, sent in enumerate(mr_txt):
    words = [word.lower() for word in sent.split()]
    # Truncate to leave room for the two '</s>' boundary tokens
    l = min(len(words), max_len - 2)
    mr_insts[i, 1:l+1, :] = np.asarray([word_embedding.wordvec(word) for word in words[:l]])
    mr_insts[i, 0, :] = mr_insts[i, l+1, :] = word_embedding.wordvec('</s>')
end_time = time.time()
logger.info('Time used to build the instance matrix: %f seconds.' % (end_time - start_time))
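# Layout of each padded instance built above (a sketch, assuming the
# max_len = 52 used here): row 0 and row l+1 hold the '</s>' boundary
# vector, rows 1..l hold the vectors of the first
# l = min(len(words), max_len - 2) words, and rows l+2..max_len-1 remain
# zero padding.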
def testSentimentFineTune(self):
    '''
    Build a small model and use it on the sentiment analysis task,
    fine-tuning the word-embedding matrix.
    '''
    np.random.seed(1991)
    fname = './grCNN.conf'
    configer = GrCNNConfiger(fname)
    senti_train_filename = '../data/sentiment-train.txt'
    # senti_train_filename = '../data/sentiment-train-phrases.txt'
    senti_test_filename = '../data/sentiment-test.txt'
    senti_train_txt, senti_train_label = [], []
    senti_test_txt, senti_test_label = [], []
    start_time = time.time()
    # Read the training data set
    with open(senti_train_filename, 'r') as fin:
        reader = csv.reader(fin, delimiter='|')
        for txt, label in reader:
            senti_train_txt.append(txt)
            senti_train_label.append(int(label))
    # Read the test data set
    with open(senti_test_filename, 'r') as fin:
        reader = csv.reader(fin, delimiter='|')
        for txt, label in reader:
            senti_test_txt.append(txt)
            senti_test_label.append(int(label))
    end_time = time.time()
    logger.debug('Time used to load training and test data set: %f seconds.' % (end_time - start_time))
    # Load the wiki word-embeddings
    embedding_filename = '../data/wiki_embeddings.txt'
    word_embedding = WordEmbedding(embedding_filename)
    embed_dim = word_embedding.embedding_dim()
    start_time = time.time()
    blank_index = word_embedding.word2index('</s>')
    logger.debug('Blank token at index {}: {}'.format(blank_index, word_embedding.index2word(blank_index)))
    # Convert labels to NumPy arrays
    senti_train_label = np.asarray(senti_train_label, dtype=np.int32)
    senti_test_label = np.asarray(senti_test_label, dtype=np.int32)
    train_size = len(senti_train_txt)
    test_size = len(senti_test_txt)
    # Check sizes
    logger.debug('Training size: %d' % train_size)
    logger.debug('Test size: %d' % test_size)
    # Shuffle all the instances
    start_time = time.time()
    rindex = np.arange(train_size)
    tindex = np.arange(test_size)
    np.random.shuffle(rindex)
    np.random.shuffle(tindex)
    # Shuffle labels
    senti_train_label = senti_train_label[rindex]
    senti_test_label = senti_test_label[tindex]
    # Shuffle texts
    senti_train_txt = list(np.asarray(senti_train_txt)[rindex])
    senti_test_txt = list(np.asarray(senti_test_txt)[tindex])
    end_time = time.time()
    logger.debug('Time used to shuffle all the data: %f seconds.' % (end_time - start_time))
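    # The shuffle above keeps texts and labels aligned by indexing both
    # with the same permutation, e.g. (illustrative values only):
    #
    #   perm = np.random.permutation(3)      # say, array([2, 0, 1])
    #   np.asarray(['a', 'b', 'c'])[perm]    # -> array(['c', 'a', 'b'])
    #   np.asarray([0, 1, 1])[perm]          # -> array([1, 0, 1])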
    # Compute word-embedding representations
    start_time = time.time()
    senti_train_set = []
    senti_test_set = []
    # Record the vocabulary index of each word in each sentence, only once
    senti_train_word_index = []
    senti_test_word_index = []
    # Record the sparse input indicator matrix of each sentence, only once,
    # for fast computation
    senti_train_sparse_select = []
    senti_test_sparse_select = []
    # Embedding of the training set
    for i, sent in enumerate(senti_train_txt):
        words = [word.lower() for word in sent.split()]
        vectors = np.zeros((len(words)+2, embed_dim), dtype=np.float32)
        vectors[1:-1] = np.asarray([word_embedding.wordvec(word) for word in words])
        indices = [blank_index]
        indices += [word_embedding.word2index(word) for word in words]
        indices += [blank_index]
        # One-hot selection matrix: row t has a single 1 at the vocabulary
        # index of the t-th token, so sparse_select.dot(E) recovers the
        # sentence matrix from the embedding matrix E
        sparse_select = lil_matrix((len(words)+2, word_embedding.dict_size()), dtype=floatX)
        sparse_select[np.arange(len(words)+2), indices] = 1.0
        sparse_select = csc_matrix(sparse_select)
        senti_train_set.append(vectors)
        senti_train_word_index.append(indices)
        senti_train_sparse_select.append(sparse_select)
    # Embedding of the test set
    for i, sent in enumerate(senti_test_txt):
        words = [word.lower() for word in sent.split()]
        vectors = np.zeros((len(words)+2, embed_dim), dtype=np.float32)
        vectors[1:-1] = np.asarray([word_embedding.wordvec(word) for word in words])
        indices = [blank_index]
        indices += [word_embedding.word2index(word) for word in words]
        indices += [blank_index]
        sparse_select = lil_matrix((len(words)+2, word_embedding.dict_size()), dtype=floatX)
        sparse_select[np.arange(len(words)+2), indices] = 1.0
        sparse_select = csc_matrix(sparse_select)
        senti_test_set.append(vectors)
        senti_test_word_index.append(indices)
        senti_test_sparse_select.append(sparse_select)
    end_time = time.time()
    logger.debug('Time used to build initial matrices: %f seconds.' % (end_time - start_time))
    # Label distribution of both data sets
    p_count = np.sum(senti_train_label)
    logger.debug('Default positive percentage in Train: %f' % (float(p_count) / train_size))
    logger.debug('Default negative percentage in Train: %f' % (float(train_size - p_count) / train_size))
    p_count = np.sum(senti_test_label)
    logger.debug('Default positive percentage in Test: %f' % (float(p_count) / test_size))
    logger.debug('Default negative percentage in Test: %f' % (float(test_size - p_count) / test_size))
    # Now, build the model
    start_time = time.time()
    grbagger = GrCNNBagger(configer, verbose=True)
    end_time = time.time()
    logger.debug('Time used to build the model: %f seconds.' % (end_time - start_time))
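    # A sketch of the scheme implemented by the training loop below: for
    # each mini-batch the summed gradient g and the summed squared gradient
    # h are accumulated per parameter theta, and the update is the
    # AdaGrad-style step
    #
    #     theta <- theta - rate * (g / batch_size) / (fudge_factor + sqrt(h)),
    #
    # with g and h reset at every batch. After training_threshold_epoch
    # epochs the word-embedding matrix E is fine-tuned the same way: since
    # each input is S.dot(E) for a sparse selection matrix S, the gradient
    # with respect to E is S.T.dot(input_grad), which only touches the rows
    # of E for words that actually occur in the sentence.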
    learn_rate = 0.02
    epoch = 200
    batch_size = 20
    # Training with mini-batch stochastic gradient descent
    start_time = time.time()
    highest_train_accuracy, highest_test_accuracy = 0.0, 0.0
    track_training_acc, track_training_cost = [], []
    track_test_acc, track_test_cost = [], []
    training_threshold_epoch = 30
    try:
        # sample_size controls the diagnostic sampling at the end of each
        # epoch; it is disabled by default
        sample_size = 0
        fudge_factor = 1e-6
        # Accumulated gradient and squared-gradient history for the
        # word-embedding matrix
        accumu_embedding = np.zeros((word_embedding.dict_size(), configer.num_input), dtype=floatX)
        hist_embedding = np.zeros((word_embedding.dict_size(), configer.num_input), dtype=floatX)
        for i in range(epoch):
            costs = 0.0
            correct_count = 0
            logger.debug('=' * 50)
            # rate = learn_rate / (1+i)
            rate = learn_rate
            # Training
            num_batch = train_size // batch_size
            for k in range(num_batch):
                # Clear all the cached gradients
                accumu_grads = [np.zeros(param.get_value().shape, dtype=np.float32)
                                for param in grbagger.params]
                hist_grads = [np.zeros(param.get_value().shape, dtype=np.float32)
                              for param in grbagger.params]
                if i > training_threshold_epoch:
                    accumu_embedding[:] = 0.0
                    hist_embedding[:] = 0.0
                for j in range(k * batch_size, (k+1) * batch_size):
                    train_sent_rep = senti_train_sparse_select[j].dot(word_embedding.embedding)
                    results = grbagger.compute_gradient_and_cost(train_sent_rep, senti_train_label[j])
                    input_grad = grbagger.compute_input_gradient(train_sent_rep, senti_train_label[j])
                    grads, cost, pred = results[:-2], results[-2], results[-1]
                    if pred == senti_train_label[j]:
                        correct_count += 1
                    costs += cost
                    for accumu_grad, hist_grad, grad in zip(accumu_grads, hist_grads, grads):
                        accumu_grad += grad
                        hist_grad += np.square(grad)
                    # Accumulate gradients for the word-embedding matrix;
                    # fine-tuning only starts after training_threshold_epoch
                    if i > training_threshold_epoch:
                        tmp = senti_train_sparse_select[j].T.dot(input_grad)
                        accumu_embedding += tmp
                        hist_embedding += np.square(tmp)
                # Update model parameters
                for accumu_grad, hist_grad in zip(accumu_grads, hist_grads):
                    accumu_grad /= batch_size
                    accumu_grad /= fudge_factor + np.sqrt(hist_grad)
                # Update the word-embedding matrix
                if i > training_threshold_epoch:
                    accumu_embedding /= batch_size
                    accumu_embedding /= fudge_factor + np.sqrt(hist_embedding)
                    word_embedding._embedding -= rate * accumu_embedding
                grbagger.update_params(accumu_grads, rate)
            # Clear all the caches again for the last, incomplete batch
            accumu_grads = [np.zeros(param.get_value().shape, dtype=np.float32)
                            for param in grbagger.params]
            hist_grads = [np.zeros(param.get_value().shape, dtype=np.float32)
                          for param in grbagger.params]
            if i > training_threshold_epoch:
                accumu_embedding[:] = 0.0
                hist_embedding[:] = 0.0
            if num_batch * batch_size < train_size:
                for j in range(num_batch * batch_size, train_size):
                    train_sent_rep = senti_train_sparse_select[j].dot(word_embedding.embedding)
                    results = grbagger.compute_gradient_and_cost(train_sent_rep, senti_train_label[j])
                    input_grad = grbagger.compute_input_gradient(train_sent_rep, senti_train_label[j])
                    grads, cost, pred = results[:-2], results[-2], results[-1]
                    if pred == senti_train_label[j]:
                        correct_count += 1
                    costs += cost
                    for accumu_grad, hist_grad, grad in zip(accumu_grads, hist_grads, grads):
                        accumu_grad += grad
                        hist_grad += np.square(grad)
                    # Accumulate gradients for the word-embedding matrix
                    if i > training_threshold_epoch:
                        tmp = senti_train_sparse_select[j].T.dot(input_grad)
                        accumu_embedding += tmp
                        hist_embedding += np.square(tmp)
                # Normalize model-parameter gradients over the residual batch
                for accumu_grad, hist_grad in zip(accumu_grads, hist_grads):
                    accumu_grad /= train_size - num_batch * batch_size
                    accumu_grad /= fudge_factor + np.sqrt(hist_grad)
                # Normalize the word-embedding gradient over the residual batch
                if i > training_threshold_epoch:
                    accumu_embedding /= train_size - num_batch * batch_size
                    accumu_embedding /= fudge_factor + np.sqrt(hist_embedding)
                    word_embedding._embedding -= rate * accumu_embedding
                # Update all the parameters
                grbagger.update_params(accumu_grads, rate)
            train_accuracy = float(correct_count) / train_size
            logger.debug('Training epoch: %d, total cost: %f, accuracy = %f' % (i, costs, train_accuracy))
            # Track the training statistics
            track_training_cost.append(costs)
            track_training_acc.append(train_accuracy)
            if train_accuracy > highest_train_accuracy:
                highest_train_accuracy = train_accuracy
            # Testing
            correct_count = 0
            costs = 0.0
            for j in range(test_size):
                test_sent_rep = senti_test_sparse_select[j].dot(word_embedding.embedding)
                pred = grbagger.predict(test_sent_rep)
                cost = grbagger.show_cost(test_sent_rep, senti_test_label[j])
                if pred == senti_test_label[j]:
                    correct_count += 1
                costs += cost
            test_accuracy = float(correct_count) / test_size
            logger.debug('Test accuracy: %f' % test_accuracy)
            # Track the test statistics
            track_test_cost.append(costs)
            track_test_acc.append(test_accuracy)
            if test_accuracy > highest_test_accuracy:
                highest_test_accuracy = test_accuracy
            # Sample some training and test instances to show their weights,
            # scores and probabilities (skipped while sample_size == 0)
            logger.debug('Training Sampling: ')
            for j in range(sample_size):
                idx = np.random.randint(train_size)
                weights = grbagger.show_weights(senti_train_set[idx])
                scores = grbagger.show_scores(senti_train_set[idx])
                prob = grbagger.show_prob(senti_train_set[idx])
                label = senti_train_label[idx]
                logger.debug('Training idx: {}'.format(idx))
                logger.debug('Training scores: {}'.format(scores))
                logger.debug('Training weights: {}'.format(weights))
                logger.debug('Training probability: {}'.format(prob))
                logger.debug('Training label: {}'.format(label))
                logger.debug('-' * 50)
            logger.debug('Test Sampling: ')
            for j in range(sample_size):
                idx = np.random.randint(test_size)
                weights = grbagger.show_weights(senti_test_set[idx])
                scores = grbagger.show_scores(senti_test_set[idx])
                prob = grbagger.show_prob(senti_test_set[idx])
                label = senti_test_label[idx]
                logger.debug('Test idx: {}'.format(idx))
                logger.debug('Test scores: {}'.format(scores))
                logger.debug('Test weights: {}'.format(weights))
                logger.debug('Test probability: {}'.format(prob))
                logger.debug('Test label: {}'.format(label))
                logger.debug('-' * 50)
            # Check the L2-norms of the model parameters
            for param in grbagger.params:
                val = param.get_value(borrow=True)
                norm = np.sqrt(np.sum(np.square(val)))
                logger.debug('Parameter: {}, L2-norm: {}'.format(param.name, norm))
            wnorm = np.sqrt(np.sum(np.square(word_embedding._embedding)))
            logger.debug('Parameter: {}, L2-norm: {}'.format('Word-Embedding', wnorm))
    except Exception:
        logger.debug('Error appeared!')
        traceback.print_exc(file=sys.stdout)
        logger.debug('-' * 50)
    finally:
        end_time = time.time()
        logger.debug('Time used for training: %f seconds.' % (end_time - start_time))
        logger.debug('Highest training accuracy: %f' % highest_train_accuracy)
        logger.debug('Highest test accuracy: %f' % highest_test_accuracy)
        GrCNNBagger.save('fine-grbagger.model', grbagger)
        # Save all the tracked statistics; np.save is called four times on
        # the same handle, so the arrays are stored back to back
        track_training_acc = np.asarray(track_training_acc)
        track_training_cost = np.asarray(track_training_cost)
        track_test_acc = np.asarray(track_test_acc)
        track_test_cost = np.asarray(track_test_cost)
        with open('fine-senti-records.npy', 'wb') as fout:
            np.save(fout, track_training_acc)
            np.save(fout, track_training_cost)
            np.save(fout, track_test_acc)
            np.save(fout, track_test_cost)
        logger.debug('Training and test records saved to fine-senti-records.npy...')
        logger.debug('Finished...')
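# A minimal sketch of reading the saved records back: the four arrays must
# be loaded in the same order they were saved, from a single file handle.
#
#   with open('fine-senti-records.npy', 'rb') as fin:
#       track_training_acc = np.load(fin)
#       track_training_cost = np.load(fin)
#       track_test_acc = np.load(fin)
#       track_test_cost = np.load(fin)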