# Standard-library / third-party imports used by the Trainer below (assumed;
# the original import block is not shown). Logger, DataManager, RNN_M2O and
# RNN_M2M are project-local modules assumed importable.
import os
import pickle
import re
import shutil
from collections import deque

import numpy as np
import torch as T


class Trainer:
    def __init__(self, is_many_to_one=True, max_epoch=5000, batch_size=10,
                 learning_rate=.01, hidden_size=128, num_hidden_layer=1,
                 drop_rate=0., embedding_len=100, use_tensorboard=False,
                 early_stopping_history_len=7, early_stopping_allowance=3,
                 verbose=1, save_best_model=False, use_cuda=False,
                 data_file_count=-1, identity=None, early_stopping=False,
                 pre_train=None):
        self.logger = Logger(verbose_level=verbose)
        self.is_many_to_one = is_many_to_one
        self.max_epoch = max_epoch
        self.batch_size = batch_size
        self.learning_rate = learning_rate
        self.hidden_size = hidden_size
        self.num_hidden_layer = num_hidden_layer
        self.drop_rate = drop_rate
        self.embedding_len = embedding_len
        self.use_cuda = use_cuda
        self.use_tensorboard = use_tensorboard
        self.early_stopping_history_len = early_stopping_history_len
        self.early_stopping_allowance = early_stopping_allowance
        self.verbose = verbose
        self.save_best_model = save_best_model
        self.data_file_count = data_file_count
        self.identity = identity
        self.early_stopping = early_stopping
        self.pre_train = pre_train

    def train(self):
        data_manager = DataManager(self.batch_size, logger=self.logger,
                                   is_many_to_one=self.is_many_to_one,
                                   data_file_count=self.data_file_count,
                                   pretrained_file=self.pre_train)
        if self.is_many_to_one:
            net = RNN_M2O(len(data_manager.word_list), self.embedding_len,
                          self.hidden_size, self.learning_rate,
                          self.num_hidden_layer, self.drop_rate, use_adam=True,
                          use_cuda=self.use_cuda,
                          pretrained_emb=data_manager.pretrained_embeddings())
        else:
            net = RNN_M2M(len(data_manager.word_list), self.embedding_len,
                          self.hidden_size, self.learning_rate,
                          self.num_hidden_layer, self.drop_rate, use_adam=True,
                          use_cuda=self.use_cuda,
                          pretrained_emb=data_manager.pretrained_embeddings())
        self._train(net, data_manager)

    def _train(self, net, data_manager):
        if self.identity is None:
            identity = 'M2O' if self.is_many_to_one else 'M2M'
            identity += '_' + str(self.learning_rate).replace('.', '')
            identity += '_' + str(self.hidden_size)
            identity += '_' + str(self.num_hidden_layer)
        else:
            identity = self.identity
        if self.use_tensorboard:
            from tensorboardX import SummaryWriter
            if os.path.exists(identity + '_logs'):
                if self.verbose > 0:
                    should_rm = input(' - Log dir exists. Remove (Y/n)?')
                    if should_rm.lower() == 'y' or should_rm == '':
                        shutil.rmtree(identity + '_logs')
            self.writer = SummaryWriter(identity + '_logs')
        train_data_loader = data_manager.train_loader()
        valid_data_loader = data_manager.valid_loader()
        optimizer = net.get_optimizer()
        loss_fn = net.get_loss()
        self.logger.i('Start training %s...' % (identity), True)
        try:
            total_batch_per_epoch = len(train_data_loader)
            perplexity_history = deque(maxlen=self.early_stopping_history_len)
            min_perplexity = 999.
            early_stopping_violate_counter = 0
            # Resume from a previous checkpoint if one exists
            status, _epoch_index, _perplexity_history, _min_perplexity = \
                self._load(net, identity)
            if status:
                perplexity_history = _perplexity_history
                min_perplexity = _min_perplexity
            else:
                _epoch_index = 0
            epoch_index = 0
            for epoch_index in range(_epoch_index, self.max_epoch):
                losses = 0.
                acc = 0.
                counter = 0
                self.logger.i('[ %d / %d ] epoch:' %
                              (epoch_index + 1, self.max_epoch), True)
                # Training
                net.train()
                for batch_index, (data, label) in enumerate(train_data_loader):
                    data = T.autograd.Variable(data)
                    label = T.autograd.Variable(label)
                    if self.use_cuda:
                        data = data.cuda()
                        label = label.cuda()
                    output, predicted = net(data)
                    acc += (label.squeeze() == predicted).float().mean().data \
                        * data.size(0)
                    loss = loss_fn(output.view(-1, len(data_manager.word_list)),
                                   label.view(-1))
                    optimizer.zero_grad()
                    loss.backward()
                    T.nn.utils.clip_grad_norm(net.parameters(), .25)
                    optimizer.step()
                    losses += loss.data.cpu()[0] * data.size(0)
                    counter += data.size(0)
                    progress = min((batch_index + 1) / total_batch_per_epoch * 20., 20.)
                    self.logger.d('[%s] (%3.f%%) loss: %.4f, acc: %.4f' %
                                  ('>' * int(progress) + '-' * (20 - int(progress)),
                                   progress * 5., losses / counter, acc / counter))
                mean_loss = losses / counter
                valid_losses = 0.
                valid_counter = 0
                valid_acc = 0.
                # Validation
                net.eval()
                for data, label in valid_data_loader:
                    data = T.autograd.Variable(T.LongTensor(data))
                    label = T.autograd.Variable(T.LongTensor(label))
                    if self.use_cuda:
                        data = data.cuda()
                        label = label.cuda()
                    output, predicted = net(data)
                    valid_losses += loss_fn(output.view(-1, len(data_manager.word_list)),
                                            label.view(-1)).data.cpu()[0] * data.size(0)
                    valid_acc += (label.squeeze() == predicted).float().mean().data \
                        * data.size(0)
                    valid_counter += data.size(0)
                mean_val_loss = valid_losses / valid_counter
                mean_val_acc = valid_acc / valid_counter
                perplexity = np.exp(mean_val_loss)
                self.logger.d(' -- val_loss: %.4f, val_acc: %.4f, perplexity: %.4f' %
                              (mean_val_loss, mean_val_acc, perplexity),
                              reset_cursor=False)
                # Log with tensorboard
                if self.use_tensorboard:
                    self.writer.add_scalar('train_loss', mean_loss, epoch_index)
                    self.writer.add_scalar('train_acc', acc / counter, epoch_index)
                    self.writer.add_scalar('val_loss', mean_val_loss, epoch_index)
                    self.writer.add_scalar('val_acc', mean_val_acc, epoch_index)
                    self.writer.add_scalar('val_perp', perplexity, epoch_index)
                # Early stopping: stop once validation perplexity has exceeded
                # its recent running mean `early_stopping_allowance` times in a row
                if self.early_stopping and perplexity > np.mean(perplexity_history):
                    early_stopping_violate_counter += 1
                    if early_stopping_violate_counter >= self.early_stopping_allowance:
                        self.logger.i('Early stopping...', True)
                        break
                else:
                    early_stopping_violate_counter = 0
                # Save best model
                if self.save_best_model and perplexity < min_perplexity:
                    self._save(epoch_index, net, perplexity_history, perplexity,
                               identity)
                    min_perplexity = perplexity
                perplexity_history.append(perplexity)
                self.logger.d('', True, False)
        except KeyboardInterrupt:
            self.logger.i('\n\nInterrupted', True)
        if self.use_tensorboard:
            self.writer.close()
        self.logger.i('Finish', True)
        return np.mean(perplexity_history)

    def test(self, id):
        # Recover hyper-parameters from the identity string, e.g. 'M2O_001_128_1'
        _, lr, hs, nh = re.search(r'M2(M|O)_([0-9]+)_([0-9]+)_([0-9]+)_?', id).groups()
        lr, hs, nh = float('0.' + lr[1:]), int(hs), int(nh)
        data_manager = DataManager(self.batch_size, logger=self.logger,
                                   is_many_to_one=self.is_many_to_one,
                                   data_file_count=self.data_file_count,
                                   pretrained_file=self.pre_train, is_test=True)
        if self.is_many_to_one:
            model = RNN_M2O
        else:
            model = RNN_M2M
        net = model(len(data_manager.word_list), self.embedding_len, hs, lr, nh,
                    self.drop_rate, use_adam=True, use_cuda=self.use_cuda,
                    pretrained_emb=data_manager.pretrained_embeddings())
        status, _epoch_index, _perplexity_history, _min_perplexity = \
            self._load(net, id)
        if status:
            loss_fn = net.get_loss()
            # Testing
            test_losses = 0.
            test_acc = 0.
            test_counter = 0
            net.eval()
            for data, label in data_manager.test_loader():
                data = T.autograd.Variable(T.LongTensor(data))
                label = T.autograd.Variable(T.LongTensor(label))
                if self.use_cuda:
                    data = data.cuda()
                    label = label.cuda()
                output, predicted = net(data)
                test_losses += loss_fn(output.view(-1, len(data_manager.word_list)),
                                       label.view(-1)).data.cpu()[0] * data.size(0)
                test_acc += (label.squeeze() == predicted).float().mean().data \
                    * data.size(0)
                test_counter += data.size(0)
            mean_test_loss = test_losses / test_counter
            mean_test_acc = test_acc / test_counter
            perplexity = np.exp(mean_test_loss)
            self.logger.i('Loss: %.4f, Acc: %.4f, Perp: %.4f' %
                          (mean_test_loss, mean_test_acc, perplexity))
            return mean_test_loss, mean_test_acc, perplexity
        else:
            raise AssertionError('Model file not found!')

    def text_generate(self, given_words, id, max_len=150):
        if os.path.exists('data/word_list'):
            word_list = pickle.load(open('data/word_list', 'rb'))
        else:
            raise AssertionError('word_list not found')
        _, lr, hs, nh = re.search(r'M2(M|O)_([0-9]+)_([0-9]+)_([0-9]+)_?', id).groups()
        lr, hs, nh = float('0.' + lr[1:]), int(hs), int(nh)
        if self.is_many_to_one:
            net = RNN_M2O(len(word_list), self.embedding_len, hs, lr, nh,
                          self.drop_rate, use_adam=True, use_cuda=self.use_cuda)
        else:
            net = RNN_M2M(len(word_list), self.embedding_len, hs, lr, nh,
                          self.drop_rate, use_adam=True, use_cuda=self.use_cuda)
        status, _, _, _ = self._load(net, id)
        if status:
            word_index_dict = {w: i for i, w in enumerate(word_list)}
            given_words = given_words.lower().strip().split()
            # Prefix with <sos> (index 1); unknown words map to <unk> (index 2)
            given_words = [1] + [word_index_dict[word] if word in word_index_dict
                                 else 2 for word in given_words]
            state = None
            for i in range(max_len):
                if i < len(given_words):
                    cur_var = T.autograd.Variable(T.LongTensor([[given_words[i]]]))
                    if self.use_cuda:
                        cur_var = cur_var.cuda()
                    _, predicted, state = net(cur_var, state, return_states=True)
                if i >= len(given_words) - 1:
                    if predicted[0].cpu().data[0] > 0:
                        given_words.append(predicted[0].cpu().data[0])
                    else:
                        break
            print('Text generated: %s' %
                  (' '.join([word_list[word] for word in given_words[1:]])))
            print('Finished')
        else:
            raise AssertionError('Save not found!')

    def _save(self, global_step, net, perplexity_history, min_perplexity, identity):
        T.save({
            'epoch': global_step + 1,
            'state_dict': net.state_dict(),
            'perplexity_history': perplexity_history,
            'min_perplexity': min_perplexity,
            'optimizer': net.optimizer.state_dict()
        }, identity + '_best')

    def _load(self, net, identity):
        if os.path.exists(identity + '_best'):
            checkpoint = T.load(identity + '_best')
        elif os.path.exists(identity):
            checkpoint = T.load(identity)
        else:
            return False, None, None, None
        net.load_state_dict(checkpoint['state_dict'])
        net.get_optimizer().load_state_dict(checkpoint['optimizer'])
        return True, checkpoint['epoch'], checkpoint['perplexity_history'], \
            checkpoint['min_perplexity']
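# Usage sketch (illustrative, not from the original source): assuming the
# project modules are importable and the expected dataset files are present,
# a run of the language-model Trainer above might look like this. The
# identity string follows the 'M2O_<lr>_<hidden>_<layers>' pattern that
# test() and text_generate() parse back out with a regex.
if __name__ == '__main__':
    trainer = Trainer(is_many_to_one=True, batch_size=32, learning_rate=.01,
                      hidden_size=128, num_hidden_layer=1,
                      early_stopping=True, save_best_model=True)
    trainer.train()                      # checkpoints to 'M2O_001_128_1_best'
    trainer.test('M2O_001_128_1')        # reloads it, reports loss/acc/perplexity
    trainer.text_generate('once upon a time', 'M2O_001_128_1')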
class Trainer:
    def __init__(self, model_generator, train_dataset, valid_dataset,
                 test_dataset, batch_size=50, max_epoch=1000, use_cuda=True,
                 use_tensorboard=False, early_stopping_history_len=50,
                 early_stopping_patience=5, collate_fn=None, verbose=1,
                 save_best_model=False):
        self.logger = Logger(verbose_level=verbose)
        self.model_generator = model_generator
        self.train_dataset = train_dataset
        self.valid_dataset = valid_dataset
        self.test_dataset = test_dataset
        self.batch_size = batch_size
        self.max_epoch = max_epoch
        self.use_cuda = use_cuda
        self.use_tensorboard = use_tensorboard
        self.early_stopping_history_len = early_stopping_history_len
        self.early_stopping_patience = early_stopping_patience
        self.collate_fn = collate_fn
        self.save_best_model = save_best_model
        self.counter = 0

    def train(self):
        # Train one model per emotion and collect the best correlation
        # coefficients on the validation and test splits
        emotions = self.train_dataset.EMOTIONS
        best_valid_corrcoef = {}
        best_test_corrcoef = {}
        for emotion in emotions:
            self.train_dataset.set_emotion(emotion)
            self.valid_dataset.set_emotion(emotion)
            self.test_dataset.set_emotion(emotion)
            train_loader = T.utils.data.DataLoader(self.train_dataset,
                                                   batch_size=self.batch_size,
                                                   shuffle=True)
            valid_loader = T.utils.data.DataLoader(self.valid_dataset,
                                                   batch_size=self.batch_size,
                                                   shuffle=True)
            test_loader = T.utils.data.DataLoader(self.test_dataset,
                                                  batch_size=self.batch_size,
                                                  shuffle=True)
            model = self.model_generator(self.train_dataset.wordict_size,
                                         self.train_dataset.weight)
            best_valid_corrcoef[emotion], \
                best_test_corrcoef[emotion] = self._train(model, train_loader,
                                                          valid_loader,
                                                          test_loader,
                                                          identity=emotion)
            del model, train_loader, valid_loader
        best_valid_corrcoef['avg'] = np.mean(
            [best_valid_corrcoef[emotion] for emotion in emotions])
        best_test_corrcoef['avg'] = np.mean(
            [best_test_corrcoef[emotion] for emotion in emotions])
        # self.logger.i('\n'+str(best_valid_corrcoef), True, True)
        # self.logger.i('\n'+str(best_test_corrcoef), True, True)
        return best_valid_corrcoef, best_test_corrcoef

    def _train(self, model, train_loader, valid_loader, test_loader,
               identity=None):
        if identity is None:
            identity = 'Net' + str(self.counter)
            self.counter += 1
        if self.use_tensorboard:
            from tensorboardX import SummaryWriter
            self.writer = SummaryWriter(identity + '_logs')
        self.logger.i('Start training %s...' % (identity), True)
        try:
            total_batch_per_epoch = len(train_loader)
            loss_history = deque(maxlen=self.early_stopping_history_len)
            best_corrcoef = -1.
            last_test_corrcoef = -1.
            # early_stopping_violate_counter = 0
            epoch_index = 0
            for epoch_index in range(self.max_epoch):
                losses = 0.
                # acc = 0.
                counter = 0
                self.logger.i('[ %d / %d ] epoch:' %
                              (epoch_index + 1, self.max_epoch), True)
                # Training
                model.train()
                for batch_index, entry in enumerate(train_loader):
                    if self.collate_fn is not None:
                        data, label = self.collate_fn(entry)
                    else:
                        data, label = entry
                    data = T.autograd.Variable(data)
                    label = T.autograd.Variable(label)
                    if self.use_cuda:
                        data = data.cuda()
                        label = label.cuda()
                    output, predicted = model(data)
                    # acc += (label.squeeze() == predicted).float().mean().data * data.size(0)
                    loss = model.loss_fn(output, label.view(-1))
                    model.optimizer.zero_grad()
                    loss.backward()
                    T.nn.utils.clip_grad_norm(model.parameters(), .25)
                    model.optimizer.step()
                    losses += loss.data.cpu()[0] * data.size(0)
                    counter += data.size(0)
                    progress = min((batch_index + 1) / total_batch_per_epoch * 20., 20.)
                    self.logger.d('[%s] (%3.f%%) loss: %.4f, ' %
                                  ('>' * int(progress) + '-' * (20 - int(progress)),
                                   progress * 5., losses / counter))
                mean_loss = losses / counter
                valid_losses = 0.
                valid_counter = 0
                # valid_acc = 0.
                # Validation
                model.eval()
                valid_prediction = []
                valid_labels = []
                for entry in valid_loader:
                    if self.collate_fn is not None:
                        data, label = self.collate_fn(entry)
                    else:
                        data, label = entry
                    valid_labels += list(label.view(-1))
                    data = T.autograd.Variable(data)
                    label = T.autograd.Variable(label)
                    if self.use_cuda:
                        data = data.cuda()
                        label = label.cuda()
                    output, predicted = model(data)
                    valid_losses += model.loss_fn(
                        output, label.view(-1)).data.cpu()[0] * data.size(0)
                    valid_prediction += list(predicted.view(-1).data.tolist())
                    # valid_acc += (label.squeeze() == predicted).float().mean().data * data.size(0)
                    valid_counter += data.size(0)
                mean_val_loss = valid_losses / valid_counter
                # mean_val_acc = valid_acc/valid_counter
                corrcoef = np.corrcoef(valid_prediction, valid_labels)[0, 1]
                self.logger.d(' -- val_loss: %.4f, corrcoef: %.4f' %
                              (mean_val_loss, corrcoef), reset_cursor=False)
                # Log with tensorboard
                if self.use_tensorboard:
                    self.writer.add_scalar('train_loss', mean_loss, epoch_index)
                    # self.writer.add_scalar('train_acc', acc / counter, epoch_index)
                    self.writer.add_scalar('val_loss', mean_val_loss, epoch_index)
                    # self.writer.add_scalar('val_acc', mean_val_acc, epoch_index)
                    self.writer.add_scalar('val_corrcoef', corrcoef, epoch_index)
                loss_history.append(mean_val_loss)
                # # Early stopping
                # if mean_val_loss > np.mean(loss_history):
                #     early_stopping_violate_counter += 1
                #     if early_stopping_violate_counter >= self.early_stopping_patience:
                #         self.logger.i('Early stopping...', True)
                #         break
                # else:
                #     early_stopping_violate_counter = 0
                # Save best model
                if corrcoef > best_corrcoef:
                    best_corrcoef = corrcoef
                    # last_test_corrcoef = self._test(model, test_loader)
                    # self.logger.d(' -- test_corrcoef: %.4f'%(last_test_corrcoef),
                    #               reset_cursor=False)
                    if self.save_best_model:
                        self._save(model, epoch_index, loss_history,
                                   best_corrcoef, identity)
                self.logger.d('', True, False)
        except KeyboardInterrupt:
            self.logger.i('\n\nInterrupted', True)
        if self.use_tensorboard:
            self.writer.close()
        self.logger.i('Finish', True)
        return best_corrcoef, last_test_corrcoef

    def _test(self, model, test_loader):
        model.eval()
        test_prediction = []
        test_labels = []
        for entry in test_loader:
            if self.collate_fn is not None:
                data, label = self.collate_fn(entry)
            else:
                data, label = entry
            test_labels += list(label.view(-1))
            data = T.autograd.Variable(data)
            label = T.autograd.Variable(label)
            if self.use_cuda:
                data = data.cuda()
                label = label.cuda()
            _, predicted = model(data)
            test_prediction += list(predicted.view(-1).data.tolist())
        return np.corrcoef(test_prediction, test_labels)[0, 1]

    def _save(self, model, global_step, loss_history, best_corrcoef, identity):
        T.save({
            'epoch': global_step + 1,
            'state_dict': model.state_dict(),
            'loss_history': loss_history,
            'best_corrcoef': best_corrcoef,
            'optimizer': model.optimizer.state_dict()
        }, identity + '_best')
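# Illustrative driver (a sketch, not from the original source). Assumptions:
# each dataset exposes EMOTIONS, set_emotion(), wordict_size and weight, as
# the train() loop above requires; RegressionNet is a hypothetical model class
# whose forward() returns (output, predicted) and which carries .loss_fn and
# .optimizer attributes, matching what _train() expects.
def make_model(wordict_size, pretrained_weight):
    return RegressionNet(wordict_size, pretrained_weight)  # hypothetical class

trainer = Trainer(make_model, train_dataset, valid_dataset, test_dataset,
                  batch_size=50, use_cuda=True, save_best_model=True)
best_valid, best_test = trainer.train()  # dicts keyed by emotion, plus 'avg'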
class Data(Dataset):
    '''Data loading'''

    def __init__(self, filename='twitter_sentiment.csv.gz', window_size=2,
                 for_embedding=False, logger=None, wordlist_file=None):
        self.window_size = window_size
        self.for_embedding = for_embedding
        if logger is None:
            self.logger = Logger(1)
        else:
            self.logger = logger
        self.logger.i('Initializing Loader....')
        x = []  # Input
        self.x = []
        self.y_ = []  # Ground truth
        self.context_vec = []
        self.target_word = []
        self.max_sentence_len = 0
        word_set = set()
        word_counter = Counter()
        # Read labelled data from file
        with gzip.open(filename, 'rt') as dfile:
            lines = dfile.readlines()[1:]
            num_line = len(lines)
            for index, line in enumerate(lines):
                _, sentiment, sentence = line.split('\t')
                words = self._clean_str(sentence).split()
                self.max_sentence_len = np.max([self.max_sentence_len, len(words)])
                word_set = word_set.union(words)
                word_counter += Counter(words)
                x.append(words)
                self.y_.append(T.LongTensor([1 if sentiment == 'pos' else 0]))
                self.logger.d('Loader: Read %6d / %6d line' % (index + 1, num_line))
        # Build word dictionary, keeping only words seen more than 3 times
        filter_words = [key for key, count in dict(word_counter).items()
                        if count > 3]
        self.word_dict = {word: index + 1
                          for index, word in enumerate(filter_words)}
        # self.word_dict = {word: index+1 for index, word in enumerate(dict(word_counter))}
        self.word_dict['<unk>'] = 0
        self.word_counter = word_counter
        self.word_count = len(self.word_dict)
        if for_embedding:
            for word_seq in x:
                words, target = self._to_context_vec(word_seq)
                self.context_vec.extend(words)
                self.target_word.extend(target)
        else:
            for word_seq in x:
                self.x.append([self._to_index(word) for word in word_seq])
        del x
        self.len = len(self.x)
        if wordlist_file is not None:
            with open(wordlist_file, 'w+') as wlfile:
                for key, _ in sorted(self.word_dict.items(), key=lambda x: x[1]):
                    wlfile.write(key + '\n')
        self.logger.i('Loader initialized', True)
        self.logger.i('Word Count: %d' % (self.word_count), True)
        self.logger.i('Number of unknown words: %d' %
                      (len(self.word_counter) - len(self.word_dict) + 1), True)

    def _to_index(self, word):
        if word in self.word_dict.keys():
            return self.word_dict[word]
        else:
            return self.word_dict['<unk>']

    def _to_word(self, index):
        # dict views are not indexable in Python 3, so materialise them first
        keys = list(self.word_dict.keys())
        values = list(self.word_dict.values())
        return keys[values.index(index)]

    def __getitem__(self, index):
        if self.for_embedding:
            return self.context_vec[index], self.target_word[index]
        else:
            return self.x[index], self.y_[index]

    def _get_max_sentence_len(self):
        return self.max_sentence_len

    def __len__(self):
        if self.for_embedding:
            return len(self.context_vec)
        else:
            return self.len

    def _clean_str(self, string):
        '''Remove noise from input string'''
        string = re.sub(r'&[a-zA-Z];', ' ', string)
        string = re.sub(r'[^A-Za-z0-9,!?\(\)\.\'\`]', ' ', string)
        string = re.sub(r'[0-9]+', ' <num> ', string)
        string = re.sub(r'( \' ?)|( ?\' )', ' ', string)
        string = re.sub(r'(\'s|\'ve|n\'t|\'re|\'d|\'ll|\.|,|!|\?|\(|\))',
                        r' \1 ', string)
        string = re.sub(r'\s{2,}', ' ', string)
        return string.strip().lower()

    def _to_context_vec(self, word_seq):
        '''Convert sentence to context vectors'''
        input_words = []
        target_word = []
        buffer_len = self.window_size * 2 + 1
        window = deque(maxlen=buffer_len)
        for word in word_seq:
            window.append(word)
            if len(window) == buffer_len:
                tmp_window = [self._to_index(w) for w in list(window.copy())]
                target = tmp_window[self.window_size]
                del tmp_window[self.window_size]
                input_words.append(T.LongTensor(tmp_window))
                target_word.append(T.LongTensor([target]))
        return input_words, target_word

    def _get_nce_weight(self):
        '''Get weight for generating noise'''
        # Unigram distribution raised to the 3/4 power, then softmax-normalised
        power = .75
        denominator = sum(np.power(list(self.word_counter.values()), power))
        freq_vec = [0.]
        for word, count in self.word_counter.items():
            if word in self.word_dict.keys():
                freq_vec.append(math.pow(count, power) / denominator)
        freq_vec[0] = np.mean(freq_vec)
        # Subtract the max before exponentiating for numerical stability
        # (convert to an array so the subtraction broadcasts)
        exp_x = np.exp(np.asarray(freq_vec) - np.max(freq_vec))
        return exp_x / exp_x.sum()
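# A minimal, self-contained illustration of the sliding-window scheme that
# _to_context_vec() above implements: with window_size=2 the buffer holds 5
# words, the centre word is the target and its 4 neighbours are the context.
# Toy example, independent of the Data class:
from collections import deque

words = 'the quick brown fox jumps over'.split()
window_size = 2
buffer_len = window_size * 2 + 1
window = deque(maxlen=buffer_len)
for word in words:
    window.append(word)
    if len(window) == buffer_len:
        buf = list(window)
        target = buf[window_size]
        context = buf[:window_size] + buf[window_size + 1:]
        print(context, '->', target)
# ['the', 'quick', 'fox', 'jumps'] -> brown
# ['quick', 'brown', 'jumps', 'over'] -> fox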
class CBOW(T.nn.Module):
    def __init__(self, embedding_len, lr=1., momentum=.9, batch_size=50,
                 window_size=2, epoch=1, use_cuda=False, embedding_path=None,
                 verbose=1, tensorboard=False, wordlist_path=None,
                 log_folder='runs', use_nce=False):
        super(CBOW, self).__init__()
        self.embedding_len = embedding_len
        self.lr = lr
        self.momentum = momentum
        self.epoch = epoch
        self.use_cuda = use_cuda
        self.embedding_path = embedding_path
        self.logger = Logger(verbose)
        self.tensorboard = tensorboard
        self.use_nce = use_nce
        if self.tensorboard:
            from tensorboardX import SummaryWriter
            self.writer = SummaryWriter(log_folder)
        self.loader = Loader(for_embedding=True, window_size=window_size,
                             batch_size=batch_size, logger=self.logger,
                             wordlist_file=wordlist_path)
        self.vocab_size = self.loader.get_vocab_size()
        self._build_model()

    def __del__(self):
        if self.tensorboard:
            self.writer.close()

    def _build_model(self):
        def init_weight(m):
            # He-style initialisation: N(0, 1) scaled by sqrt(2 / fan_in)
            m.weight.data.normal_().mul_(
                T.FloatTensor([2 / m.weight.data.size()[0]]).sqrt_())

        self.embeddings = T.nn.Embedding(self.vocab_size, self.embedding_len)
        if self.embedding_path is None:
            self.embeddings.apply(init_weight)
        elif os.path.exists(self.embedding_path):
            self.embeddings.weight.data.copy_(
                T.from_numpy(np.loadtxt(self.embedding_path)))
        if self.use_nce:
            self.loss_fn = NCELoss(self.vocab_size, self.embedding_len,
                                   self.use_cuda, self.loader.get_nce_weight())
        else:
            self.fc = T.nn.Linear(self.embedding_len, self.vocab_size)
            self.fc.apply(init_weight)
            self.loss_fn = T.nn.CrossEntropyLoss()
        if self.use_cuda:
            self.cuda()
        if self.momentum > 0.:
            self.optimizer = T.optim.SGD(self.parameters(), lr=self.lr,
                                         momentum=self.momentum, nesterov=True)
        else:
            self.optimizer = T.optim.SGD(self.parameters(), lr=self.lr,
                                         momentum=0., nesterov=False)
        # self.optimizer = T.optim.Adam(self.parameters(), lr=.01)

    def forward(self, inputs):
        embeddings = self.embeddings(inputs)
        if self.use_nce:
            return embeddings
        else:
            sum_vector = embeddings.mean(dim=1)
            output = self.fc(sum_vector)
            # output = T.nn.functional.softmax(output, dim=1)
            _, max_indice = T.max(output, dim=1)
            return output, max_indice

    def fit(self):
        self.logger.i('Start training network...', True)
        try:
            total_batch_per_epoch = len(self.loader)
            loss_history = deque(maxlen=50)
            epoch_index = 0
            for epoch_index in range(self.epoch):
                losses = 0.
                acc = 0.
                counter = 0
                self.logger.i('[ %d / %d ] epoch:' %
                              (epoch_index + 1, self.epoch), True)
                for batch_index, (context, target) in enumerate(self.loader):
                    context = T.autograd.Variable(context)
                    target = T.autograd.Variable(target)
                    if self.use_cuda:
                        context, target = context.cuda(), target.cuda()
                    if self.use_nce:
                        output = self(context)
                        acc = math.nan  # accuracy is undefined under NCE
                        loss = self.loss_fn(output, target, 5)
                    else:
                        output, predicted = self(context)
                        acc += (target.squeeze() == predicted).float().mean().data
                        loss = self.loss_fn(output, target.view(-1))
                    self.optimizer.zero_grad()
                    loss.backward()
                    self.optimizer.step()
                    losses += loss.data.cpu()[0]
                    counter += 1
                    progress = min((batch_index + 1) / total_batch_per_epoch * 20., 20.)
                    self.logger.d('[%s] (%3.f%%) loss: %.4f, acc: %.4f' %
                                  ('>' * int(progress) + '-' * (20 - int(progress)),
                                   progress * 5., losses / counter, acc / counter))
                mean_loss = losses / counter
                if self.tensorboard:
                    self.writer.add_scalar('train_loss', mean_loss, epoch_index)
                    self.writer.add_scalar('train_acc', acc / counter, epoch_index)
                loss_history.append(mean_loss)
                # Early stopping: stop when the epoch loss rises above its
                # recent running mean
                if mean_loss > np.mean(loss_history):
                    self.logger.i('Early stopping...', True)
                    break
                self.logger.d('', True, False)
        except KeyboardInterrupt:
            self.logger.i('\n\nInterrupted', True)
        self.logger.i('Saving word embeddings...')
        self._save_embeddings(epoch_index + 1)
        self.logger.i('Word embeddings saved', True)
        self.logger.i('Finish', True)

    def _save_embeddings(self, global_step=0):
        embeds = self.embeddings.weight.data.cpu().numpy()
        np.savetxt(self.embedding_path, embeds)
        if self.tensorboard:
            self.writer.add_embedding(
                self.embeddings.weight.data,
                [key for key, value in sorted(self.loader.dataset.word_dict.items(),
                                              key=lambda x: x[1])],
                global_step=global_step)

    def get_word_embedding(self, word):
        return self.embeddings.weight.data[self.loader.to_index(word)]

    def get_similarity(self, w1, w2):
        # Cosine similarity of two word vectors
        w1, w2 = self.get_word_embedding(w1), self.get_word_embedding(w2)
        w1, w2 = T.nn.functional.normalize(w1, dim=0), \
            T.nn.functional.normalize(w2, dim=0)
        return (w1 * w2).sum(dim=0)
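# Usage sketch (illustrative): trains CBOW embeddings over the corpus the
# Loader reads and queries a similarity afterwards. 'embeddings.txt' and
# 'word_list.txt' are hypothetical paths, not names from the original source.
cbow = CBOW(embedding_len=100, lr=1., batch_size=50, window_size=2, epoch=5,
            embedding_path='embeddings.txt', wordlist_path='word_list.txt')
cbow.fit()  # also saves the embedding matrix via np.savetxt on finish/interrupt
print(cbow.get_similarity('good', 'great'))  # cosine similarity of two words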
# Dataset construction: loads labelled tweets, builds (or reloads) the
# sentiment and word lists, converts text to index sequences, and right-pads
# them to a fixed width.
def __init__(self, raw_data, train_valid_ratio=.2, do_cleaning=True, **args):
    if '_empty' not in args or not args['_empty']:
        self.label = []
        self.data = []
        self.valid_data = []
        self.valid_label = []
        sentiments = set()
        self.word_counter = Counter()
        self.max_len = 0
        Log = Logger()
        Log.i('Start loading dataset...')
        # Load lists if saved previously
        has_lists = os.path.exists('sentiment_list') and os.path.exists('word_list')
        if has_lists:
            Log.d('Sentiment and word list found!')
            with open('sentiment_list', 'r') as sf:
                self.sentiments = sf.read().strip().split('\n')
            tmp_dict = {}
            with open('word_list', 'r') as wf:
                for line in wf.readlines():
                    word, freq = line.strip().split()
                    tmp_dict[word] = int(freq)
            self.word_counter = Counter(tmp_dict)
            self.word_list = ['<pad>', '<unk>'] + \
                [key for key, value in self.word_counter.items() if value >= 3]
            del tmp_dict
            if len(self.sentiments) == 0 or len(self.word_list) == 0:
                raise AssertionError('either sentiment or word list is empty')
            self.sentiments = {word: index
                               for index, word in enumerate(self.sentiments)}
            self.word_list = {word: index
                              for index, word in enumerate(self.word_list)}
        if isinstance(raw_data, str):
            raw_data = raw_data.strip().split('\n')[1:]
        raw_data, valid_raw_data = train_test_split(
            raw_data, test_size=train_valid_ratio, random_state=0)
        data_len = len(raw_data)
        valid_data_len = len(valid_raw_data)
        # Add data and label to array
        for index, line in enumerate(raw_data):
            cols = line.split(',', 3)
            if do_cleaning:
                words = _clean_str(cols[3].strip('"')).split()
            else:
                words = cols[3].strip('"').split()
            self.max_len = max(self.max_len, len(words))
            # tweet_id and author columns are ignored
            if not has_lists:
                sentiments.add(cols[1])
                self.label.append([cols[1]])
                self.word_counter += Counter(words)
                self.data.append(words)
            else:
                self.label.append([self.sentiments[cols[1]]])
                self.data.append([
                    self.word_list[word] if word in self.word_list
                    else self.word_list['<unk>'] for word in words
                ])
            Log.i('Loading %6d / %6d' % (index, data_len + valid_data_len))
        for index, line in enumerate(valid_raw_data):
            cols = line.split(',', 3)
            if do_cleaning:
                words = _clean_str(cols[3].strip('"')).split()
            else:
                words = cols[3].strip('"').split()
            self.max_len = max(self.max_len, len(words))
            # tweet_id and author columns are ignored
            if not has_lists:
                self.valid_label.append([cols[1]])
                self.valid_data.append(words)
            else:
                self.valid_label.append([self.sentiments[cols[1]]])
                self.valid_data.append([
                    self.word_list[word] if word in self.word_list
                    else self.word_list['<unk>'] for word in words
                ])
            Log.i('Loading %6d / %6d' %
                  (index + data_len, data_len + valid_data_len))
        Log.i('Finish loading', True)
        Log.i('Start preprocessing...')
        if not has_lists:
            # Denoise by setting minimum freq
            self.word_list = ['<pad>', '<unk>'] + \
                [key for key, value in self.word_counter.items() if value >= 3]
            # Save sentiment and word list
            self.sentiments = list(sentiments)
            if len(self.sentiments) > 0 and len(self.word_list) > 0:
                with open('sentiment_list', 'w+') as sf:
                    for sentiment in self.sentiments:
                        sf.write(sentiment + '\n')
                with open('word_list', 'w+') as wf:
                    for word, freq in dict(self.word_counter).items():
                        wf.write(word + ' ' + str(freq) + '\n')
            else:
                raise AssertionError('either sentiment or word list is empty')
            # Convert to dict for fast searching
            self.sentiments = {word: index
                               for index, word in enumerate(self.sentiments)}
            self.word_list = {word: index
                              for index, word in enumerate(self.word_list)}
            # Convert text to index; <unk> (index 1) if word not found
            for index, [data_ent, label_ent] in enumerate(zip(self.data, self.label)):
                self.data[index] = [
                    self.word_list[word] if word in self.word_list
                    else self.word_list['<unk>'] for word in data_ent
                ]
                self.label[index] = [self.sentiments[word] for word in label_ent]
            # Convert text to index; <unk> (index 1) if word not found
            for index, [data_ent, label_ent] in enumerate(
                    zip(self.valid_data, self.valid_label)):
                self.valid_data[index] = [
                    self.word_list[word] if word in self.word_list
                    else self.word_list['<unk>'] for word in data_ent
                ]
                self.valid_label[index] = [self.sentiments[word]
                                           for word in label_ent]
        data_len_list = [len(line) for line in self.data]
        self.data_len_mean = np.mean(data_len_list)
        self.data_len_std = np.std(data_len_list)
        # Right-pad every sequence with <pad> (index 0) up to max_len
        self.data = [entry + [0] * (self.max_len - len(entry))
                     for entry in self.data]
        self.valid_data = [entry + [0] * (self.max_len - len(entry))
                           for entry in self.valid_data]
        Log.i('Finish preprocessing', True)
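# The padding step above right-pads every index sequence with 0 (the <pad>
# index) up to max_len so batches stack into fixed-width tensors. A toy
# equivalent of that list comprehension:
max_len = 5
batch = [[4, 9, 7], [3, 12, 5, 8]]
padded = [entry + [0] * (max_len - len(entry)) for entry in batch]
# padded == [[4, 9, 7, 0, 0], [3, 12, 5, 8, 0]]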
class DataManager:
    def __init__(self, batch_size=50, max_seq=7, logger=None,
                 is_many_to_one=False, train_valid_ratio=.2, is_test=False,
                 data_split_mode='window', data_file_count=-1,
                 pretrained_file=None):
        self.batch_size = batch_size
        self.max_seq = max_seq
        if logger is None:
            self.logger = Logger(0)
        else:
            self.logger = logger
        self.is_test = is_test
        self.data_split_mode = data_split_mode
        if self.data_split_mode not in ['window', 'sentence']:
            raise AssertionError('unknown split mode')
        self.data_file_count = data_file_count
        self.pretrained_file = pretrained_file
        # Reserve for <sos>
        self.max_seq += 1
        # mkdir data folder
        if not os.path.exists('data'):
            os.mkdir('data')
        # File path
        self.file_path_subfix = '_M2O' if is_many_to_one else '_M2M'
        self.file_path_prefix = 'Data/Test' if self.is_test else 'Data/Train'
        self.data = ''
        self.dataset = None
        self.word_list = None
        self.train_dataset, self.valid_dataset = None, None
        self.word_counter = None
        self.tensors = None
        if self.pretrained_file is not None:
            self.tensors = T.Tensor(self._load_from_pretrain())
            self.word_index_dict = {w: i for i, w in enumerate(self.word_list)}
            is_wordlist_loaded = True
        else:
            is_wordlist_loaded = self._load_wordlist()
        if self.is_test:
            if not self._load_dataset():
                self._read_files()
                self.dataset = Dataset(self.data, self.word_index_dict,
                                       is_many_to_one, self.max_seq - 1)
                pickle.dump(self.dataset,
                            open('data/test_data' + self.file_path_subfix, 'wb+'))
        else:
            if not self._load_dataset():
                # Load previous split data
                status, train_data, valid_data = self._load_data()
                if not status:
                    # No split data found
                    self._read_files()
                    train_data, valid_data = train_test_split(
                        self.data, test_size=train_valid_ratio, random_state=0)
                    pickle.dump(train_data, open('data/train_data', 'wb+'))
                    pickle.dump(valid_data, open('data/valid_data', 'wb+'))
                if (not is_wordlist_loaded or not os.path.exists('data/word_counter')) \
                        and self.pretrained_file is None:
                    # Generate word list
                    self.logger.i('Start counting words because word list or '
                                  'word counter not found...')
                    self.word_counter = Counter()
                    flatten_train_data = [x for sublist in train_data
                                          for x in sublist]
                    # Only keep words in training data
                    self.word_counter += Counter(flatten_train_data)
                    del self.word_counter['<sos>']
                    # Set min freq
                    # filtered_word_list = [k for k, v in self.word_counter.items() if v >= 3]
                    # self.word_list = ['<pad>', '<sos>', '<unk>']+filtered_word_list
                    self.word_list = ['<pad>', '<sos>', '<unk>'] + \
                        list(self.word_counter.keys())
                    self.word_index_dict = {w: i
                                            for i, w in enumerate(self.word_list)}
                    # Save word list
                    pickle.dump(self.word_list, open('data/word_list', 'wb+'))
                    # Count words for statistics
                    flatten_valid_data = [x for sublist in valid_data
                                          for x in sublist]
                    self.word_counter += Counter(flatten_valid_data)
                    # Update unknown words for statistics
                    self.word_counter += Counter({'<unk>': 0})
                    self.logger.i('Getting unknown word list...')
                    unk_word_list = list(
                        filter(lambda p: p[0] not in self.word_list,
                               self.word_counter.items()))
                    self.logger.i('Start deleting words in validation set but '
                                  'not in training set...')
                    unk_word_list_len = len(unk_word_list)
                    for index, [k, v] in enumerate(unk_word_list):
                        del self.word_counter[k]
                        self.word_counter['<unk>'] += v
                        self.logger.i('Deleting... %5d / %5d' %
                                      (index + 1, unk_word_list_len))
                    del self.word_counter['<sos>']
                    # Save word counter
                    pickle.dump(self.word_counter,
                                open('data/word_counter', 'wb+'))
                    self.logger.i('Finish building word list and word counter')
                if self.train_dataset is None and self.valid_dataset is None:
                    # Save training and validation dataset
                    self.train_dataset = Dataset(train_data, self.word_index_dict,
                                                 is_many_to_one, self.max_seq - 1)
                    self.valid_dataset = Dataset(valid_data, self.word_index_dict,
                                                 is_many_to_one, self.max_seq - 1)
                    pickle.dump(self.train_dataset,
                                open('data/train_data' + self.file_path_subfix, 'wb+'))
                    pickle.dump(self.valid_dataset,
                                open('data/valid_data' + self.file_path_subfix, 'wb+'))
                    self.logger.i('Finish generating training set and validation set')

    def _load_dataset(self):
        if self.is_test and os.path.exists('data/test_data' + self.file_path_subfix):
            self.dataset = pickle.load(
                open('data/test_data' + self.file_path_subfix, 'rb'))
        elif not self.is_test and os.path.exists('data/train_data' + self.file_path_subfix) \
                and os.path.exists('data/valid_data' + self.file_path_subfix):
            self.train_dataset = pickle.load(
                open('data/train_data' + self.file_path_subfix, 'rb'))
            self.valid_dataset = pickle.load(
                open('data/valid_data' + self.file_path_subfix, 'rb'))
        else:
            return False
        self.logger.i('Dataset found!')
        return True

    def _load_wordlist(self):
        if os.path.exists('data/word_list'):
            self.logger.i('Word list found!')
            self.word_list = pickle.load(open('data/word_list', 'rb'))
            self.word_index_dict = {w: i for i, w in enumerate(self.word_list)}
        elif self.is_test:
            raise AssertionError('word_list not found')
        else:
            return False
        return True

    def _load_data(self):
        if os.path.exists('data/train_data') and os.path.exists('data/valid_data'):
            train_data = pickle.load(open('data/train_data', 'rb'))
            valid_data = pickle.load(open('data/valid_data', 'rb'))
            self.logger.i('Training dataset and validation dataset found!')
            return True, train_data, valid_data
        return False, None, None

    def _read_files(self):
        if os.path.exists('hw4_dataset.zip'):
            with ZipFile('hw4_dataset.zip', 'r') as zf:
                if self.data_file_count < 0:
                    file_count = len(zf.filelist)
                else:
                    file_count = self.data_file_count
                self.logger.i('Start loading dataset...')
                valid_file_counter = 0
                file_list = []
                for f in zf.filelist:
                    if f.file_size > 0:
                        if f.filename.startswith(self.file_path_prefix):
                            text = zf.read(f.filename).decode('utf-8').lower()
                            # Keep only the body between the last '*end*'
                            # header marker and the trailing 'end'
                            text = text[text.rindex('*end*') + len('*end*'):
                                        text.rindex('end')]
                            self.data += clean_str(text) + ' \n '
                            valid_file_counter += 1
                            file_list.append(f.filename)
                            self.logger.i('Loading %3d docs' % (valid_file_counter))
                            if valid_file_counter >= file_count:
                                break
                with open('files_used', 'w+') as fu:
                    for file_name in file_list:
                        fu.write(file_name + '\n')
            if self.data_split_mode == 'window':
                # Slide a max_seq-wide window over the whole corpus, treating
                # '\n' as a sentence boundary (<sos>)
                tmp_data = self.data
                self.data = []
                window = deque(maxlen=self.max_seq)
                window.append('<sos>')
                for word in tmp_data.strip().split(' '):
                    if word == '\n':
                        word = '<sos>'
                    window.append(word)
                    if len(window) == self.max_seq:
                        self.data.append(window.copy())
            else:
                self.data = [['<sos>'] + entry.split(' ')
                             for entry in self.data.split('\n')]

                # Limit sentence len
                def splitter(d):
                    for i in range(math.ceil(len(d) / self.max_seq)):
                        yield d[self.max_seq * i:self.max_seq * (i + 1)]

                for index, entry in enumerate(self.data):
                    if len(entry) > self.max_seq:
                        splits = list(splitter(entry))
                        self.data[index] = splits[0]
                        self.data.extend(splits[1:])
                self.data = list(filter(lambda x: len(x) > 2, self.data))
        else:
            raise AssertionError('hw4_dataset.zip not found')

    def _load_from_pretrain(self):
        self.logger.i('Loading pre-trained embeddings...')
        if not (os.path.exists('data/pre_trained_word_list')
                and os.path.exists('data/pre_trained_embeddings')):
            self.word_list = ['<pad>', '<sos>', '<unk>', '<num>']
            tensors = [[], [], [], []]
            special_word_dict = {'<pad>': 0, '<sos>': 1, '<unk>': 2,
                                 '<unknown>': 2, '<num>': 3, '<number>': 3}
            is_digit = re.compile(r'^[0-9e\.\-\+]+$')
            is_in_limited_char_set = re.compile(r'^[A-Za-z0-9,!?\(\)\.\'\`\"\-]+$')
            with open(self.pretrained_file, 'r') as pt:
                lines = pt.readlines()
                num_line = len(lines)
                for index, line in enumerate(lines):
                    word, *embedding = line.strip().split()
                    embedding = [float(value) for value in embedding]
                    if len(embedding) < 100:  # may be caused by emojis / rare words
                        continue
                    if word in special_word_dict.keys():
                        tensors[special_word_dict[word]] = embedding
                    elif (is_digit.search(word) is not None or
                          is_in_limited_char_set.search(word) is not None) \
                            and not word.startswith('<'):
                        self.word_list.append(word)
                        tensors.append(embedding)
                    self.logger.d('Loading pre-trained embeddings %6d / %6d...' %
                                  (index, num_line))
            # Check if any special symbol has an empty embedding
            for i in range(4):
                if len(tensors[i]) == 0:
                    tensors[i] = [0.] * len(tensors[4])
            pickle.dump(self.word_list, open('data/pre_trained_word_list', 'wb+'))
            pickle.dump(tensors, open('data/pre_trained_embeddings', 'wb+'))
        else:
            self.logger.i('Pre-trained word list and embeddings found!')
            self.word_list = pickle.load(open('data/pre_trained_word_list', 'rb'))
            tensors = pickle.load(open('data/pre_trained_embeddings', 'rb'))
        return tensors

    def pretrained_embeddings(self):
        return self.tensors

    def test_loader(self):
        return Data.DataLoader(self.dataset, self.batch_size, False)

    def train_loader(self):
        return Data.DataLoader(self.train_dataset, self.batch_size, True)

    def valid_loader(self):
        return Data.DataLoader(self.valid_dataset, self.batch_size, False)
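# Illustrative usage (a sketch): assumes 'hw4_dataset.zip' is present and the
# data/ cache layout the class builds; 'pre_trained.txt' is a hypothetical
# GloVe-style text embedding file, not a name from the original source.
dm = DataManager(batch_size=64, max_seq=7, is_many_to_one=True,
                 pretrained_file='pre_trained.txt')
for data, label in dm.train_loader():
    break  # each batch holds <sos>-prefixed word-index sequences
emb = dm.pretrained_embeddings()  # Tensor aligned with dm.word_list (or None)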
class SentimentClassification(T.nn.Module):
    def __init__(self, embedding_path, lr=.01, momentum=.9, batch_size=50,
                 epoch=1000, use_cuda=False, verbose=1, tensorboard=False):
        super(SentimentClassification, self).__init__()
        self.embedding_path = embedding_path
        self.lr = lr
        self.momentum = momentum
        self.batch_size = batch_size
        self.epoch = epoch
        self.use_cuda = use_cuda
        self.logger = Logger(verbose)
        self.tensorboard = tensorboard
        self._build_model()
        self.loader = Loader(for_embedding=False, logger=self.logger)
        if self.tensorboard:
            from tensorboardX import SummaryWriter
            self.writer = SummaryWriter('logs')

    def __del__(self):
        if self.tensorboard:
            self.writer.close()

    def _build_model(self):
        def init_weight(m):
            m.weight.data.normal_().mul_(
                T.FloatTensor([2 / m.weight.data.size()[0]]).sqrt_())

        # Frozen pre-trained embeddings followed by a single linear layer
        embeddings = np.loadtxt(self.embedding_path)
        vocab_size, embedding_len = np.shape(embeddings)
        self.embedding = T.nn.Embedding(vocab_size, embedding_len)
        self.embedding.weight.data.copy_(T.from_numpy(embeddings))
        self.embedding.weight.requires_grad = False
        self.fc = T.nn.Linear(embedding_len, 2)
        self.fc.apply(init_weight)
        self.loss_fn = T.nn.CrossEntropyLoss()
        self.optimizer = T.optim.SGD(
            filter(lambda p: p.requires_grad, self.parameters()),
            lr=self.lr, momentum=self.momentum)

    def forward(self, inputs):
        # Average the word vectors of each document, then classify
        embeddings = [self.embedding(doc).mean(dim=0) for doc in inputs]
        embeddings = T.stack(embeddings)
        output = self.fc(embeddings)
        # output = T.nn.functional.softmax(output, dim=1)
        _, max_indice = T.max(output, dim=1)
        return output, max_indice

    def fit(self):
        self.logger.i('Start training network...', True)
        try:
            total_batch_per_epoch = len(self.loader)
            loss_history = deque(maxlen=50)
            epoch_index = 0
            for epoch_index in range(self.epoch):
                losses = 0.
                acc = 0.
                counter = 0
                self.logger.i('[ %d / %d ] epoch:' %
                              (epoch_index + 1, self.epoch), True)
                for batch_index, (docs, sentiment) in enumerate(self.loader):
                    docs = [T.autograd.Variable(doc) for doc in docs]
                    sentiment = T.autograd.Variable(sentiment)
                    if self.use_cuda:
                        docs = [doc.cuda() for doc in docs]
                        sentiment = sentiment.cuda()
                    output, predicted = self(docs)
                    acc += (sentiment.squeeze() == predicted).float().mean().data
                    loss = self.loss_fn(output, sentiment.view(-1))
                    self.optimizer.zero_grad()
                    loss.backward()
                    self.optimizer.step()
                    losses += loss.data.cpu()[0]
                    counter += 1
                    progress = min((batch_index + 1) / total_batch_per_epoch * 20., 20.)
                    self.logger.d('[%s] (%3.f%%) loss: %.4f, acc: %.4f' %
                                  ('>' * int(progress) + '-' * (20 - int(progress)),
                                   progress * 5., losses / counter, acc / counter))
                mean_loss = losses / counter
                if self.tensorboard:
                    self.writer.add_scalar('train_loss', mean_loss, epoch_index)
                    self.writer.add_scalar('train_acc', acc / counter, epoch_index)
                loss_history.append(mean_loss)
                # Early stopping: stop when the epoch loss rises above its
                # recent running mean
                if mean_loss > np.mean(loss_history):
                    self.logger.i('Early stopping...', True)
                    break
                self.logger.d('', True, False)
        except KeyboardInterrupt:
            self.logger.i('\n\nInterrupted', True)
        self.logger.i('Finish', True)
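# Usage sketch (illustrative): 'embeddings.txt' is a hypothetical path to the
# matrix that CBOW._save_embeddings() above writes with np.savetxt.
clf = SentimentClassification('embeddings.txt', lr=.01, epoch=100)
clf.fit()  # frozen embeddings + one linear layer over the mean word vector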