def test_fit_with_replace_mincount_min_count_ngram(fname):
    dictionary = SupervisedDictionary(replace_OOV_word=True,
                                      min_count=3,
                                      replace_word="<UNK>",
                                      size_word_n_gram=2,
                                      word_n_gram_min_count=1,
                                      label_separator="\t",
                                      line_break_word="")
    dictionary.fit(fname)

    # word vocab related test
    assert len(dictionary.word_vocab) == 5  # c d e f <UNK>
    assert dictionary.size_word_vocab == 5
    assert dictionary.num_words == np.sum(np.arange(7))

    # n-gram related test
    assert len(dictionary.ngram_vocab) == 5  # c-c d-d e-e f-f <UNK>-<UNK>
    assert np.sum(dictionary.ngram_vocab.id2freq) == 2 + 3 + 4 + 5 + 1

    # label related test
    assert len(dictionary.label_vocab) == 6

    assert dictionary.size_total_vocab == 10
def test_fit_without_replacement_with_mincount(fname):
    dictionary = SupervisedDictionary(replace_OOV_word=False,
                                      min_count=1,
                                      replace_word="<UNK>",
                                      size_word_n_gram=2,
                                      word_n_gram_min_count=2,
                                      label_separator="\t",
                                      line_break_word="")
    dictionary.fit(fname)

    # word vocab related test
    assert len(dictionary.word_vocab) == 6  # a b c d e f
    assert dictionary.size_word_vocab == 6
    assert dictionary.num_words == np.sum(np.arange(7))

    # n-gram related test
    assert len(dictionary.ngram_vocab) == 4  # c-c d-d e-e f-f
    assert np.sum(dictionary.ngram_vocab.id2freq) == np.sum(np.arange(6)) - 1

    # label related test
    assert len(dictionary.label_vocab) == 6

    assert dictionary.size_total_vocab == 10
def test_fit_with_replace_mincount(fname):
    dictionary = SupervisedDictionary(replace_OOV_word=True,
                                      min_count=3,
                                      replace_word="<UNK>",
                                      size_word_n_gram=1,
                                      word_n_gram_min_count=1,
                                      label_separator="\t",
                                      line_break_word="</s>")
    dictionary.fit(fname)

    # word vocab related test
    assert len(dictionary.word_vocab) == 6  # <UNK> c d e f </s>
    assert dictionary.size_word_vocab == 6
    assert dictionary.num_words == np.sum(np.arange(7)) + 6
    assert dictionary.size_total_vocab == 6

    # n-gram related test
    assert len(dictionary.ngram_vocab) == 0

    # label related test
    assert len(dictionary.label_vocab) == 6
def test_without_ngram(fname):
    dictionary = SupervisedDictionary(replace_OOV_word=False,
                                      min_count=2,
                                      replace_word="",
                                      size_word_n_gram=1,
                                      word_n_gram_min_count=1,
                                      label_separator="\t",
                                      line_break_word="")
    dictionary.fit(fname)
    X, y = dictionary.transform(fname)

    assert len(X[0]) == 0
    assert y[0] == 0
    np.testing.assert_array_equal(X[-1], np.zeros(6, dtype=np.int64))

    recovered_sentence = dictionary.recover_sentence_from_ids(X[1])
    assert recovered_sentence == ["b", "b"]
def test_with_ngram(fname):
    dictionary = SupervisedDictionary(replace_OOV_word=True,
                                      min_count=3,
                                      replace_word="<UNK>",
                                      size_word_n_gram=2,
                                      word_n_gram_min_count=1,
                                      label_separator="\t",
                                      line_break_word="")
    dictionary.fit(fname)
    X, y = dictionary.transform(fname)

    assert len(X[1]) == 3  # <UNK>-<UNK> <UNK> <UNK>
    recovered_sentence = dictionary.recover_sentence_from_ids(X[1])
    assert recovered_sentence == ["<UNK>", "<UNK>", "<UNK>-<UNK>"]

    recovered_sentence = dictionary.recover_sentence_from_ids(X[3])
    assert recovered_sentence == ["d", "d", "d", "d", "d-d", "d-d", "d-d"]

    dictionary = SupervisedDictionary(replace_OOV_word=True,
                                      min_count=3,
                                      replace_word="<UNK>",
                                      size_word_n_gram=2,
                                      word_n_gram_min_count=3,
                                      label_separator="\t",
                                      line_break_word="")
    dictionary.fit(fname)
    X, y = dictionary.transform(fname)
    print(X)

    assert len(X[1]) == 2  # <UNK> <UNK>
    recovered_sentence = dictionary.recover_sentence_from_ids(X[1])
    assert recovered_sentence == ["<UNK>", "<UNK>"]

    recovered_sentence = dictionary.recover_sentence_from_ids(X[2])
    assert recovered_sentence == ["c", "c", "c"]
def test_predefined_vocab(fname):
    # min_count == 1
    dictionary = SupervisedDictionary(replace_OOV_word=False,
                                      min_count=1,
                                      replace_word="",
                                      size_word_n_gram=1,
                                      word_n_gram_min_count=1,
                                      label_separator="\t",
                                      line_break_word="")
    dictionary.fit(fname)
    dictionary.update_vocab_from_word_set(PREDEFINED_VOCAB)

    # word vocab related test
    assert len(dictionary.word_vocab) == 3  # a b c
    assert dictionary.size_word_vocab == 3
    assert dictionary.num_words == 1 + 2 + 3
    assert dictionary.size_total_vocab == 3

    # n-gram related test
    assert len(dictionary.ngram_vocab) == 0

    # label related test
    assert len(dictionary.label_vocab) == 6

    dictionary = SupervisedDictionary(replace_OOV_word=True,
                                      min_count=1,
                                      replace_word="<UNK>",
                                      size_word_n_gram=1,
                                      word_n_gram_min_count=1,
                                      label_separator="\t",
                                      line_break_word="")
    dictionary.fit(fname)
    dictionary.update_vocab_from_word_set(PREDEFINED_VOCAB)

    # word vocab related test
    assert len(dictionary.word_vocab) == 4  # a b c <UNK>
    assert dictionary.size_word_vocab == 4
    assert dictionary.num_words == np.sum(np.arange(7))
    assert dictionary.size_total_vocab == 4

    # n-gram related test
    assert len(dictionary.ngram_vocab) == 0

    # label related test
    assert len(dictionary.label_vocab) == 6

    # min_count == 2
    dictionary = SupervisedDictionary(replace_OOV_word=False,
                                      min_count=2,
                                      replace_word="",
                                      size_word_n_gram=1,
                                      word_n_gram_min_count=1,
                                      label_separator="\t",
                                      line_break_word="")
    dictionary.fit(fname)
    dictionary.update_vocab_from_word_set(PREDEFINED_VOCAB)

    # word vocab related test
    assert len(dictionary.word_vocab) == 2  # b c
    assert dictionary.size_word_vocab == 2
    assert dictionary.num_words == 2 + 3
    assert dictionary.size_total_vocab == 2

    # n-gram related test
    assert len(dictionary.ngram_vocab) == 0

    # label related test
    assert len(dictionary.label_vocab) == 6

    dictionary = SupervisedDictionary(replace_OOV_word=True,
                                      min_count=2,
                                      replace_word="<UNK>",
                                      size_word_n_gram=1,
                                      word_n_gram_min_count=1,
                                      label_separator="\t",
                                      line_break_word="")
    dictionary.fit(fname)
    print(dictionary.word_vocab.id2word)
    print(dictionary.word_vocab.word2id)
    dictionary.update_vocab_from_word_set(PREDEFINED_VOCAB)
    print(dictionary.word_vocab.id2word)
    print(dictionary.word_vocab.word2id)

    # word vocab related test
    assert len(dictionary.word_vocab) == 3  # b c <UNK>
    assert dictionary.size_word_vocab == 3
    assert dictionary.num_words == np.sum(np.arange(7))
    assert dictionary.size_total_vocab == 3

    # n-gram related test
    assert len(dictionary.ngram_vocab) == 0

    # label related test
    assert len(dictionary.label_vocab) == 6
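# Usage sketch (illustrative, not called by the tests): the end-to-end
# SupervisedDictionary workflow exercised above -- fit a vocabulary, optionally
# restrict it to a pre-defined word set, transform a labelled file into id
# sequences, and map ids back to tokens. The helper name and its arguments are
# assumptions; the API calls mirror the tests.
def _example_dictionary_usage(path, predefined_vocab=None):
    dictionary = SupervisedDictionary(replace_OOV_word=True,
                                      min_count=3,
                                      replace_word="<UNK>",
                                      size_word_n_gram=2,
                                      word_n_gram_min_count=1,
                                      label_separator="\t",
                                      line_break_word="")
    dictionary.fit(path)  # build word / n-gram / label vocabularies from the labelled file
    if predefined_vocab is not None:
        # keep only words that also appear in the given word set (plus <UNK> if enabled)
        dictionary.update_vocab_from_word_set(predefined_vocab)
    X, y = dictionary.transform(path)  # id sequences and label ids, one entry per line
    tokens = dictionary.recover_sentence_from_ids(X[0])  # map ids back to tokens / n-grams
    return X, y, tokens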
class Objective(object):
    def __init__(self, hydra_cfg, logger):
        self.logger = logger
        self.hydra_cfg = hydra_cfg
        self.seed = hydra_cfg['parameters']['seed']
        self.metric = hydra_cfg['parameters']['metric']
        self.device = torch.device(
            'cuda:{}'.format(hydra_cfg['parameters']['gpu_id'])
            if torch.cuda.is_available() else 'cpu')

        working_dir = utils.get_original_cwd() + '/'
        training_path = working_dir + hydra_cfg['dataset']['path'] + hydra_cfg['dataset']['train_fname']
        is_replaced_OOV = hydra_cfg['parameters']['replace_OOV'] > 0

        # load embeddings
        pretrained_path = hydra_cfg['parameters']['pre_trained']
        pretrained_vocab = {}
        if pretrained_path:
            pretrained_path = working_dir + hydra_cfg['parameters']['pre_trained']
            self.logger.info('Loading pre-trained word embeddings {}\n'.format(pretrained_path))
            pretrained_w2v = KeyedVectors.load_word2vec_format(fname=pretrained_path)
            pretrained_vocab = set(pretrained_w2v.vocab.keys())
            # pre-trained vectors only cover single words, so n-grams are not supported here
            assert hydra_cfg['parameters']['ngram'] == 1

        self.dictionary = SupervisedDictionary(
            replace_OOV_word=is_replaced_OOV,
            min_count=hydra_cfg['parameters']['min_count'],
            replace_word='<OOV>',
            size_word_n_gram=hydra_cfg['parameters']['ngram'],
            word_n_gram_min_count=hydra_cfg['parameters']['word_n_gram_min_count'],
            label_separator=hydra_cfg['parameters']['label_separator'],
            line_break_word='')

        self.logger.info('Use {}\n'.format(self.device))

        self.dictionary.fit(training_path)

        if pretrained_vocab:
            self.dictionary.update_vocab_from_word_set(pretrained_vocab)

        self.train_set, self.val_set = get_datasets(
            cfg=hydra_cfg,
            dictionary=self.dictionary,
            working_dir=working_dir,
            training_path=training_path,
            include_test=False)

        pretrained_word_vectors = None
        dim = self.hydra_cfg['parameters']['dim']
        self.pooling = self.hydra_cfg['parameters']['pooling']
        OOV_initialized_method = self.hydra_cfg['parameters']['initialize_oov']
        self.is_freeze = self.hydra_cfg['parameters']['freeze'] > 0

        # initialise the embedding matrix from the pre-trained vectors when they were loaded
        if pretrained_vocab:
            pretrained_word_vectors = initialise_word_embeddigns_from_pretrained_embeddings(
                pretrained_w2v,
                self.dictionary,
                OOV_initialized_method,
                rnd=np.random.RandomState(self.seed))
            dim = pretrained_word_vectors.shape[1]

        self.pretrained_word_vectors = pretrained_word_vectors
        self.dim = dim

        self.logger.info('#training_data: {}, #val_data: {}\n'.format(
            len(self.train_set), len(self.val_set)))
        self.logger.info(
            'In training data, the size of word vocab: {} ngram vocab: {}, total: {}\n'.format(
                self.dictionary.size_word_vocab,
                self.dictionary.size_ngram_vocab,
                self.dictionary.size_total_vocab))

    def __call__(self, trial: optuna.Trial):
        torch.manual_seed(self.seed)
        random.seed(self.seed)

        train_data_loader, val_data_loader = datasets2data_loaders(
            self.train_set, self.val_set, test_set=None, num_workers=1)

        epochs = self.hydra_cfg['parameters']['epochs']

        # Calculate an objective value by using the extra arguments.
        model = SupervisedFastText(
            V=self.dictionary.size_total_vocab,
            num_classes=len(self.dictionary.label_vocab),
            embedding_dim=self.dim,
            pretrained_emb=self.pretrained_word_vectors,
            freeze=self.is_freeze,
            pooling=self.pooling).to(self.device)

        initial_lr = trial.suggest_loguniform(
            'lr',
            self.hydra_cfg['optuna']['lr_min'],
            self.hydra_cfg['optuna']['lr_max'])

        optimizer = optim.SGD(model.parameters(), lr=initial_lr)

        # parameters for updating the learning rate
        num_tokens = self.dictionary.num_words
        learning_rate_schedule = self.hydra_cfg['parameters']['lr_update_rate']
        total_num_processed_tokens_in_training = epochs * num_tokens
        num_processed_tokens = 0
        local_processed_tokens = 0
        N = len(train_data_loader.dataset)

        best_val_loss = np.finfo(0.).max
        best_val_acc = np.finfo(0.).min
        save_fname = os.getcwd() + '/' + '{}.pt'.format(trial.number)  # file name to store best model's weights

        for epoch in range(epochs):
            # begin training phase
            sum_loss = 0.
            correct = 0
            model.train()

            for sentence, label, n_tokens in train_data_loader:
                sentence, label = sentence.to(self.device), label.to(self.device)
                optimizer.zero_grad()
                output = model(sentence)
                loss = F.nll_loss(output, label)
                loss.backward()
                optimizer.step()
                pred = output.argmax(1, keepdim=False)
                correct += pred.eq(label).sum().item()
                sum_loss += loss.item()

                # update learning rate
                # ref: https://github.com/facebookresearch/fastText/blob/6d7c77cd33b23eec26198fdfe10419476b5364c7/src/fasttext.cc#L656
                local_processed_tokens += n_tokens.item()
                if local_processed_tokens > learning_rate_schedule:
                    num_processed_tokens += local_processed_tokens
                    local_processed_tokens = 0
                    progress = num_processed_tokens / total_num_processed_tokens_in_training
                    optimizer.param_groups[0]['lr'] = initial_lr * (1. - progress)

            train_loss = sum_loss / N
            train_acc = correct / N
            # end training phase

            val_loss, val_acc = evaluation(model, self.device, val_data_loader)

            progress = num_processed_tokens / total_num_processed_tokens_in_training  # approximated progress
            self.logger.info(
                '\rProgress: {:.1f}% Avg. train loss: {:.4f}, train acc: {:.1f}%, '
                'Avg. val loss: {:.4f}, val acc: {:.1f}%'.format(
                    progress * 100., train_loss, train_acc * 100, val_loss, val_acc * 100))

            if self.metric == 'loss':
                trial.report(val_loss, epoch)
            else:
                trial.report(val_acc, epoch)

            if trial.should_prune():
                raise optuna.exceptions.TrialPruned()

            # validation
            is_saved_model = False
            if self.metric == 'loss':
                if best_val_loss > val_loss:
                    best_val_loss = val_loss
                    best_val_acc = val_acc
                    is_saved_model = True
            else:
                if best_val_acc < val_acc:
                    best_val_loss = val_loss
                    best_val_acc = val_acc
                    is_saved_model = True

            if is_saved_model:
                torch.save(model.state_dict(), save_fname)

        trial.set_user_attr('val_loss', best_val_loss)
        trial.set_user_attr('val_acc', best_val_acc)
        trial.set_user_attr('model_path', save_fname)

        if self.metric == 'loss':
            return best_val_loss
        else:
            return best_val_acc
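# Usage sketch (illustrative, not from the original file): how an Objective
# instance is typically handed to an Optuna study. The helper name and the
# default n_trials are assumptions; the study direction follows the configured
# metric, mirroring the value returned by __call__ above.
def run_study(hydra_cfg, logger, n_trials=20):
    direction = 'minimize' if hydra_cfg['parameters']['metric'] == 'loss' else 'maximize'
    study = optuna.create_study(direction=direction)
    study.optimize(Objective(hydra_cfg, logger), n_trials=n_trials)
    return study.best_trial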