def main():
    """Train a ConvE model and write a model checkpoint after every epoch."""
    args = parse_args()
    setup_logger(args)

    ckpt_dir = 'checkpoint-{}'.format(args.name)
    os.makedirs(ckpt_dir, exist_ok=True)

    def _load_split(path):
        # Pickled splits are wrapped so fields are attribute-accessible.
        with open(path, 'rb') as fp:
            return AttributeDict(pickle.load(fp))

    train_data = _load_split(args.train_path)
    valid_data = _load_split(args.valid_path)

    # always use training data dictionaries
    for attr in ('e_to_index', 'index_to_e', 'r_to_index', 'index_to_r'):
        setattr(valid_data, attr, getattr(train_data, attr))

    model = ConvE(num_e=len(train_data.e_to_index),
                  num_r=len(train_data.r_to_index)).cuda()
    loss_fn = StableBCELoss()
    optimizer = optim.Adam(model.parameters(), lr=0.003)

    for epoch in trange(args.epochs):
        train(epoch, train_data, model, loss_fn, optimizer, args)
        valid(epoch, train_data, model, args.batch_size, 'train')
        valid(epoch, valid_data, model, args.batch_size, 'valid')
        ckpt_file = '{}/checkpoint_{}.model'.format(
            ckpt_dir, str(epoch + 1).zfill(2))
        with open(ckpt_file, 'wb') as fp:
            # NOTE(review): saves the whole module object, not a state_dict,
            # matching the original behavior.
            torch.save(model, fp)
def __init__(self, device: str, common_params: AttributeDict):
    """Build tokenizers, vocabularies and the model from *common_params*.

    Args:
        device: device string (e.g. 'cuda' / 'cpu'); injected into both the
            encoder and decoder parameter dicts.
        common_params: shared configuration. Its ``encoder_params`` and
            ``decoder_params`` sub-dicts are re-wrapped as AttributeDict
            in place, and their ``vocab_size`` is filled in from the built
            vocabularies when not preset.
    """
    self.device = device
    self.common_params = common_params

    # Re-wrap the nested param dicts for attribute access and store them
    # back so later consumers see the wrapped versions.
    encoder_params = AttributeDict(self.common_params.encoder_params)
    decoder_params = AttributeDict(self.common_params.decoder_params)
    self.common_params.encoder_params = encoder_params
    self.common_params.decoder_params = decoder_params
    encoder_params.device = self.device
    decoder_params.device = self.device

    self.mode = None
    self.base_dir = os.getcwd()
    self.data_set_dir = os.path.join(self.base_dir, 'dataset')

    # Tokenizer entries are classes; instantiate them here.
    self.src_tokenizer = common_params.src_tokenizer()
    self.tgt_tokenizer = common_params.tgt_tokenizer()

    self.src_vocab_file_path = os.path.join(
        self.data_set_dir, common_params.src_vocab_filename)
    self.tgt_vocab_file_path = os.path.join(
        self.data_set_dir, common_params.tgt_vocab_filename)

    # BUGFIX: the embedding filenames are optional (fetched with
    # .get(..., None)), but the original passed a possible None straight
    # into os.path.join, which raises TypeError. Resolve to None instead.
    # NOTE(review): _build_vocab must tolerate a None embedding path — confirm.
    src_emb_name = common_params.get('src_word_embedding_filename', None)
    self.src_word_embedding_file_path = (
        os.path.join(self.data_set_dir, src_emb_name)
        if src_emb_name else None)
    tgt_emb_name = common_params.get('tgt_word_embedding_filename', None)
    self.tgt_word_embedding_file_path = (
        os.path.join(self.data_set_dir, tgt_emb_name)
        if tgt_emb_name else None)

    self.src_word2id, self.src_id2word, self.src_embedding_weight = self._build_vocab(
        self.src_vocab_file_path,
        self.src_word_embedding_file_path,
    )
    if encoder_params.get('vocab_size', None) is None:
        encoder_params.vocab_size = len(self.src_word2id)

    self.tgt_word2id, self.tgt_id2word, self.tgt_embedding_weight = self._build_vocab(
        self.tgt_vocab_file_path, self.tgt_word_embedding_file_path)
    if decoder_params.get('vocab_size', None) is None:
        decoder_params.vocab_size = len(self.tgt_word2id)

    self.model: nn.Module = self._build_model(self.common_params, self.device)
def train(self, train_params: AttributeDict, loss_func, optimizer):
    """Run the training loop, then save one checkpoint for the final epoch.

    Args:
        train_params: run-specific settings; overlaid on common_params.
            Must provide n_epochs, corpus filenames, model_save_directory.
        loss_func: loss callable forwarded to _train_model.
        optimizer: optimizer whose state is also checkpointed.
    """
    # Merge common and train params
    params = AttributeDict(self.common_params.copy())
    params.update(train_params)
    self._set_mode(Estimator.Mode.TRAIN)
    encoder_params = params.encoder_params
    decoder_params = params.decoder_params
    src_corpus_file_path = os.path.join(self.data_set_dir,
                                        params.src_corpus_filename)
    tgt_corpus_file_path = os.path.join(self.data_set_dir,
                                        params.tgt_corpus_filename)
    data_loader = self._prepare_data_loader(src_corpus_file_path,
                                            tgt_corpus_file_path, params,
                                            encoder_params.max_seq_len,
                                            decoder_params.max_seq_len)
    # Pre-seed so the post-loop save is well-defined even if n_epochs == 0.
    epoch = 0
    avg_loss = 0.
    for epoch in range(params.n_epochs):
        avg_loss = self._train_model(data_loader, params, self.model,
                                     loss_func, optimizer, self.device,
                                     epoch + 1)
    # Checkpoint directory is named after the last completed epoch.
    save_dir_path = os.path.join(train_params.model_save_directory,
                                 get_checkpoint_dir_path(epoch + 1))
    if not os.path.exists(save_dir_path):
        os.makedirs(save_dir_path)
    # save checkpoint for last epoch
    torch.save(
        {
            'epoch': epoch,
            'model_state_dict': self.model.state_dict(),
            'optimizer_state_dict': optimizer.state_dict(),
            'loss': avg_loss
        }, os.path.join(save_dir_path, 'checkpoint.tar'))
def preprocess_valid(train_path, valid_path):
    """Filter a validation split to entities/relations seen during training.

    Reads the pickled training data (for its index dictionaries) and the raw
    validation file, drops any (subject, relation) pair whose subject or
    relation is unknown to the training vocabulary, filters each object list
    the same way, and writes {'x': [(s, r), ...], 'y': [objects, ...]} to
    '<valid_path without ext>.pkl'.

    Args:
        train_path: path to the pickled training data (provides e_to_index
            and r_to_index).
        valid_path: path to the raw validation file accepted by read_data().
    """
    x, y = list(), list()
    with open(train_path, 'rb') as f:
        train_data = AttributeDict(pickle.load(f))
    s_dict = read_data(valid_path)
    for s, ro in s_dict.items():
        # Skip subjects never seen during training.
        if s not in train_data.e_to_index:
            continue
        for r, objects in ro.items():
            # Skip relations never seen during training.
            if r not in train_data.r_to_index:
                continue
            # sometimes an entity only occurs as an object
            filtered_objects = [o for o in objects
                                if o in train_data.e_to_index]
            x.append((s, r))
            y.append(filtered_objects)
    data = {
        'x': x,
        'y': y,
    }
    save_file_path = os.path.splitext(valid_path)[0] + '.pkl'
    # BUGFIX: the original passed an open() handle straight to pickle.dump
    # and never closed it; use a context manager for deterministic cleanup.
    with open(save_file_path, 'wb') as f:
        pickle.dump(data, f)
def eval(self, eval_params: AttributeDict, loss_func):
    """Restore the configured checkpoint and report average loss and BLEU."""
    self._set_mode(Estimator.Mode.EVAL)

    # Overlay evaluation settings on top of the shared configuration.
    merged = AttributeDict(self.common_params.copy())
    merged.update(eval_params)
    enc_cfg = merged.encoder_params
    dec_cfg = merged.decoder_params

    # load checkpoint
    state = self._load_checkpoint(merged)
    self.model.load_state_dict(state['model_state_dict'])

    src_path = os.path.join(self.data_set_dir, merged.src_corpus_filename)
    tgt_path = os.path.join(self.data_set_dir, merged.tgt_corpus_filename)
    loader = self._prepare_data_loader(
        src_path, tgt_path, merged, enc_cfg.max_seq_len, dec_cfg.max_seq_len)

    avg_loss, bleu_score = self._eval_model(
        loader, merged, self.model, loss_func, self.device, self.tgt_id2word)
    print(f'Avg loss: {avg_loss:05.3f}, BLEU score: {bleu_score}')
from __future__ import print_function from __future__ import unicode_literals from module import GruEncoder, GruDecoder from module.transformer import Transformer from module.tokenizer import NltkTokenizer from util import AttributeDict train_params = AttributeDict({ "n_epochs": 5, "batch_size": 64, "learning_rate": 1e-4, "src_tokenizer": NltkTokenizer, "tgt_tokenizer": NltkTokenizer, "src_vocab_filename": "src_vocab.txt", "src_word_embedding_filename": "src_word_embedding.npy", "tgt_vocab_filename": "tgt_vocab.txt", "tgt_word_embedding_filename": "tgt_word_embedding.npy", "src_corpus_filename": "korean-english-park.train.ko", "tgt_corpus_filename": "korean-english-park.train.en", "encoder": Transformer, "decoder": Transformer, "model_save_directory": "kor2eng-gru-gru" }) eval_params = AttributeDict({ "batch_size": 64, "src_tokenizer": NltkTokenizer, "tgt_tokenizer": NltkTokenizer,
# Shared configuration for a Transformer-based Seq2Seq run.
# NOTE(review): encoder_params and decoder_params are identical except for
# the model class — keep them in sync when tuning.
common_params = AttributeDict({
    "model": Seq2Seq,
    # Tokenizer classes (instantiated by the estimator, not here).
    "src_tokenizer": MecabTokenizer,
    "tgt_tokenizer": MecabTokenizer,
    # Vocabulary files resolved relative to the dataset directory.
    "src_vocab_filename": "kor-mecab-fasttext",
    "tgt_vocab_filename": "eng-mecab-fasttext",
    # Pre-trained word-embedding matrices (.npy); "512d" in the name
    # presumably matches embedding_dim below — verify against the files.
    "src_word_embedding_filename": "kor-mecab-fasttext-512d.npy",
    "tgt_word_embedding_filename": "eng-mecab-fasttext-512d.npy",
    "encoder_params": {
        "model": TransformerEncoder,
        "embedding_dim": 512,
        "d_model": 512,
        "num_heads": 8,
        "num_layers": 6,
        "dim_feed_forward": 2048,
        "pe_dropout_prob": 0.1,  # dropout applied with positional encoding
        "dropout_prob": 0.1,
        "activation": "relu",
        "max_seq_len": 128,
        # vocab_size is intentionally absent: filled from the built
        # vocabulary when not preset.
    },
    "decoder_params": {
        "model": TransformerDecoder,
        "embedding_dim": 512,
        "d_model": 512,
        "num_heads": 8,
        "num_layers": 6,
        "dim_feed_forward": 2048,
        "pe_dropout_prob": 0.1,
        "dropout_prob": 0.1,
        "activation": "relu",
        "max_seq_len": 128,
    }
})