Example #1
File: train.py  Project: meshiguge/ConvE-1
def main():

    args = parse_args()
    setup_logger(args)

    checkpoint_path = 'checkpoint-{}'.format(args.name)
    os.makedirs(checkpoint_path, exist_ok=True)
    with open(args.train_path, 'rb') as f:
        train_data = AttributeDict(pickle.load(f))
    with open(args.valid_path, 'rb') as f:
        valid_data = AttributeDict(pickle.load(f))

    # always use training data dictionaries
    valid_data.e_to_index = train_data.e_to_index
    valid_data.index_to_e = train_data.index_to_e
    valid_data.r_to_index = train_data.r_to_index
    valid_data.index_to_r = train_data.index_to_r

    conv_e = ConvE(num_e=len(train_data.e_to_index),
                   num_r=len(train_data.r_to_index)).cuda()
    criterion = StableBCELoss()
    optimizer = optim.Adam(conv_e.parameters(), lr=0.003)

    for epoch in trange(args.epochs):
        train(epoch, train_data, conv_e, criterion, optimizer, args)
        valid(epoch, train_data, conv_e, args.batch_size, 'train')
        valid(epoch, valid_data, conv_e, args.batch_size, 'valid')

        with open(
                '{}/checkpoint_{}.model'.format(checkpoint_path,
                                                str(epoch + 1).zfill(2)),
                'wb') as f:
            torch.save(conv_e, f)
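
Every snippet on this page leans on the AttributeDict helper (imported from util in Example #6). A minimal sketch of what such a helper usually looks like, assuming a plain dict subclass (the project's real implementation may differ):

class AttributeDict(dict):
    """Dict whose keys can also be read and written as attributes."""

    def __getattr__(self, name):
        # attribute lookup falls back to the dict's keys
        try:
            return self[name]
        except KeyError:
            raise AttributeError(name)

    def __setattr__(self, name, value):
        # attribute assignment writes straight into the dict
        self[name] = value

Note that nested dicts are not wrapped automatically, which is why Example #2 re-wraps encoder_params and decoder_params by hand.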
Example #2
    def __init__(self, device: str, common_params: AttributeDict):
        self.device = device
        self.common_params = common_params
        encoder_params = AttributeDict(self.common_params.encoder_params)
        decoder_params = AttributeDict(self.common_params.decoder_params)
        self.common_params.encoder_params = encoder_params
        self.common_params.decoder_params = decoder_params
        encoder_params.device = self.device
        decoder_params.device = self.device
        self.mode = None

        self.base_dir = os.getcwd()
        self.data_set_dir = os.path.join(self.base_dir, 'dataset')

        self.src_tokenizer = common_params.src_tokenizer()
        self.tgt_tokenizer = common_params.tgt_tokenizer()

        self.src_vocab_file_path = os.path.join(
            self.data_set_dir, common_params.src_vocab_filename)
        self.tgt_vocab_file_path = os.path.join(
            self.data_set_dir, common_params.tgt_vocab_filename)
        # os.path.join() raises TypeError on None, so only build these paths
        # when the optional embedding filenames are actually configured
        src_embedding_filename = common_params.get(
            'src_word_embedding_filename', None)
        self.src_word_embedding_file_path = (
            os.path.join(self.data_set_dir, src_embedding_filename)
            if src_embedding_filename else None)
        tgt_embedding_filename = common_params.get(
            'tgt_word_embedding_filename', None)
        self.tgt_word_embedding_file_path = (
            os.path.join(self.data_set_dir, tgt_embedding_filename)
            if tgt_embedding_filename else None)

        self.src_word2id, self.src_id2word, self.src_embedding_weight = self._build_vocab(
            self.src_vocab_file_path,
            self.src_word_embedding_file_path,
        )
        if encoder_params.get('vocab_size', None) is None:
            encoder_params.vocab_size = len(self.src_word2id)

        self.tgt_word2id, self.tgt_id2word, self.tgt_embedding_weight = self._build_vocab(
            self.tgt_vocab_file_path, self.tgt_word_embedding_file_path)
        if decoder_params.get('vocab_size', None) is None:
            decoder_params.vocab_size = len(self.tgt_word2id)

        self.model: nn.Module = self._build_model(self.common_params,
                                                  self.device)
Example #3
    def train(self, train_params: AttributeDict, loss_func, optimizer):
        # Merge common and train params
        params = AttributeDict(self.common_params.copy())
        params.update(train_params)
        self._set_mode(Estimator.Mode.TRAIN)

        encoder_params = params.encoder_params
        decoder_params = params.decoder_params

        src_corpus_file_path = os.path.join(self.data_set_dir,
                                            params.src_corpus_filename)
        tgt_corpus_file_path = os.path.join(self.data_set_dir,
                                            params.tgt_corpus_filename)

        data_loader = self._prepare_data_loader(src_corpus_file_path,
                                                tgt_corpus_file_path, params,
                                                encoder_params.max_seq_len,
                                                decoder_params.max_seq_len)

        epoch = 0
        avg_loss = 0.
        for epoch in range(params.n_epochs):
            avg_loss = self._train_model(data_loader, params, self.model,
                                         loss_func, optimizer, self.device,
                                         epoch + 1)

        save_dir_path = os.path.join(train_params.model_save_directory,
                                     get_checkpoint_dir_path(epoch + 1))
        os.makedirs(save_dir_path, exist_ok=True)

        # save checkpoint for last epoch
        torch.save(
            {
                'epoch': epoch,
                'model_state_dict': self.model.state_dict(),
                'optimizer_state_dict': optimizer.state_dict(),
                'loss': avg_loss
            }, os.path.join(save_dir_path, 'checkpoint.tar'))
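
Restoring this checkpoint later follows the standard PyTorch pattern; a minimal sketch (the _load_checkpoint helper used in Example #5 presumably does something similar):

checkpoint = torch.load(os.path.join(save_dir_path, 'checkpoint.tar'))
model.load_state_dict(checkpoint['model_state_dict'])
optimizer.load_state_dict(checkpoint['optimizer_state_dict'])
start_epoch = checkpoint['epoch'] + 1  # resume from the next epoch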
Example #4
def preprocess_valid(train_path, valid_path):
    x, y = list(), list()
    with open(train_path, 'rb') as f:
        train_data = AttributeDict(pickle.load(f))

    s_dict = read_data(valid_path)
    for s, ro in s_dict.items():
        # skip subjects that never appear in the training data
        if s not in train_data.e_to_index:
            continue

        for r, objects in ro.items():
            # skip relations that never appear in the training data
            if r not in train_data.r_to_index:
                continue

            # sometimes an entity only occurs as an object, so filter
            # the objects against the training entity index as well
            filtered_objects = [o for o in objects
                                if o in train_data.e_to_index]

            x.append((s, r))
            y.append(filtered_objects)

    data = {
        'x': x,
        'y': y,
    }

    save_file_path = os.path.splitext(valid_path)[0] + '.pkl'
    with open(save_file_path, 'wb') as f:
        pickle.dump(data, f)
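
preprocess_valid assumes read_data returns a nested mapping of subject -> relation -> objects. A hypothetical illustration of that shape (the entity and relation names here are invented):

s_dict = {
    'barack_obama': {                        # subject entity
        'born_in': ['honolulu'],             # relation -> object entities
        'member_of': ['democratic_party'],
    },
}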
Example #5
    def eval(self, eval_params: AttributeDict, loss_func):
        self._set_mode(Estimator.Mode.EVAL)
        params = AttributeDict(self.common_params.copy())
        params.update(eval_params)
        encoder_params = params.encoder_params
        decoder_params = params.decoder_params

        # load checkpoint
        checkpoint = self._load_checkpoint(params)
        self.model.load_state_dict(checkpoint['model_state_dict'])

        src_corpus_file_path = os.path.join(self.data_set_dir,
                                            params.src_corpus_filename)
        tgt_corpus_file_path = os.path.join(self.data_set_dir,
                                            params.tgt_corpus_filename)

        data_loader = self._prepare_data_loader(src_corpus_file_path,
                                                tgt_corpus_file_path, params,
                                                encoder_params.max_seq_len,
                                                decoder_params.max_seq_len)
        avg_loss, bleu_score = self._eval_model(data_loader, params,
                                                self.model, loss_func,
                                                self.device, self.tgt_id2word)
        print(f'Avg loss: {avg_loss:05.3f}, BLEU score: {bleu_score}')
Example #6
from __future__ import print_function
from __future__ import unicode_literals

from module import GruEncoder, GruDecoder
from module.transformer import Transformer
from module.tokenizer import NltkTokenizer
from util import AttributeDict

train_params = AttributeDict({
    "n_epochs": 5,
    "batch_size": 64,
    "learning_rate": 1e-4,
    "src_tokenizer": NltkTokenizer,
    "tgt_tokenizer": NltkTokenizer,
    "src_vocab_filename": "src_vocab.txt",
    "src_word_embedding_filename": "src_word_embedding.npy",
    "tgt_vocab_filename": "tgt_vocab.txt",
    "tgt_word_embedding_filename": "tgt_word_embedding.npy",
    "src_corpus_filename": "korean-english-park.train.ko",
    "tgt_corpus_filename": "korean-english-park.train.en",
    "encoder": Transformer,
    "decoder": Transformer,
    "model_save_directory": "kor2eng-gru-gru"
})

eval_params = AttributeDict({
    "batch_size": 64,
    "src_tokenizer": NltkTokenizer,
    "tgt_tokenizer": NltkTokenizer,
    # ... (remaining keys truncated in the source snippet)
})
Example #7
common_params = AttributeDict({
    "model": Seq2Seq,
    "src_tokenizer": MecabTokenizer,
    "tgt_tokenizer": MecabTokenizer,
    "src_vocab_filename": "kor-mecab-fasttext",
    "tgt_vocab_filename": "eng-mecab-fasttext",
    "src_word_embedding_filename": "kor-mecab-fasttext-512d.npy",
    "tgt_word_embedding_filename": "eng-mecab-fasttext-512d.npy",
    "encoder_params": {
        "model": TransformerEncoder,
        "embedding_dim": 512,
        "d_model": 512,
        "num_heads": 8,
        "num_layers": 6,
        "dim_feed_forward": 2048,
        "pe_dropout_prob": 0.1,
        "dropout_prob": 0.1,
        "activation": "relu",
        "max_seq_len": 128,
    },
    "decoder_params": {
        "model": TransformerDecoder,
        "embedding_dim": 512,
        "d_model": 512,
        "num_heads": 8,
        "num_layers": 6,
        "dim_feed_forward": 2048,
        "pe_dropout_prob": 0.1,
        "dropout_prob": 0.1,
        "activation": "relu",
        "max_seq_len": 128,
    }
})
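
A hypothetical end-to-end wiring of these parameter dicts with the Estimator from Examples #2 and #3 (the loss function and the optimizer below are assumptions, not taken from the source):

import torch
from torch import nn

estimator = Estimator(device='cuda', common_params=common_params)
# build the optimizer after the Estimator so it can see the model's parameters
optimizer = torch.optim.Adam(estimator.model.parameters(),
                             lr=train_params.learning_rate)
estimator.train(train_params, loss_func=nn.CrossEntropyLoss(),
                optimizer=optimizer)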