Пример #1
0
    def __init__(self, encoder_params: AttributeDict,
                 decoder_params: AttributeDict):
        """Assemble source/target embeddings, an ``nn.Transformer`` and the
        vocabulary projection layer.

        Args:
            encoder_params: Supplies ``embedding_dim``, ``n_head``,
                ``num_encoder_layer``, ``num_decoder_layer``,
                ``dim_feedforward``, ``dropout_prob``, ``max_seq_len`` and the
                optional ``device`` (defaults to ``'cpu'``). Also handed to
                the source-side ``Embeddings``.
            decoder_params: Handed to the target-side ``Embeddings`` and
                supplies ``vocab_size`` for the output projection.
        """
        super().__init__()

        # Every architecture hyper-parameter -- the decoder depth included --
        # is read from the encoder settings.
        enc = encoder_params
        self.d_model = enc.embedding_dim
        self.n_head = enc.n_head
        self.num_encoder_layers = enc.num_encoder_layer
        self.num_decoder_layers = enc.num_decoder_layer
        self.dim_feedforward = enc.dim_feedforward
        self.dropout = enc.dropout_prob
        self.device = enc.get('device', 'cpu')
        self.max_seq_len = enc.max_seq_len

        self.src_embedding = Embeddings(params=encoder_params)
        self.tgt_embedding = Embeddings(params=decoder_params)

        # The attribute name (spelling included) is kept unchanged because
        # code outside this method may refer to it.
        transformer_kwargs = {
            'd_model': self.d_model,
            'nhead': self.n_head,
            'num_encoder_layers': self.num_encoder_layers,
            'num_decoder_layers': self.num_decoder_layers,
            'dim_feedforward': self.dim_feedforward,
            'dropout': self.dropout,
        }
        self.transfomrer = nn.Transformer(**transformer_kwargs)

        # Linear map from d_model features to one value per decoder
        # vocabulary entry.
        self.proj_vocab_layer = nn.Linear(self.d_model,
                                          decoder_params.vocab_size)

        # `_initailze` is defined outside this view; it is passed to
        # `self.apply` last, after every sub-module has been created.
        self.apply(self._initailze)
Пример #2
0
    def __init__(self, params: AttributeDict):
        """Create the token embedding, positional encoding and encoder stack.

        Args:
            params: Must provide ``d_model``, ``num_heads`` and
                ``vocab_size``. The optional keys and their fallbacks are
                listed in ``optional_defaults`` below.
        """
        super().__init__()

        # Required settings: a missing attribute fails right here.
        self.d_model = params.d_model
        self.num_heads = params.num_heads
        self.vocab_size = params.vocab_size

        # Optional settings: each becomes an attribute of the same name,
        # falling back to the default when the key is absent.
        optional_defaults = (
            ('num_layers', 6),
            ('dim_feed_forward', 2048),
            ('dropout_prob', 0.1),
            ('pe_dropout_prob', 0.1),
            ('activation', 'relu'),
            ('max_seq_len', 512),
            ('device', 'cpu'),
        )
        for key, fallback in optional_defaults:
            setattr(self, key, params.get(key, fallback))

        self.embedding = nn.Embedding(self.vocab_size, self.d_model)
        self.positional_encoding = PositionalEncoding(
            self.d_model, self.pe_dropout_prob, self.max_seq_len)

        # A single layer is built here and handed to the stack together with
        # the requested layer count.
        layer_template = TransformerEncoderLayer(
            d_model=self.d_model,
            nhead=self.num_heads,
            dim_feedforward=self.dim_feed_forward,
            dropout=self.dropout_prob,
            activation=self.activation)
        final_norm = nn.LayerNorm(self.d_model)
        self.encoder_stack = _TransformerEncoder(layer_template,
                                                 self.num_layers, final_norm)

        self._init_parameter()
Пример #3
0
def main():
    """Train a ConvE model and write one checkpoint file per epoch.

    Reads the pickled training and validation sets named by the parsed
    command-line arguments, makes the validation set use the training set's
    entity/relation dictionaries, then for every epoch runs ``train``,
    calls ``valid`` on the training data and on the validation data, and
    saves the model under ``checkpoint-<name>/``.
    """
    args = parse_args()
    setup_logger(args)

    checkpoint_dir = 'checkpoint-{}'.format(args.name)
    os.makedirs(checkpoint_dir, exist_ok=True)

    # NOTE(review): pickle.load can execute arbitrary code; these paths are
    # assumed to point at trusted, locally produced files.
    with open(args.train_path, 'rb') as train_file:
        train_data = AttributeDict(pickle.load(train_file))
    with open(args.valid_path, 'rb') as valid_file:
        valid_data = AttributeDict(pickle.load(valid_file))

    # always use training data dictionaries
    for mapping in ('e_to_index', 'index_to_e', 'r_to_index', 'index_to_r'):
        setattr(valid_data, mapping, getattr(train_data, mapping))

    num_entities = len(train_data.e_to_index)
    num_relations = len(train_data.r_to_index)
    # The model is moved to the GPU unconditionally.
    conv_e = ConvE(num_e=num_entities, num_r=num_relations).cuda()
    criterion = StableBCELoss()
    optimizer = optim.Adam(conv_e.parameters(), lr=0.003)

    for epoch in trange(args.epochs):
        train(epoch, train_data, conv_e, criterion, optimizer, args)
        valid(epoch, train_data, conv_e, args.batch_size, 'train')
        valid(epoch, valid_data, conv_e, args.batch_size, 'valid')

        # File names are one-based and zero-padded to two digits.
        epoch_tag = str(epoch + 1).zfill(2)
        model_file = '{}/checkpoint_{}.model'.format(checkpoint_dir,
                                                     epoch_tag)
        with open(model_file, 'wb') as out:
            torch.save(conv_e, out)
Пример #4
0
    def __init__(self, encoder_params: AttributeDict):
        """Create the embedding table and GRU of the encoder.

        Args:
            encoder_params: Must provide ``vocab_size``, ``embedding_dim``
                and ``hidden_size``. Optional keys: ``bidirectional``
                (False), ``num_layers`` (1), ``dropout_prob`` (0.0) and
                ``device`` ('cpu').
        """
        super().__init__()

        # Required settings.
        self.vocab_size = encoder_params.vocab_size
        self.embedding_dim = encoder_params.embedding_dim
        self.hidden_size = encoder_params.hidden_size

        # Optional settings, stored under the same name as their key.
        for key, fallback in (('bidirectional', False),
                              ('num_layers', 1),
                              ('dropout_prob', 0.0),
                              ('device', 'cpu')):
            setattr(self, key, encoder_params.get(key, fallback))

        # PAD_TOKEN_ID is registered as the padding index of the table.
        self.embedding_lookup = nn.Embedding(
            num_embeddings=self.vocab_size,
            embedding_dim=self.embedding_dim,
            padding_idx=PAD_TOKEN_ID)

        gru_kwargs = {
            'input_size': self.embedding_dim,
            'hidden_size': self.hidden_size,
            'batch_first': True,
            'num_layers': self.num_layers,
            'bidirectional': self.bidirectional,
            'dropout': self.dropout_prob,
        }
        self.rnn = nn.GRU(**gru_kwargs)
Пример #5
0
    def train(self, train_params: AttributeDict, loss_func, optimizer):
        """Train ``self.model`` for ``n_epochs`` and checkpoint the final state.

        Args:
            train_params: Training settings layered on top of a copy of
                ``self.common_params``. Read here: ``encoder_params``,
                ``decoder_params``, ``src_corpus_filename``,
                ``tgt_corpus_filename``, ``n_epochs`` and
                ``model_save_directory``.
            loss_func: Forwarded unchanged to ``self._train_model``.
            optimizer: Forwarded to ``self._train_model``; its
                ``state_dict()`` is stored in the checkpoint.
        """
        # Merge common and train params
        params = AttributeDict(self.common_params.copy())
        params.update(train_params)
        self._set_mode(Estimator.Mode.TRAIN)

        encoder_params = params.encoder_params
        decoder_params = params.decoder_params

        src_corpus_file_path = os.path.join(self.data_set_dir,
                                            params.src_corpus_filename)
        tgt_corpus_file_path = os.path.join(self.data_set_dir,
                                            params.tgt_corpus_filename)

        data_loader = self._prepare_data_loader(src_corpus_file_path,
                                                tgt_corpus_file_path, params,
                                                encoder_params.max_seq_len,
                                                decoder_params.max_seq_len)

        # Pre-initialised so the checkpoint code below still works when
        # n_epochs is 0 and the loop body never runs.
        epoch = 0
        avg_loss = 0.
        for epoch in range(params.n_epochs):
            avg_loss = self._train_model(data_loader, params, self.model,
                                         loss_func, optimizer, self.device,
                                         epoch + 1)

        save_dir_path = os.path.join(train_params.model_save_directory,
                                     get_checkpoint_dir_path(epoch + 1))
        # exist_ok avoids the check-then-create race of a separate
        # os.path.exists() test.
        os.makedirs(save_dir_path, exist_ok=True)

        # save checkpoint for last epoch
        # NOTE: 'epoch' is the zero-based index of the last epoch, while the
        # directory name above is built from epoch + 1.
        torch.save(
            {
                'epoch': epoch,
                'model_state_dict': self.model.state_dict(),
                'optimizer_state_dict': optimizer.state_dict(),
                'loss': avg_loss
            }, os.path.join(save_dir_path, 'checkpoint.tar'))
Пример #6
0
    def __init__(self, decoder_params: AttributeDict):
        """Create the embedding table, GRU and output layers of the decoder.

        Args:
            decoder_params: Must provide ``vocab_size``, ``embedding_dim``,
                ``hidden_size`` and ``max_seq_len``. Optional keys:
                ``num_layers`` (1), ``dropout_prob`` (0.0), ``device``
                ('cpu') and ``beam_size`` (1).
        """
        super().__init__()

        # Required settings.
        self.vocab_size = decoder_params.vocab_size
        self.embedding_dim = decoder_params.embedding_dim
        self.hidden_size = decoder_params.hidden_size
        self.max_seq_len = decoder_params.max_seq_len

        # Optional settings, stored under the same name as their key.
        for key, fallback in (('num_layers', 1),
                              ('dropout_prob', 0.0),
                              ('device', 'cpu'),
                              ('beam_size', 1)):
            setattr(self, key, decoder_params.get(key, fallback))

        # PAD_TOKEN_ID is registered as the padding index of the table.
        self.embedding_lookup = nn.Embedding(
            num_embeddings=self.vocab_size,
            embedding_dim=self.embedding_dim,
            padding_idx=PAD_TOKEN_ID)

        # The decoder GRU is always unidirectional.
        gru_kwargs = {
            'input_size': self.embedding_dim,
            'hidden_size': self.hidden_size,
            'batch_first': True,
            'bidirectional': False,
            'num_layers': self.num_layers,
            'dropout': self.dropout_prob,
        }
        self.rnn = nn.GRU(**gru_kwargs)

        # Hidden state -> one value per vocabulary entry; log_softmax is
        # stored as the function to apply to decoder outputs.
        self.linear_transform = nn.Linear(in_features=self.hidden_size,
                                          out_features=self.vocab_size)
        self.decoder_output_func = nn.functional.log_softmax
Пример #7
0
    def eval(self, eval_params: AttributeDict, loss_func):
        """Load a checkpoint, evaluate ``self.model`` and print the metrics.

        Args:
            eval_params: Evaluation settings layered on top of a copy of
                ``self.common_params``. Read here: ``encoder_params``,
                ``decoder_params``, ``src_corpus_filename`` and
                ``tgt_corpus_filename``; the merged settings are also passed
                to ``self._load_checkpoint``.
            loss_func: Forwarded unchanged to ``self._eval_model``.
        """
        self._set_mode(Estimator.Mode.EVAL)

        # Merge common and eval params without touching the shared original.
        merged = AttributeDict(self.common_params.copy())
        merged.update(eval_params)
        enc_cfg = merged.encoder_params
        dec_cfg = merged.decoder_params

        # load checkpoint
        state = self._load_checkpoint(merged)
        self.model.load_state_dict(state['model_state_dict'])

        # Both corpus files are looked up inside the data-set directory.
        src_path, tgt_path = (
            os.path.join(self.data_set_dir, filename)
            for filename in (merged.src_corpus_filename,
                             merged.tgt_corpus_filename))

        loader = self._prepare_data_loader(src_path, tgt_path, merged,
                                           enc_cfg.max_seq_len,
                                           dec_cfg.max_seq_len)
        metrics = self._eval_model(loader, merged, self.model, loss_func,
                                   self.device, self.tgt_id2word)
        avg_loss, bleu_score = metrics
        print(f'Avg loss: {avg_loss:05.3f}, BLEU score: {bleu_score}')
Пример #8
0
def preprocess_valid(train_path, valid_path):
    """Filter validation triples by the training vocabulary and pickle them.

    A (subject, relation) pair is kept only when both the subject and the
    relation occur in the training dictionaries. Its objects are reduced to
    those that are known training entities; the pair is kept even when that
    leaves an empty object list.

    The result ``{'x': [(s, r), ...], 'y': [[o, ...], ...]}`` is written next
    to ``valid_path`` with the extension replaced by ``.pkl``.

    Args:
        train_path: Path to the pickled training data; it must expose
            ``e_to_index`` and ``r_to_index``.
        valid_path: Path handed to ``read_data``, whose result is iterated as
            ``{subject: {relation: objects}}``.
    """
    # NOTE(security): pickle.load can execute arbitrary code; train_path is
    # assumed to be a trusted, locally produced file.
    with open(train_path, 'rb') as f:
        train_data = AttributeDict(pickle.load(f))

    known_entities = train_data.e_to_index
    known_relations = train_data.r_to_index

    x, y = [], []
    for s, ro in read_data(valid_path).items():
        if s not in known_entities:
            continue

        for r, objects in ro.items():
            if r not in known_relations:
                continue

            x.append((s, r))
            # sometimes an entity only occurs as an object
            y.append([o for o in objects if o in known_entities])

    data = {
        'x': x,
        'y': y,
    }

    save_file_path = os.path.splitext(valid_path)[0] + '.pkl'
    # The context manager closes (and flushes) the output file; the original
    # passed a bare open() to pickle.dump and never closed it.
    with open(save_file_path, 'wb') as f:
        pickle.dump(data, f)
Пример #9
0
    def __init__(self, device: str, common_params: AttributeDict):
        """Prepare tokenizers, vocabularies and the model from shared settings.

        Args:
            device: Stored on the instance, written into both the encoder and
                the decoder settings, and passed to ``self._build_model``.
            common_params: Shared settings. This object is modified in place:
                its ``encoder_params`` / ``decoder_params`` entries are
                replaced by ``AttributeDict`` wrappers, which then receive
                ``device`` and, when unset, ``vocab_size``.
        """
        self.device = device
        self.common_params = common_params

        # Wrap the nested settings and store the wrappers back, so the
        # changes below are visible through common_params as well.
        encoder_params = AttributeDict(self.common_params.encoder_params)
        decoder_params = AttributeDict(self.common_params.decoder_params)
        self.common_params.encoder_params = encoder_params
        self.common_params.decoder_params = decoder_params
        for side_params in (encoder_params, decoder_params):
            side_params.device = self.device
        self.mode = None

        self.base_dir = os.getcwd()
        self.data_set_dir = os.path.join(self.base_dir, 'dataset')

        def in_data_set(filename):
            # Resolve a file name relative to the data-set directory.
            return os.path.join(self.data_set_dir, filename)

        # The settings hold callables; calling them yields the tokenizers.
        self.src_tokenizer = common_params.src_tokenizer()
        self.tgt_tokenizer = common_params.tgt_tokenizer()

        self.src_vocab_file_path = in_data_set(
            common_params.src_vocab_filename)
        self.tgt_vocab_file_path = in_data_set(
            common_params.tgt_vocab_filename)
        # NOTE(review): if an embedding file name is absent, .get presumably
        # yields None (assuming dict-like semantics) and os.path.join would
        # then raise TypeError -- confirm callers always provide both keys.
        self.src_word_embedding_file_path = in_data_set(
            common_params.get('src_word_embedding_filename', None))
        self.tgt_word_embedding_file_path = in_data_set(
            common_params.get('tgt_word_embedding_filename', None))

        src_vocab = self._build_vocab(self.src_vocab_file_path,
                                      self.src_word_embedding_file_path)
        (self.src_word2id, self.src_id2word,
         self.src_embedding_weight) = src_vocab
        # An explicitly configured vocab_size wins over the loaded size.
        if encoder_params.get('vocab_size', None) is None:
            encoder_params.vocab_size = len(self.src_word2id)

        tgt_vocab = self._build_vocab(self.tgt_vocab_file_path,
                                      self.tgt_word_embedding_file_path)
        (self.tgt_word2id, self.tgt_id2word,
         self.tgt_embedding_weight) = tgt_vocab
        if decoder_params.get('vocab_size', None) is None:
            decoder_params.vocab_size = len(self.tgt_word2id)

        # Built last, once device and vocab_size are set in the settings.
        self.model: nn.Module = self._build_model(self.common_params,
                                                  self.device)
Пример #10
0
def check_params(config: AttributeDict):
    """Validate a training configuration.

    Checks run in a fixed order and stop at the first failure:
    ``learning_rate`` must be a float, both tokenizers must be
    ``MecabTokenizer`` or ``NltkTokenizer``, and the vocabulary, word
    embedding and corpus file names as well as ``encoder`` and ``decoder``
    must be present and not None.

    Args:
        config: Settings object; only its ``get`` method is used.

    Raises:
        AssertionError: On the first failed check. The type matches what the
            former ``assert`` statements raised, but the checks are now kept
            when Python runs with ``-O``.
    """
    def _require(condition, message):
        # Explicit raise instead of `assert`, which `python -O` strips.
        if not condition:
            raise AssertionError(message)

    tokenizers = (MecabTokenizer, NltkTokenizer)

    _require(isinstance(config.get('learning_rate'), float),
             'learning_rate should be float value.')

    for key in ('src_tokenizer', 'tgt_tokenizer'):
        _require(
            config.get(key, '') in tokenizers,
            '{} should be one of following '
            '[MecabTokenizer, NltkTokenizer]'.format(key))

    for key in ('src_vocab_filename', 'tgt_vocab_filename',
                'src_word_embedding_filename', 'tgt_word_embedding_filename',
                'src_corpus_filename', 'tgt_corpus_filename'):
        _require(config.get(key, None) is not None,
                 '{} must not be None'.format(key))

    for key in ('encoder', 'decoder'):
        _require(config.get(key, None) is not None,
                 '{} should not be None'.format(key))
Пример #11
0
def check_params(config: AttributeDict):
    """Validate an evaluation configuration.

    Checks run in a fixed order and stop at the first failure: both
    tokenizers must be ``MecabTokenizer`` or ``NltkTokenizer``, and the
    vocabulary, word embedding and corpus file names as well as ``encoder``,
    ``decoder`` and ``checkpoint_path`` must be present and not None.

    Args:
        config: Settings object; only its ``get`` method is used.

    Raises:
        AssertionError: On the first failed check. The type matches what the
            former ``assert`` statements raised, but the checks are now kept
            when Python runs with ``-O``.
    """
    def _require(condition, message):
        # Explicit raise instead of `assert`, which `python -O` strips.
        if not condition:
            raise AssertionError(message)

    tokenizers = (MecabTokenizer, NltkTokenizer)

    for key in ('src_tokenizer', 'tgt_tokenizer'):
        _require(
            config.get(key, '') in tokenizers,
            '{} should be one of following '
            '[MecabTokenizer, NltkTokenizer]'.format(key))

    for key in ('src_vocab_filename', 'tgt_vocab_filename',
                'src_word_embedding_filename', 'tgt_word_embedding_filename',
                'src_corpus_filename', 'tgt_corpus_filename'):
        _require(config.get(key, None) is not None,
                 '{} must not be None'.format(key))

    # The message now names the key that is actually checked; it used to say
    # 'model_path' for the 'checkpoint_path' check.
    for key in ('encoder', 'decoder', 'checkpoint_path'):
        _require(config.get(key, None) is not None,
                 '{} should not be None'.format(key))
Пример #12
0
from __future__ import print_function
from __future__ import unicode_literals

from module import GruEncoder, GruDecoder
from module.transformer import Transformer
from module.tokenizer import NltkTokenizer
from util import AttributeDict

# Settings for a training run: schedule, tokenizers, data-set file names and
# the model classes to use.
train_params = AttributeDict({
    # Optimisation schedule.
    "n_epochs": 5,
    "batch_size": 64,
    "learning_rate": 1e-4,
    # The tokenizer is passed as an imported name, not as an instance.
    "src_tokenizer": NltkTokenizer,
    "tgt_tokenizer": NltkTokenizer,
    # Vocabulary and word-embedding file names.
    "src_vocab_filename": "src_vocab.txt",
    "src_word_embedding_filename": "src_word_embedding.npy",
    "tgt_vocab_filename": "tgt_vocab.txt",
    "tgt_word_embedding_filename": "tgt_word_embedding.npy",
    # Parallel corpus; judging by the suffixes the source side is Korean and
    # the target side English -- TODO confirm.
    "src_corpus_filename": "korean-english-park.train.ko",
    "tgt_corpus_filename": "korean-english-park.train.en",
    # The same Transformer name is used for both sides.
    "encoder": Transformer,
    "decoder": Transformer,
    # NOTE(review): the directory name says "gru-gru" although encoder and
    # decoder are set to Transformer -- confirm this is intended.
    "model_save_directory": "kor2eng-gru-gru"
})

eval_params = AttributeDict({
    "batch_size":
    64,
    "src_tokenizer":
    NltkTokenizer,
    "tgt_tokenizer":
    NltkTokenizer,
Пример #13
0
# Settings shared between runs: top-level model, tokenizers, vocabulary and
# embedding file names, plus one nested settings dict per side.
common_params = AttributeDict({
    "model": Seq2Seq,
    # The same tokenizer name is used for source and target.
    "src_tokenizer": MecabTokenizer,
    "tgt_tokenizer": MecabTokenizer,
    "src_vocab_filename": "kor-mecab-fasttext",
    "tgt_vocab_filename": "eng-mecab-fasttext",
    # The "512d" in the file names matches embedding_dim / d_model below.
    "src_word_embedding_filename": "kor-mecab-fasttext-512d.npy",
    "tgt_word_embedding_filename": "eng-mecab-fasttext-512d.npy",
    # Encoder-side settings (a plain dict nested inside the AttributeDict).
    "encoder_params": {
        "model": TransformerEncoder,
        "embedding_dim": 512,
        "d_model": 512,
        "num_heads": 8,
        "num_layers": 6,
        "dim_feed_forward": 2048,
        "pe_dropout_prob": 0.1,
        "dropout_prob": 0.1,
        "activation": "relu",
        "max_seq_len": 128,
    },
    # Decoder-side settings; identical to the encoder's apart from "model".
    "decoder_params": {
        "model": TransformerDecoder,
        "embedding_dim": 512,
        "d_model": 512,
        "num_heads": 8,
        "num_layers": 6,
        "dim_feed_forward": 2048,
        "pe_dropout_prob": 0.1,
        "dropout_prob": 0.1,
        "activation": "relu",
        "max_seq_len": 128,
    }
})