Example #1
    def __init__(self, params):
        if not os.path.exists(params.data_dir):
            os.mkdir(params.data_dir)

        db2schema, all_schema_tokens, all_schema_tokens_sep = load_db_schema(
            os.path.join(params.raw_data_dir, params.db_schema_filename),
            params.remove_from)

        self.train_data = DatasetSplit(
            os.path.join(params.data_dir, 'train.pkl'),
            os.path.join(params.raw_data_dir, 'train.pkl'),
            db2schema)

        self.valid_data = DatasetSplit(
            os.path.join(params.data_dir, 'dev.pkl'),
            os.path.join(params.raw_data_dir, 'dev.pkl'),
            db2schema)

        all_utter_seqs = self.train_data.get_all_utterances() + self.valid_data.get_all_utterances()
        # all_query_seqs = self.train_data.get_all_queries() + self.valid_data.get_all_queries()

        sql_keywords = ['select', ')', '(', 'value', 'count', 'where', ',', '=', 'group_by', 'order_by', 'limit_value', 'desc', 'distinct', '>', 'avg', 'having', 'and', '<', 'asc', 'in', 'sum', 'max', 'except', 'not', 'intersect', 'or', 'min', 'like', '!=', 'union', 'between', '-', '+']

        # Build vocabularies
        self.schema_vocab = Vocab(all_schema_tokens_sep, data_type='schema')
        self.utter_vocab = Vocab(all_utter_seqs, data_type='utter')
        # skip_tokens = list(set(all_schema_tokens) - set(sql_keywords)) # skip column names
        # self.query_vocab = Vocab(all_query_seqs, data_type='query', skip=skip_tokens)
        self.query_vocab = Vocab([sql_keywords], data_type='query')

        self.train_data.str2index(self.schema_vocab, self.utter_vocab, self.query_vocab)
        self.valid_data.str2index(self.schema_vocab, self.utter_vocab, self.query_vocab)
Example #2
    def __init__(self, opt):
        self.vocab = Vocab(config.vocab_path, config.vocab_size)

        self.opt = opt
        self.start_id = self.vocab.word2id(config.START_DECODING)
        self.stop_id = self.vocab.word2id(config.STOP_DECODING)
        self.pad_id = self.vocab.word2id(config.PAD_TOKEN)
        self.unk_id = self.vocab.word2id(config.UNK_TOKEN)

        time.sleep(5)
Example #3
    def __init__(self, opt):
        super(CnnDm, self).__init__()
        self.opt = opt

        with open(opt['vocab_path'], 'r', encoding='utf-8') as file:
            vocab_list = [line.strip() for line in file.readlines()]
        self.vocab = Vocab.from_list(vocab_list)

        train_examples = glob.glob(opt['train_path'] + '/*.json')
        valid_examples = glob.glob(opt['valid_path'] + '/*.json')
        test_examples = glob.glob(opt['test_path'] + '/*.json')
        train_data = []
        valid_data = []
        test_data = []

        print("Loading Training Data")
        for example in tqdm(train_examples):
            with open(example, 'r', encoding='utf-8') as file:
                train_data.append(json.load(file))
        print("Loading Validation Data")
        for example in tqdm(valid_examples):
            with open(example, 'r', encoding='utf-8') as file:
                valid_data.append(json.load(file))
        print("Loading Test Data")
        for example in tqdm(test_examples):
            with open(example, 'r', encoding='utf-8') as file:
                test_data.append(json.load(file))

        print('Loading Word2Vec pretrained vectors')
        self.vectors, _ = make_embedding(
            self.vocab, 'data/word2vec/word2vec.128d.226k.bin')

        self.train_dataset = CnnDmDataset(opt, train_data, self.vocab,
                                          opt['mode'])
        self.valid_dataset = CnnDmDataset(opt, valid_data, self.vocab,
                                          opt['mode'])
        self.test_dataset = CnnDmDataset(opt, test_data, self.vocab, 't')

        self.train_loader = DataLoader(dataset=self.train_dataset,
                                       batch_size=opt['batch_size'],
                                       shuffle=True,
                                       collate_fn=self.train_dataset.collate)
        self.valid_loader = DataLoader(dataset=self.valid_dataset,
                                       batch_size=opt['batch_size'],
                                       shuffle=True,
                                       collate_fn=self.valid_dataset.collate)
        self.test_loader = DataLoader(dataset=self.test_dataset,
                                      batch_size=opt['batch_size'],
                                      shuffle=False,
                                      collate_fn=self.test_dataset.collate)
Example #4
    def train_iter(self):

        iter = self.setup_train()
        count = mle_total = 0

        data_path = self.opt.data_path

        vocab = Vocab(config.vocab_path, config.vocab_size)
        articles, summaries, articles_oovs = tools.load_data(data_path, vocab)
        assert len(articles) == len(summaries) == len(articles_oovs)

        train_data = Batcher(articles, summaries, articles_oovs)

        train_loader = DataLoader(train_data,
                                  batch_size=12,
                                  shuffle=False,
                                  pin_memory=True,
                                  collate_fn=lambda batch: collate_fn(
                                      batch, padding_value=config.PAD_TOKEN),
                                  drop_last=True)

        for input_batch, target_batch, oovs_batch in train_loader:
            if iter > config.max_iterations:
                break

            try:
                mle_loss = self.train_one_batch(input_batch, target_batch,
                                                oovs_batch)
            except KeyboardInterrupt:
                print("----- keyboard interrupt!-----")
                break

            mle_total += mle_loss
            count += 1
            iter += 1

            if iter % 2000 == 0:
                mle_avg = mle_total / count

                print('iter:', iter, 'mle_loss:', "%.3f" % mle_avg)

                count = mle_total = 0
                sys.stdout.flush()

            if iter % 5000 == 0:
                self.save_model(iter)
Example #5
            chars_padded = torch.Tensor(chars_padded).long()
            chars_padded_lens = torch.Tensor(chars_padded_lens).long()
            if self.config.is_cuda:
                chars_padded = chars_padded.cuda()
                chars_padded_lens = chars_padded_lens.cuda()

            all_chars.append((chars_padded, chars_padded_lens))

        for v in features:
            padded, _ = pad_items(prepared_batch[v], (v == 'tags'))
            prepared_batch[v] = padded

        for v in features + ['words_lens']:
            prepared_batch[v] = torch.Tensor(prepared_batch[v]).long()
            if self.config.is_cuda:
                prepared_batch[v] = prepared_batch[v].cuda()

        prepared_batch['chars'] = all_chars
        prepared_batch['raw_sentence'] = raw_sentences

        return prepared_batch


if __name__ == '__main__':
    from config import config

    vocab = Vocab(config)
    train_iter = DatasetConll2003(config.test_file, config, vocab, False)
    batch = next(iter(train_iter))

    print(batch)
Example #6
    def __init__(self, config, model_file_path):
        self.config = config
        self.vocab = Vocab(config)
        self.model = get_model(self.vocab, config, model_file_path)
Example #7
from data_utils.vocab import Vocab
from data_utils import config
from data_utils.batcher import Batcher, collate_fn
from data_utils import tools  # assumed location of the load_data helper used below
import torch
from torch.utils.data import DataLoader
import logging
import torch.distributed as dist
import argparse
logging.basicConfig(level=logging.INFO)

if __name__ == '__main__':

    data_path = '/home/yuf/pointer_generator_distributed/data/nbc.com.txt'
    print('data_path is: {}'.format(data_path))

    vocab = Vocab(config.vocab_path, config.vocab_size)
    articles, summaries, articles_oovs = tools.load_data(data_path, vocab)
    assert len(articles) == len(summaries) == len(articles_oovs)

    print('total article and summary length: {0}, {1}'.format(
        len(articles), len(summaries)))

    train_data = Batcher(articles, summaries, articles_oovs)

    train_loader = DataLoader(
        train_data,
        batch_size=12,
        shuffle=False,
        pin_memory=True,
        collate_fn=lambda batch: collate_fn(batch, padding_value=0),
        drop_last=True)
Example #8
import torch
from data_utils import config
import numpy as np
from data_utils.vocab import Vocab

# Build the vocabulary once and reuse it for all special-token lookups.
_vocab = Vocab(config.vocab_path, config.vocab_size)
start_id = _vocab.word2id(config.START_DECODING)
stop_id = _vocab.word2id(config.STOP_DECODING)
pad_id = _vocab.word2id(config.PAD_TOKEN)


def get_cuda(tensor):

    if torch.cuda.is_available():
        tensor = tensor.cuda()
    return tensor


def get_enc_data(input_batch, oovs_batch):
        """
        input_batch: [batch_size, max_seq_len]. 按照batch中最长的部分进行填充过后batch。 list of list

        return:

        enc_batch: [batch_size, max_seq_len]: 将input_batch转换为torch.tensor的形式。

        enc_lens: [batch_size]. 存储batch中每个example的真实的长度,而不是填充过后的长度。

        enc_padding_mask: [batch_size, max_seq_len]. 输入的mask情况,1代表没有填充,0代表填充。

        ct_e: [batch_size, 2 * hidden_dim].  用来存储encoder的hidden state。
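
The body of get_enc_data is not included in this example. Based on the shapes documented in its docstring, the following is a minimal sketch of what it likely computes. pad_id and get_cuda come from this example; get_enc_data_sketch is a hypothetical name, and config.hidden_dim and the exact padding convention are assumptions rather than the project's confirmed implementation.

def get_enc_data_sketch(input_batch, oovs_batch):
    # input_batch is already padded to the longest example in the batch.
    enc_batch = get_cuda(torch.LongTensor(input_batch))      # [batch_size, max_seq_len]
    enc_padding_mask = (enc_batch != pad_id).float()         # 1 = real token, 0 = padding
    enc_lens = enc_padding_mask.sum(dim=1).long()            # true length of each example
    # Initial context vector over the encoder states (assumes config.hidden_dim exists).
    ct_e = get_cuda(torch.zeros(len(input_batch), 2 * config.hidden_dim))
    # Room in the output distribution for each example's in-article OOV words.
    max_art_oovs = max(len(oovs) for oovs in oovs_batch)
    extra_zeros = get_cuda(torch.zeros(len(input_batch), max_art_oovs))
    return enc_batch, enc_lens, enc_padding_mask, ct_e, extra_zeros
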
Example #9
class Train(object):
    def __init__(self, opt):
        self.vocab = Vocab(config.vocab_path, config.vocab_size)

        self.opt = opt
        self.start_id = self.vocab.word2id(config.START_DECODING)
        self.stop_id = self.vocab.word2id(config.STOP_DECODING)
        self.pad_id = self.vocab.word2id(config.PAD_TOKEN)
        self.unk_id = self.vocab.word2id(config.UNK_TOKEN)

        time.sleep(5)

    def save_model(self, iter):

        saved_path = config.saved_model_path + "%07d.tar" % iter

        torch.save({
            'iter': iter,
            'model_dict': self.model.state_dict(),
            'trainer_dict': self.trainer.state_dict()
        }, saved_path)

    def setup_train(self):

        self.model = Model()
        self.trainer = torch.optim.Adam(self.model.parameters(), lr=config.lr)

        start_iter = 0

        if self.opt.load_model is not None:
            load_model_path = os.path.join(config.saved_model_path,
                                           self.opt.load_model)
            checkpoint = torch.load(load_model_path)

            start_iter = checkpoint['iter']
            self.model.load_state_dict(checkpoint['model_dict'])
            self.trainer.load_state_dict(checkpoint['trainer_dict'])

            logging.debug('load model at:' + load_model_path)

        if self.opt.new_lr is not None:
            self.trainer = torch.optim.Adam(self.model.parameters(),
                                            lr=self.opt.new_lr)

            logging.debug('update the lr to: {}'.format(self.opt.new_lr))

        return start_iter

    def train_MLE(self, input_batch, target_batch, oovs_batch):
        '''
        Inputs:
        input_batch, oovs_batch, target_batch:
            The batches returned by the dataloader; see batcher.collate_fn for the exact contents.
        '''
        '''
        enc_batch: [batch_size, max_enc_len]
        enc_lens: [batch_size]
        enc_padding_mask: [batch_size, max_enc_len]. 0 marks padding.
        ct_e: [batch_size, 2 * hidden_dim]
        extra_zeros: [batch_size, max_art_oovs]
        '''
        enc_batch, enc_lens, enc_padding_mask, ct_e, extra_zeros = get_enc_data(
            input_batch, oovs_batch)
        enc_batch = self.model.embeds(enc_batch)
        '''
        enc_out: [batch_size, max_seq_len, 2 * hidden_size]
        enc_hidden: (h, c). [batch_size, hidden_dim]
        '''
        enc_out, enc_hidden = self.model.encoder(enc_batch, enc_lens)
        '''
        dec_batch: [batch_size, max_dec_len]
        max_dec_len: int
        dec_lens: [batch_size]
        target_batch: [batch_size, max_dec_len]
        '''
        dec_batch, max_dec_len, dec_lens, target_batch = get_dec_data(
            target_batch)

        step_losses = []

        h_t = (enc_hidden[0], enc_hidden[1])

        x_t = torch.LongTensor(len(enc_out)).fill_(self.start_id)

        prev_s = None
        sum_temporal_srcs = None

        for t in range(min(max_dec_len, config.max_dec_steps)):
            # use_ground_truth: [batch_size]. 0/1
            use_ground_truth = (torch.rand(len(enc_out)) > 0.25).long()
            x_t = use_ground_truth * dec_batch[:, t] + (1 -
                                                        use_ground_truth) * x_t

            x_t = self.model.embeds(x_t)
            '''
            final_dist:        [batch_size, config.vocab_size + batch.max_art_oovs]   computed at this step
            h_t: (h, c):       [batch_size, hidden_dim]        output of the LSTMCell at this step
            ct_e:              [batch_size, 2 * hidden_dim]    from encoder_attention: the context obtained by attending over the encoder at the current time step
            sum_temporal_srcs: [batch_size, max_seq_len]       from encoder_attention
            prev_s:            [batch_size, t, hidden_dim]     from decoder_attention
            '''
            final_dist, h_t, ct_e, sum_temporal_srcs, prev_s = self.model.decoder(
                x_t, h_t, enc_out, enc_padding_mask, ct_e, extra_zeros,
                enc_batch, sum_temporal_srcs, prev_s)
            target = target_batch[:, t]
            log_probs = torch.log(final_dist + config.eps)
            nll_loss = F.nll_loss(log_probs,
                                  target,
                                  reduction='none',
                                  ignore_index=self.pad_id)

            step_losses.append(nll_loss)

            # final_dist: [batch_size, config.vocab_size + max_art_oovs]
            # --> x_t: [batch_size, 1]
            # --> x_t: [batch_size]
            x_t = torch.multinomial(final_dist, 1).squeeze()
            is_oovs = (x_t >= config.vocab_size).long()
            x_t = (1 - is_oovs) * x_t.detach() + is_oovs * self.unk_id

        # step_losses: list of [batch_size] tensors, one per decoding step
        # torch.stack(dim=1) --> [batch_size, num_steps]
        # torch.sum(dim=1)   --> [batch_size]
        losses = torch.sum(torch.stack(step_losses, dim=1), dim=1)

        batch_avg_loss = losses / dec_lens
        mle_loss = torch.mean(batch_avg_loss)

        return mle_loss

    def train_one_batch(self, input_batch, target_batch, oovs_batch):

        mle_loss = self.train_MLE(input_batch, target_batch, oovs_batch)

        self.trainer.zero_grad()
        mle_loss.backward()
        self.trainer.step()

        return mle_loss.item()

    def train_iter(self):

        iter = self.setup_train()
        count = mle_total = 0

        data_path = self.opt.data_path

        vocab = Vocab(config.vocab_path, config.vocab_size)
        articles, summaries, articles_oovs = tools.load_data(data_path, vocab)
        assert len(articles) == len(summaries) == len(articles_oovs)

        train_data = Batcher(articles, summaries, articles_oovs)

        train_loader = DataLoader(train_data,
                                  batch_size=12,
                                  shuffle=False,
                                  pin_memory=True,
                                  collate_fn=lambda batch: collate_fn(
                                      batch, padding_value=config.PAD_TOKEN),
                                  drop_last=True)

        for input_batch, target_batch, oovs_batch in train_loader:
            if iter > config.max_iterations:
                break

            try:
                mle_loss = self.train_one_batch(input_batch, target_batch,
                                                oovs_batch)
            except KeyboardInterrupt:
                print("----- keyboard interrupt!-----")
                break

            mle_total += mle_loss
            count += 1
            iter += 1

            if iter % 2000 == 0:
                mle_avg = mle_total / count

                print('iter:', iter, 'mle_loss:', "%.3f" % mle_avg)

                count = mle_total = 0
                sys.stdout.flush()

            if iter % 5000 == 0:
                self.save_model(iter)