def __init__(self, params):
    if not os.path.exists(params.data_dir):
        os.mkdir(params.data_dir)
    # Load the database schemas plus the flattened schema token sequences.
    db2schema, all_schema_tokens, all_schema_tokens_sep = load_db_schema(
        os.path.join(params.raw_data_dir, params.db_schema_filename),
        params.remove_from)
    self.train_data = DatasetSplit(
        os.path.join(params.data_dir, 'train.pkl'),
        os.path.join(params.raw_data_dir, 'train.pkl'),
        db2schema)
    self.valid_data = DatasetSplit(
        os.path.join(params.data_dir, 'dev.pkl'),
        os.path.join(params.raw_data_dir, 'dev.pkl'),
        db2schema)
    all_utter_seqs = (self.train_data.get_all_utterances()
                      + self.valid_data.get_all_utterances())
    # all_query_seqs = self.train_data.get_all_queries() + self.valid_data.get_all_queries()
    sql_keywords = ['select', ')', '(', 'value', 'count', 'where', ',', '=',
                    'group_by', 'order_by', 'limit_value', 'desc', 'distinct',
                    '>', 'avg', 'having', 'and', '<', 'asc', 'in', 'sum',
                    'max', 'except', 'not', 'intersect', 'or', 'min', 'like',
                    '!=', 'union', 'between', '-', '+']

    # Build vocabularies
    self.schema_vocab = Vocab(all_schema_tokens_sep, data_type='schema')
    self.utter_vocab = Vocab(all_utter_seqs, data_type='utter')
    # skip_tokens = list(set(all_schema_tokens) - set(sql_keywords))  # skip column names
    # self.query_vocab = Vocab(all_query_seqs, data_type='query', skip=skip_tokens)
    self.query_vocab = Vocab([sql_keywords], data_type='query')

    # Convert each split from token strings to vocabulary indices.
    self.train_data.str2index(self.schema_vocab, self.utter_vocab,
                              self.query_vocab)
    self.valid_data.str2index(self.schema_vocab, self.utter_vocab,
                              self.query_vocab)
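# For context, a minimal sketch of the Vocab interface the code above relies
# on (hypothetical; the real class may add frequency cutoffs and the `skip`
# argument referenced in the commented-out lines): it collects tokens from
# the given sequences and exposes the token <-> id lookups that str2index
# needs.
class VocabSketch:
    def __init__(self, seqs, data_type, specials=('<pad>', '<unk>')):
        self.data_type = data_type
        self.id2token = list(specials)
        seen = set(specials)
        for seq in seqs:
            for tok in seq:
                if tok not in seen:
                    seen.add(tok)
                    self.id2token.append(tok)
        self.token2id = {t: i for i, t in enumerate(self.id2token)}

    def word2id(self, tok):
        return self.token2id.get(tok, self.token2id['<unk>'])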
def __init__(self, opt):
    super(CnnDm, self).__init__()
    self.opt = opt

    with open(opt['vocab_path'], 'r', encoding='utf-8') as file:
        vocab_list = [line.strip() for line in file.readlines()]
    self.vocab = Vocab.from_list(vocab_list)

    train_examples = glob.glob(opt['train_path'] + '/*.json')
    valid_examples = glob.glob(opt['valid_path'] + '/*.json')
    test_examples = glob.glob(opt['test_path'] + '/*.json')

    train_data = []
    valid_data = []
    test_data = []
    print("Loading Training Data")
    for example in tqdm(train_examples):
        with open(example, 'r', encoding='utf-8') as file:
            train_data.append(json.load(file))
    print("Loading Validation Data")
    for example in tqdm(valid_examples):
        with open(example, 'r', encoding='utf-8') as file:
            valid_data.append(json.load(file))
    print("Loading Test Data")
    for example in tqdm(test_examples):
        with open(example, 'r', encoding='utf-8') as file:
            test_data.append(json.load(file))

    print('Loading Word2Vec pretrained vectors')
    self.vectors, _ = make_embedding(
        self.vocab, 'data/word2vec/word2vec.128d.226k.bin')

    self.train_dataset = CnnDmDataset(opt, train_data, self.vocab, opt['mode'])
    self.valid_dataset = CnnDmDataset(opt, valid_data, self.vocab, opt['mode'])
    self.test_dataset = CnnDmDataset(opt, test_data, self.vocab, 't')
    self.train_loader = DataLoader(dataset=self.train_dataset,
                                   batch_size=opt['batch_size'],
                                   shuffle=True,
                                   collate_fn=self.train_dataset.collate)
    self.valid_loader = DataLoader(dataset=self.valid_dataset,
                                   batch_size=opt['batch_size'],
                                   shuffle=True,
                                   collate_fn=self.valid_dataset.collate)
    self.test_loader = DataLoader(dataset=self.test_dataset,
                                  batch_size=opt['batch_size'],
                                  shuffle=False,
                                  collate_fn=self.test_dataset.collate)
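# Usage sketch, assuming only the option keys read in __init__ above; the
# paths and the 'train' mode value are placeholders, not project defaults.
if __name__ == '__main__':
    opt = {'vocab_path': 'data/vocab.txt',
           'train_path': 'data/train', 'valid_path': 'data/valid',
           'test_path': 'data/test', 'mode': 'train', 'batch_size': 32}
    data = CnnDm(opt)
    batch = next(iter(data.train_loader))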
    # Tail of a batch-preparation method: pad the character ids for one
    # sentence and collect them (runs once per sentence in the batch).
    chars_padded = torch.Tensor(chars_padded).long()
    chars_padded_lens = torch.Tensor(chars_padded_lens).long()
    if self.config.is_cuda:
        chars_padded = chars_padded.cuda()
        chars_padded_lens = chars_padded_lens.cuda()
    all_chars.append((chars_padded, chars_padded_lens))

    # Pad each word-level feature ('tags' gets tag-specific padding), then
    # convert everything to LongTensors.
    for v in features:
        padded, _ = pad_items(prepared_batch[v], (v == 'tags'))
        prepared_batch[v] = padded
    for v in features + ['words_lens']:
        prepared_batch[v] = torch.Tensor(prepared_batch[v]).long()
        if self.config.is_cuda:
            prepared_batch[v] = prepared_batch[v].cuda()
    prepared_batch['chars'] = all_chars
    prepared_batch['raw_sentence'] = raw_sentences
    return prepared_batch


if __name__ == '__main__':
    from config import config

    vocab = Vocab(config)
    train_iter = DatasetConll2003(config.test_file, config, vocab, False)
    batch = next(iter(train_iter))
    print(batch)
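# `pad_items` is used above but not defined in this fragment. A minimal
# compatible sketch, assuming it right-pads each sequence to the batch
# maximum and returns the padded sequences plus the original lengths (the
# boolean flag could select a tag-specific pad value; 0 is assumed here):
def pad_items(items, is_tags=False, pad_value=0):
    lens = [len(item) for item in items]
    max_len = max(lens)
    padded = [list(item) + [pad_value] * (max_len - len(item))
              for item in items]
    return padded, lens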
def __init__(self, config, model_file_path):
    self.config = config
    self.vocab = Vocab(config)
    self.model = get_model(self.vocab, config, model_file_path)
import logging
import argparse

import torch
import torch.distributed as dist
from torch.utils.data import DataLoader

from data_utils import config, tools  # `tools` assumed to live in data_utils; it is used below but was not imported
from data_utils.batcher import Batcher, collate_fn
from data_utils.vocab import Vocab

logging.basicConfig(level=logging.INFO)

if __name__ == '__main__':
    data_path = '/home/yuf/pointer_generator_distributed/data/nbc.com.txt'
    print('data_path is: {}'.format(data_path))

    vocab = Vocab(config.vocab_path, config.vocab_size)
    articles, summaries, articles_oovs = tools.load_data(data_path, vocab)
    assert len(articles) == len(summaries) == len(articles_oovs)
    print('total article and summary length: {0}, {1}'.format(
        len(articles), len(summaries)))

    train_data = Batcher(articles, summaries, articles_oovs)
    train_loader = DataLoader(
        train_data,
        batch_size=12,
        shuffle=False,
        pin_memory=True,
        collate_fn=lambda batch: collate_fn(batch, padding_value=0),
        drop_last=True)
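    # Smoke-test sketch: pull one batch and check its pieces. collate_fn is
    # expected (from its use in Train.train_iter) to yield
    # (input_batch, target_batch, oovs_batch); the exact tensor/list types
    # depend on its implementation, so only lengths are printed here.
    input_batch, target_batch, oovs_batch = next(iter(train_loader))
    print(len(input_batch), len(target_batch), len(oovs_batch))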
import torch
import numpy as np
from data_utils import config
from data_utils.vocab import Vocab

# Build the vocabulary once (instead of three times) and look up the
# special-token ids from it.
_vocab = Vocab(config.vocab_path, config.vocab_size)
start_id = _vocab.word2id(config.START_DECODING)
stop_id = _vocab.word2id(config.STOP_DECODING)
pad_id = _vocab.word2id(config.PAD_TOKEN)


def get_cuda(tensor):
    """Move a tensor to the GPU when one is available."""
    if torch.cuda.is_available():
        tensor = tensor.cuda()
    return tensor


def get_enc_data(input_batch, oovs_batch):
    """
    input_batch: [batch_size, max_seq_len]. The batch after padding every
        example to the longest one in it; a list of lists.

    Returns:
        enc_batch: [batch_size, max_seq_len]. input_batch converted to a
            torch tensor.
        enc_lens: [batch_size]. The true length of each example, not the
            padded length.
        enc_padding_mask: [batch_size, max_seq_len]. Input mask; 1 marks a
            real token, 0 marks padding.
        ct_e: [batch_size, 2 * hidden_dim]. Holds the encoder-attention
            context (hidden-state summary) for the decoder.
        extra_zeros: [batch_size, max_art_oovs]. Zero columns appended to
            the vocabulary distribution for in-article OOV words.
    """
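    # NOTE: the original body is cut off here. What follows is a minimal
    # sketch consistent with the shapes documented above, assuming padding
    # uses `pad_id` and that `config.hidden_dim` is defined; the real
    # implementation may differ.
    enc_batch = get_cuda(torch.LongTensor(input_batch))
    enc_padding_mask = (enc_batch != pad_id).float()
    enc_lens = enc_padding_mask.sum(dim=1).long()
    batch_size = enc_batch.size(0)
    ct_e = get_cuda(torch.zeros(batch_size, 2 * config.hidden_dim))
    # One zero column per in-article OOV so the final distribution can place
    # probability mass on copied OOV words.
    max_art_oovs = max(len(oovs) for oovs in oovs_batch)
    extra_zeros = get_cuda(torch.zeros(batch_size, max_art_oovs))
    return enc_batch, enc_lens, enc_padding_mask, ct_e, extra_zeros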
import os
import sys
import time
import logging

import torch
import torch.nn.functional as F
from torch.utils.data import DataLoader

from data_utils import config, tools  # `tools` location assumed; it provides load_data
from data_utils.batcher import Batcher, collate_fn
from data_utils.vocab import Vocab
# `Model`, `get_cuda`, `get_enc_data` and `get_dec_data` are assumed to be
# importable from the surrounding project (the model module and the data
# helpers shown above).


class Train(object):
    def __init__(self, opt):
        self.vocab = Vocab(config.vocab_path, config.vocab_size)
        self.opt = opt
        self.start_id = self.vocab.word2id(config.START_DECODING)
        self.stop_id = self.vocab.word2id(config.STOP_DECODING)
        self.pad_id = self.vocab.word2id(config.PAD_TOKEN)
        self.unk_id = self.vocab.word2id(config.UNK_TOKEN)
        time.sleep(5)

    def save_model(self, iter):
        saved_path = config.saved_model_path + "%07d.tar" % iter
        torch.save({
            'iter': iter,
            'model_dict': self.model.state_dict(),
            'trainer_dict': self.trainer.state_dict()
        }, saved_path)  # torch.save requires the destination path

    def setup_train(self):
        self.model = Model()
        self.trainer = torch.optim.Adam(self.model.parameters(), lr=config.lr)
        start_iter = 0
        if self.opt.load_model is not None:
            load_model_path = os.path.join(config.saved_model_path,
                                           self.opt.load_model)
            checkpoint = torch.load(load_model_path)
            start_iter = checkpoint['iter']
            self.model.load_state_dict(checkpoint['model_dict'])
            self.trainer.load_state_dict(checkpoint['trainer_dict'])
            logging.debug('load model at: ' + load_model_path)
        if self.opt.new_lr is not None:
            self.trainer = torch.optim.Adam(self.model.parameters(),
                                            lr=self.opt.new_lr)
            logging.debug('update the lr to: ' + str(self.opt.new_lr))
        return start_iter

    def train_MLE(self, input_batch, target_batch, oovs_batch):
        '''
        Inputs:
            input_batch, target_batch, oovs_batch: one batch from the
            dataloader; see batcher.collate_fn for the exact contents.
        '''
        # enc_batch: [batch_size, max_enc_len]
        # enc_lens: [batch_size]
        # enc_padding_mask: [batch_size, max_enc_len]; 0 marks padding.
        # ct_e: [batch_size, 2 * hidden_dim]
        # extra_zeros: [batch_size, max_art_oovs]
        enc_batch, enc_lens, enc_padding_mask, ct_e, extra_zeros = get_enc_data(
            input_batch, oovs_batch)
        # Keep the id tensor intact: the decoder's copy mechanism needs the
        # token ids, while the encoder consumes the embedded inputs.
        enc_embed = self.model.embeds(enc_batch)
        # enc_out: [batch_size, max_seq_len, 2 * hidden_size]
        # enc_hidden: (h, c), each [batch_size, hidden_dim]
        enc_out, enc_hidden = self.model.encoder(enc_embed, enc_lens)
        # dec_batch: [batch_size, max_dec_len]
        # max_dec_len: int
        # dec_lens: [batch_size]
        # target_batch: [batch_size, max_dec_len]
        dec_batch, max_dec_len, dec_lens, target_batch = get_dec_data(
            target_batch)

        step_losses = []
        h_t = (enc_hidden[0], enc_hidden[1])
        x_t = get_cuda(torch.LongTensor(len(enc_out)).fill_(self.start_id))
        prev_s = None
        sum_temporal_srcs = None
        for t in range(min(max_dec_len, config.max_dec_steps)):
            # use_ground_truth: [batch_size] 0/1 mask; with probability 0.75
            # feed the gold token at step t, otherwise the previous sample.
            use_ground_truth = get_cuda(
                (torch.rand(len(enc_out)) > 0.25).long())
            x_t = use_ground_truth * dec_batch[:, t] + \
                (1 - use_ground_truth) * x_t
            x_t = self.model.embeds(x_t)
            # final_dist: [batch_size, config.vocab_size + max_art_oovs];
            #     the distribution over the extended vocabulary.
            # h_t: (h, c), each [batch_size, hidden_dim]; LSTMCell state.
            # ct_e: [batch_size, 2 * hidden_dim]; encoder-attention context
            #     for the current time step.
            # sum_temporal_srcs: [batch_size, max_seq_len]; running sum used
            #     by the temporal encoder attention.
            # prev_s: [batch_size, t, hidden_dim]; past decoder states used
            #     by the decoder intra-attention.
            final_dist, h_t, ct_e, sum_temporal_srcs, prev_s = self.model.decoder(
                x_t, h_t, enc_out, enc_padding_mask, ct_e, extra_zeros,
                enc_batch, sum_temporal_srcs, prev_s)
            target = target_batch[:, t]
            log_probs = torch.log(final_dist + config.eps)
            nll_loss = F.nll_loss(log_probs, target,
                                  reduction='none',
                                  ignore_index=self.pad_id)
            step_losses.append(nll_loss)
            # Sample the next decoder input from the predicted distribution.
            # final_dist: [batch_size, vocab_size + max_art_oovs]
            #   -> multinomial: [batch_size, 1] -> squeeze: [batch_size]
            x_t = torch.multinomial(final_dist, 1).squeeze(1)
            # Ids at or beyond the fixed vocabulary are in-article OOVs; the
            # embedding layer cannot look them up, so map them to <unk>.
            is_oovs = (x_t >= config.vocab_size).long()
            x_t = (1 - is_oovs) * x_t.detach() + is_oovs * self.unk_id

        # step_losses: list of [batch_size] tensors
        #   -> stack: [batch_size, dec_steps] -> sum over dim 1: [batch_size]
        losses = torch.sum(torch.stack(step_losses, dim=1), dim=1)
        batch_avg_loss = losses / dec_lens
        mle_loss = torch.mean(batch_avg_loss)
        return mle_loss

    def train_one_batch(self, input_batch, target_batch, oovs_batch):
        mle_loss = self.train_MLE(input_batch, target_batch, oovs_batch)
        self.trainer.zero_grad()
        mle_loss.backward()
        self.trainer.step()
        return mle_loss.item()

    def train_iter(self):
        iter = self.setup_train()
        count = mle_total = 0
        data_path = self.opt.data_path
        vocab = Vocab(config.vocab_path, config.vocab_size)
        articles, summaries, articles_oovs = tools.load_data(data_path, vocab)
        assert len(articles) == len(summaries) == len(articles_oovs)

        train_data = Batcher(articles, summaries, articles_oovs)
        # Pad with the integer pad id, not the token string.
        train_loader = DataLoader(train_data,
                                  batch_size=12,
                                  shuffle=False,
                                  pin_memory=True,
                                  collate_fn=lambda batch: collate_fn(
                                      batch, padding_value=self.pad_id),
                                  drop_last=True)
        for input_batch, target_batch, oovs_batch in train_loader:
            if iter > config.max_iterations:
                break
            try:
                mle_loss = self.train_one_batch(input_batch, target_batch,
                                                oovs_batch)
            except KeyboardInterrupt:
                print("----- keyboard interrupt! -----")
                break
            mle_total += mle_loss
            count += 1
            iter += 1
            if iter % 2000 == 0:
                mle_avg = mle_total / count
                print('iter:', iter, 'mle_loss:', "%.3f" % mle_avg)
                count = mle_total = 0
                sys.stdout.flush()
            if iter % 5000 == 0:
                self.save_model(iter)
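# Standalone illustration of the scheduled-sampling mix used in train_MLE:
# with probability 0.75 the decoder is fed the gold token at step t,
# otherwise its own sample from step t-1 (toy numbers, not project data).
if __name__ == '__main__':
    dec_t = torch.tensor([4, 7, 9])        # gold tokens at step t
    x_prev = torch.tensor([4, 2, 9])       # tokens sampled at step t-1
    mask = (torch.rand(3) > 0.25).long()   # 1 -> feed gold, 0 -> feed sample
    x_t = mask * dec_t + (1 - mask) * x_prev
    print(x_t)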