def __init__(self, model, criterion, optimizer, print_every, cuda=True):
    """Set up a supervised trainer.

    Args:
        model: the model to be trained.
        criterion: loss criterion applied during training.
        optimizer: optimizer that updates the model parameters.
        print_every: logging interval, in steps.
        cuda: run on GPU when True.
    """
    # Training components.
    self.model = model
    self.criterion = criterion
    self.optimizer = optimizer
    # Runtime configuration.
    self.cuda = cuda
    self.print_every = print_every
    self.start_epoch = 1
    self.logger = make_logger('log.train')
def __init__(self, model, reward_type, ce_criterion, optimizer, print_every,
             cuda=True):
    """Set up an RL trainer with a cross-entropy validation criterion.

    Args:
        model: the model to be trained.
        reward_type: reward identifier forwarded to RLCriterion.
        ce_criterion: cross-entropy criterion; wrapped by RLCriterion for
            training and used directly for validation.
        optimizer: optimizer that updates the model parameters.
        print_every: logging interval, in steps.
        cuda: run on GPU when True.
    """
    # Training components: RL loss for training, plain CE for validation.
    self.model = model
    self.train_criterion = RLCriterion(reward_type, ce_criterion)
    self.valid_criterion = ce_criterion
    self.optimizer = optimizer
    # Runtime configuration.
    self.cuda = cuda
    self.print_every = print_every
    self.start_epoch = 1
    self.logger = make_logger('log.train')
def main():
    """Preprocess parallel corpora for training.

    Builds source/target vocabularies, converts train and valid text files
    to id sequences, filters over-long training pairs, and saves the vocab
    and datasets with ``torch.save`` under the ``opt.save_data`` prefix.
    """
    logger = make_logger('log.preprocess')

    # parse arguments
    opt = parse_args()
    dir_name, _ = os.path.split(opt.save_data)
    # FIX: os.mkdir fails on nested paths and on an empty dir component
    # (save_data with no directory part); makedirs behind a guard is safe.
    if dir_name and not os.path.exists(dir_name):
        os.makedirs(dir_name)

    # build and save vocab
    print("Building vocab...")
    src_word2idx = build_vocab(opt.train_src, opt.src_vocab_size)
    # FIX: the target vocab was capped by opt.src_vocab_size. Use the
    # target-side limit when the option exists; fall back to the old
    # behavior otherwise so this stays backward-compatible.
    tgt_word2idx = build_vocab(opt.train_tgt,
                               getattr(opt, 'tgt_vocab_size',
                                       opt.src_vocab_size))
    vocab = {'src': src_word2idx, 'tgt': tgt_word2idx}
    logger.info('Src vocab size: {}'.format(len(src_word2idx)))
    logger.info('Tgt vocab size: {}'.format(len(tgt_word2idx)))
    torch.save(vocab, opt.save_data + '.vocab.pt')

    # convert train text to ids
    print("Converting train text to ids...")
    train_src = convert_file_to_ids(opt.train_src, src_word2idx)
    train_tgt = convert_file_to_ids(opt.train_tgt, tgt_word2idx)
    print(len(train_src), len(train_tgt))
    assert len(train_src) == len(train_tgt)
    train = list(zip(train_src, train_tgt))
    logger.info("Train total lines: {}".format(len(train)))
    # Drop pairs whose source or target exceeds the configured length limits.
    train = [t for t in train
             if len(t[0]) <= opt.src_seq_length
             and len(t[1]) <= opt.tgt_seq_length]
    logger.info("Train after filtering (src: {}, tgt: {}): {}".format(
        opt.src_seq_length, opt.tgt_seq_length, len(train)))
    torch.save(train, opt.save_data + '.train.pt')

    # convert dev text to ids
    print("Converting valid text to ids...")
    valid_src = convert_file_to_ids(opt.valid_src, src_word2idx)
    valid_tgt = convert_file_to_ids(opt.valid_tgt, tgt_word2idx)
    assert len(valid_src) == len(valid_tgt)
    valid = list(zip(valid_src, valid_tgt))
    logger.info("Valid total lines: {}".format(len(valid)))
    torch.save(valid, opt.save_data + '.valid.pt')
def __init__(self, model, sample_type, reward_type, ce_criterion, optimizer,
             print_every, cuda=True):
    """Set up an RL trainer with a configurable sampling strategy.

    Args:
        model: the model to be trained.
        sample_type: 'sample' (multinomial sampling) or 'beam' (beam
            search); selects the batch-sampling function used in training.
        reward_type: reward identifier forwarded to RLCriterion.
        ce_criterion: cross-entropy criterion; wrapped by RLCriterion for
            training and used directly for validation.
        optimizer: optimizer that updates the model parameters.
        print_every: logging interval, in steps.
        cuda: run on GPU when True.

    Raises:
        ValueError: if ``sample_type`` is neither 'sample' nor 'beam'.
    """
    self.cuda = cuda
    self.model = model
    self.reward_type = reward_type
    self.train_criterion = RLCriterion(reward_type, ce_criterion)
    self.valid_criterion = ce_criterion
    self.optimizer = optimizer
    self.print_every = print_every
    self.start_epoch = 1
    self.logger = make_logger('log.train')
    # Resolve the sampling strategy once so the training loop can call
    # self.sample_func without re-checking the option.
    if sample_type == 'sample':
        self.sample_func = sample_on_batch
    elif sample_type == 'beam':
        self.sample_func = beam_search_on_batch
    else:
        # FIX: corrected typo "Undifined" -> "Undefined"; ValueError is the
        # idiomatic (and Exception-compatible) type for a bad argument.
        raise ValueError("Undefined sample type.")