# stdlib / third-party imports used by the functions below; project-internal
# names (CharVocab, WordVocab, TagVocab, FeatureVocab, MultiVocab,
# xpos_vocab_factory, CharacterLanguageModel, and the data/eval helpers)
# are assumed to come from the surrounding package.
import logging
import math
import os
import time
from collections import Counter

import numpy as np
import torch

logger = logging.getLogger(__name__)


def build_vocab(path, cutoff=0):
    # Requires a large amount of memory, but only needs to be built once
    if os.path.isdir(path):
        # here we need some trick to deal with excessively large files
        # for each file we accumulate the counter of characters, and
        # at the end we simply pass a list of chars to the vocab builder
        counter = Counter()
        filenames = sorted(os.listdir(path))
        for filename in filenames:
            lines = open(path + '/' + filename).readlines()
            for line in lines:
                counter.update(list(line))
        # remove infrequent characters from vocab
        for k in list(counter.keys()):
            if counter[k] < cutoff:
                del counter[k]
        # a singleton list of all characters
        data = [sorted([x[0] for x in counter.most_common()])]
        # skip the cutoff argument because it has already been applied
        vocab = CharVocab(data)
    else:
        lines = open(path).readlines()  # reserve '\n'
        data = [list(line) for line in lines]
        vocab = CharVocab(data, cutoff=cutoff)
    return vocab
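# A minimal usage sketch (hypothetical paths; assumes CharVocab is importable
# from this package). Passing a directory aggregates character counts across
# all files before the cutoff is applied; passing a single file defers the
# cutoff to CharVocab itself.
#
#     vocab = build_vocab('data/charlm/en_train', cutoff=5)
#     print(len(vocab))  # number of characters kept in the vocabulary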
def init_vocab(self, data):
    assert self.eval == False  # for eval vocab must exist
    charvocab = CharVocab(data, self.args['shorthand'])
    wordvocab = WordVocab(data, self.args['shorthand'], cutoff=7, lower=True)
    uposvocab = WordVocab(data, self.args['shorthand'], idx=1)
    xposvocab = xpos_vocab_factory(data, self.args['shorthand'])
    featsvocab = FeatureVocab(data, self.args['shorthand'], idx=3)
    lemmavocab = WordVocab(data, self.args['shorthand'], cutoff=7, idx=4, lower=True)
    deprelvocab = WordVocab(data, self.args['shorthand'], idx=6)
    vocab = MultiVocab({'char': charvocab,
                        'word': wordvocab,
                        'upos': uposvocab,
                        'xpos': xposvocab,
                        'feats': featsvocab,
                        'lemma': lemmavocab,
                        'deprel': deprelvocab})
    return vocab
def load(cls, filename, finetune=False):
    state = torch.load(filename, lambda storage, loc: storage)
    vocab = {'char': CharVocab.load_state_dict(state['vocab'])}
    model = cls(state['args'], vocab, state['pad'], state['is_forward_lm'])
    model.load_state_dict(state['state_dict'])
    model.eval()
    model.finetune = finetune  # set finetune status
    return model
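# Hedged usage sketch: assuming this classmethod lives on
# CharacterLanguageModel (the class instantiated in train() below) and that
# the checkpoint path is hypothetical:
#
#     model = CharacterLanguageModel.load('saved_models/en_forward_charlm.pt',
#                                         finetune=True)
#
# Note that load() always puts the model in eval mode; the finetune flag only
# records the finetune status on the loaded model.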
def init_vocab(self, data):
    def from_model(model_filename):
        """ Try loading vocab from charLM model file. """
        state_dict = torch.load(model_filename, lambda storage, loc: storage)
        assert 'vocab' in state_dict, "Cannot find vocab in charLM model file."
        return state_dict['vocab']

    if self.eval:
        raise Exception("Vocab must exist for evaluation.")
    if self.args['charlm']:
        charvocab = CharVocab.load_state_dict(from_model(self.args['charlm_forward_file']))
    else:
        charvocab = CharVocab(data, self.args['shorthand'])
    wordvocab = self.pretrain.vocab
    tagvocab = TagVocab(data, self.args['shorthand'], idx=1)
    vocab = MultiVocab({'char': charvocab,
                        'word': wordvocab,
                        'tag': tagvocab})
    return vocab
def train(args):
    model_file = args['save_dir'] + '/' + args['save_name'] if args['save_name'] is not None \
        else '{}/{}_{}_charlm.pt'.format(args['save_dir'], args['shorthand'], args['direction'])
    vocab_file = args['save_dir'] + '/' + args['vocab_save_name'] if args['vocab_save_name'] is not None \
        else '{}/{}_vocab.pt'.format(args['save_dir'], args['shorthand'])

    if os.path.exists(vocab_file):
        logger.info('Loading existing vocab file')
        vocab = {'char': CharVocab.load_state_dict(torch.load(vocab_file, lambda storage, loc: storage))}
    else:
        logger.info('Building and saving vocab')
        vocab = {'char': build_vocab(args['train_file'] if args['train_dir'] is None else args['train_dir'],
                                     cutoff=args['cutoff'])}
        torch.save(vocab['char'].state_dict(), vocab_file)
    logger.info("Training model with vocab size: {}".format(len(vocab['char'])))

    model = CharacterLanguageModel(args, vocab, is_forward_lm=True if args['direction'] == 'forward' else False)
    if args['cuda']:
        model = model.cuda()
    params = [param for param in model.parameters() if param.requires_grad]
    optimizer = torch.optim.SGD(params, lr=args['lr0'], momentum=args['momentum'],
                                weight_decay=args['weight_decay'])
    criterion = torch.nn.CrossEntropyLoss()
    scheduler = torch.optim.lr_scheduler.ReduceLROnPlateau(optimizer, verbose=True,
                                                           factor=args['anneal'], patience=args['patience'])

    writer = None
    if args['summary']:
        from torch.utils.tensorboard import SummaryWriter
        summary_dir = '{}/{}_summary'.format(args['save_dir'], args['save_name']) if args['save_name'] is not None \
            else '{}/{}_{}_charlm_summary'.format(args['save_dir'], args['shorthand'], args['direction'])
        writer = SummaryWriter(log_dir=summary_dir)

    best_loss = None
    for epoch in range(args['epochs']):
        # load train data from train_dir if not empty, otherwise load from file
        if args['train_dir'] is not None:
            train_path = args['train_dir']
        else:
            train_path = args['train_file']
        train_data = load_data(train_path, vocab, args['direction'])
        dev_data = load_data(args['eval_file'], vocab, args['direction'])

        train_epoch(args, vocab, train_data, model, params, optimizer, criterion, epoch + 1)

        start_time = time.time()
        loss = evaluate_epoch(args, vocab, dev_data, model, criterion)
        ppl = math.exp(loss)
        elapsed = int(time.time() - start_time)
        scheduler.step(loss)
        logger.info(
            "| {:5d}/{:5d} epochs | time elapsed {:6d}s | loss {:5.2f} | ppl {:8.2f}".format(
                epoch + 1, args['epochs'], elapsed, loss, ppl,
            )
        )
        if best_loss is None or loss < best_loss:
            best_loss = loss
            model.save(model_file)
            logger.info('new best model saved.')
        if writer:
            writer.add_scalar('dev_loss', loss, global_step=epoch + 1)
            writer.add_scalar('dev_ppl', ppl, global_step=epoch + 1)

    if writer:
        writer.close()
    return
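# Hedged sketch of the configuration train() above expects; every key below is
# read somewhere in that function, but the values are illustrative only. The
# second train() variant that follows additionally reads 'batch_size',
# 'bptt_size', 'eval_steps', 'report_steps', and 'max_grad_norm'.
#
#     args = {
#         'save_dir': 'saved_models/charlm',
#         'save_name': None,        # None falls back to '{shorthand}_{direction}_charlm.pt'
#         'vocab_save_name': None,  # None falls back to '{shorthand}_vocab.pt'
#         'shorthand': 'en_ewt',
#         'direction': 'forward',   # or 'backward'
#         'train_file': 'data/train.txt',
#         'train_dir': None,        # a directory of shards takes precedence if set
#         'eval_file': 'data/dev.txt',
#         'cutoff': 0,
#         'cuda': torch.cuda.is_available(),
#         'lr0': 20.0,
#         'momentum': 0.0,
#         'weight_decay': 0.0,
#         'anneal': 0.25,
#         'patience': 1,
#         'summary': False,
#         'epochs': 50,
#     }
#     train(args)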
def train(args):
    model_file = args['save_dir'] + '/' + args['save_name'] if args['save_name'] is not None \
        else '{}/{}_{}_charlm.pt'.format(args['save_dir'], args['shorthand'], args['direction'])
    vocab_file = args['save_dir'] + '/' + args['vocab_save_name'] if args['vocab_save_name'] is not None \
        else '{}/{}_vocab.pt'.format(args['save_dir'], args['shorthand'])

    if os.path.exists(vocab_file):
        logger.info('Loading existing vocab file')
        vocab = {'char': CharVocab.load_state_dict(torch.load(vocab_file, lambda storage, loc: storage))}
    else:
        logger.info('Building and saving vocab')
        vocab = {'char': build_vocab(args['train_file'] if args['train_dir'] is None else args['train_dir'],
                                     cutoff=args['cutoff'])}
        torch.save(vocab['char'].state_dict(), vocab_file)
    logger.info("Training model with vocab size: {}".format(len(vocab['char'])))

    model = CharacterLanguageModel(args, vocab, is_forward_lm=True if args['direction'] == 'forward' else False)
    if args['cuda']:
        model = model.cuda()
    params = [param for param in model.parameters() if param.requires_grad]
    optimizer = torch.optim.SGD(params, lr=args['lr0'], momentum=args['momentum'],
                                weight_decay=args['weight_decay'])
    criterion = torch.nn.CrossEntropyLoss()
    scheduler = torch.optim.lr_scheduler.ReduceLROnPlateau(optimizer, verbose=True,
                                                           factor=args['anneal'], patience=args['patience'])

    writer = None
    if args['summary']:
        from torch.utils.tensorboard import SummaryWriter
        summary_dir = '{}/{}_summary'.format(args['save_dir'], args['save_name']) if args['save_name'] is not None \
            else '{}/{}_{}_charlm_summary'.format(args['save_dir'], args['shorthand'], args['direction'])
        writer = SummaryWriter(log_dir=summary_dir)

    # evaluate model within epoch if eval_interval is set
    eval_within_epoch = False
    if args['eval_steps'] > 0:
        eval_within_epoch = True

    best_loss = None
    global_step = 0
    for epoch in range(1, args['epochs'] + 1):
        # load train data from train_dir if not empty, otherwise load from file
        if args['train_dir'] is not None:
            train_path = args['train_dir']
        else:
            train_path = args['train_file']
        train_data = load_data(train_path, vocab, args['direction'])
        dev_data = load_file(args['eval_file'], vocab, args['direction'])  # dev must be a single file

        # run over entire training set
        for data_chunk in train_data:
            batches = batchify(data_chunk, args['batch_size'])
            hidden = None
            total_loss = 0.0
            total_batches = math.ceil((batches.size(1) - 1) / args['bptt_size'])
            iteration, i = 0, 0
            # over the data chunk
            while i < batches.size(1) - 1 - 1:
                model.train()
                global_step += 1
                start_time = time.time()
                bptt = args['bptt_size'] if np.random.random() < 0.95 else args['bptt_size'] / 2.
                # prevent excessively small or negative sequence lengths
                seq_len = max(5, int(np.random.normal(bptt, 5)))
                # prevent very large sequence length, must be <= 1.2 x bptt
                seq_len = min(seq_len, int(args['bptt_size'] * 1.2))
                data, target = get_batch(batches, i, seq_len)
                lens = [data.size(1) for _ in range(data.size(0))]
                if args['cuda']:
                    data = data.cuda()
                    target = target.cuda()

                optimizer.zero_grad()
                output, hidden, decoded = model.forward(data, lens, hidden)
                loss = criterion(decoded.view(-1, len(vocab['char'])), target)
                total_loss += loss.data.item()
                loss.backward()

                torch.nn.utils.clip_grad_norm_(params, args['max_grad_norm'])
                optimizer.step()

                hidden = repackage_hidden(hidden)

                if (iteration + 1) % args['report_steps'] == 0:
                    cur_loss = total_loss / args['report_steps']
                    elapsed = time.time() - start_time
                    logger.info(
                        "| epoch {:5d} | {:5d}/{:5d} batches | sec/batch {:.6f} | loss {:5.2f} | ppl {:8.2f}".format(
                            epoch, iteration + 1, total_batches, elapsed / args['report_steps'],
                            cur_loss, math.exp(cur_loss),
                        )
                    )
                    total_loss = 0.0

                iteration += 1
                i += seq_len

                # evaluate if necessary
                if eval_within_epoch and global_step % args['eval_steps'] == 0:
                    _, _, best_loss = evaluate_and_save(args, vocab, dev_data, model, criterion, scheduler,
                                                        best_loss, global_step, model_file, writer)

        # if eval_interval isn't provided, run evaluation after each epoch;
        # use epoch in place of global_step for logging
        if not eval_within_epoch:
            _, _, best_loss = evaluate_and_save(args, vocab, dev_data, model, criterion, scheduler,
                                                best_loss, epoch, model_file, writer)

    if writer:
        writer.close()
    return
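# repackage_hidden() is called above but not defined in this section. A common
# implementation (assumed here, following the PyTorch word-language-model
# example) detaches the hidden state from the autograd graph so that gradients
# do not flow across BPTT windows; a minimal sketch:
#
#     def repackage_hidden(h):
#         if isinstance(h, torch.Tensor):
#             return h.detach()
#         return tuple(repackage_hidden(v) for v in h)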