def build_vocab(path, cutoff=0):
    """Build a character vocabulary from a text file or a directory of text files.

    Args:
        path: Path to a single text file, or to a directory; in the latter
            case every entry of the directory is read, in sorted name order.
        cutoff: Minimum character frequency. In directory mode, characters
            counted fewer than ``cutoff`` times are dropped here before the
            vocab is built; in single-file mode the value is forwarded to
            ``CharVocab`` unchanged.

    Returns:
        The ``CharVocab`` constructed from the characters found under ``path``.
    """
    # Only needs to be built once; the single-file branch still holds the
    # whole file in memory.
    if os.path.isdir(path):
        # Trick for excessively large files: keep only a running character
        # counter and stream each file line by line, so that no file is ever
        # fully materialised. The vocab builder then receives just the list
        # of surviving characters.
        counter = Counter()
        for filename in sorted(os.listdir(path)):
            # `with` guarantees the handle is closed before the next file.
            with open(os.path.join(path, filename)) as fin:
                for line in fin:
                    counter.update(line)

        # remove infrequent characters from vocab
        # (iterate over a snapshot of the keys because entries are deleted)
        for char in list(counter):
            if counter[char] < cutoff:
                del counter[char]

        # a singleton list of all characters
        data = [sorted(counter)]
        # skip cutoff argument because this has been dealt with
        vocab = CharVocab(data)
    else:
        with open(path) as fin:
            # iterating the file keeps the trailing '\n' of every line
            data = [list(line) for line in fin]
        vocab = CharVocab(data, cutoff=cutoff)
    return vocab
def init_vocab(self, data):
    """Create all vocabularies from ``data`` and bundle them in a ``MultiVocab``.

    Must only be called when not in eval mode (an existing vocab is expected
    there), which is enforced by an assertion.

    Args:
        data: Training data handed to every vocab constructor.

    Returns:
        A ``MultiVocab`` with the entries ``char``, ``word``, ``upos``,
        ``xpos``, ``feats``, ``lemma`` and ``deprel``.
    """
    assert self.eval == False  # for eval vocab must exist
    shorthand = self.args['shorthand']
    # The vocabs are constructed in the order they are listed. `idx` selects
    # which field of each entry a vocab reads (presumably the CoNLL-U column
    # -- confirm against the vocab classes).
    return MultiVocab({
        'char': CharVocab(data, shorthand),
        'word': WordVocab(data, shorthand, cutoff=self.cutoff, lower=True),
        'upos': WordVocab(data, shorthand, idx=1),
        'xpos': xpos_vocab_factory(data, shorthand),
        'feats': FeatureVocab(data, shorthand, idx=3),
        'lemma': WordVocab(data, shorthand, cutoff=self.cutoff, idx=4, lower=True),
        'deprel': WordVocab(data, shorthand, idx=6),
    })
def load(cls, filename, finetune=False):
    """Restore a saved model from ``filename`` and return it in eval mode.

    Args:
        filename: Path of the checkpoint, read with ``torch.load``. It must
            provide the keys ``vocab``, ``args``, ``pad``, ``is_forward_lm``
            and ``state_dict``.
        finetune: Value stored on the returned model's ``finetune`` attribute.

    Returns:
        An instance of ``cls`` with the saved weights loaded.
    """
    # The identity map_location leaves every storage where it was
    # deserialised instead of moving it to the device it was saved from.
    checkpoint = torch.load(filename, map_location=lambda storage, loc: storage)
    char_vocab = CharVocab.load_state_dict(checkpoint['vocab'])
    model = cls(
        checkpoint['args'],
        {'char': char_vocab},
        checkpoint['pad'],
        checkpoint['is_forward_lm'],
    )
    model.load_state_dict(checkpoint['state_dict'])
    model.eval()
    # set finetune status
    model.finetune = finetune
    return model
def init_vocab(self, data):
    """Create the char/word/tag vocabularies for training.

    Args:
        data: Training data used to build the tag vocab and, when no charLM
            is configured, the character vocab.

    Returns:
        A ``MultiVocab`` with the entries ``char``, ``word`` and ``tag``.

    Raises:
        Exception: If called in eval mode, where the vocab must already exist.
    """
    if self.eval:
        raise Exception("Vocab must exist for evaluation.")

    shorthand = self.args['shorthand']
    if self.args['charlm']:
        # Try loading vocab from the forward charLM model file so that the
        # character ids match the ones that model was saved with.
        charlm_state = torch.load(self.args['charlm_forward_file'],
                                  lambda storage, loc: storage)
        assert 'vocab' in charlm_state, "Cannot find vocab in charLM model file."
        charvocab = CharVocab.load_state_dict(charlm_state['vocab'])
    else:
        charvocab = CharVocab(data, shorthand)

    # The word vocab is not built from `data`; it is taken from the
    # pretrain object as-is.
    return MultiVocab({
        'char': charvocab,
        'word': self.pretrain.vocab,
        'tag': TagVocab(data, shorthand, idx=1),
    })
def init_vocab(self, data_list):
    """Create vocabularies jointly from several datasets.

    All vocabs except the word vocab are built from the concatenation of the
    datasets. The word vocab is the union of the per-dataset word vocabs,
    each capped to its first ``self.args['vocab_cutoff']`` units after the
    ``VOCAB_PREFIX`` entries (presumably the most frequent ones, if
    ``WordVocab`` orders units by frequency -- confirm).

    Must only be called when not in eval mode, enforced by an assertion.

    Args:
        data_list: A list with one dataset per input file; each dataset is
            itself iterable.

    Returns:
        A ``MultiVocab`` with the entries ``char``, ``word``, ``upos``,
        ``xpos``, ``feats`` and ``lemma``.
    """
    assert self.eval == False  # for eval vocab must exist
    shorthand = self.args['shorthand']

    # Flatten in a single pass; sum(data_list, []) would re-copy the growing
    # result once per dataset.
    data_all = [item for data in data_list for item in data]
    charvocab = CharVocab(data_all, shorthand)

    # construct wordvocab from multiple files
    wordvocabs = [WordVocab(data, shorthand, cutoff=0, lower=True)
                  for data in data_list]
    start = len(VOCAB_PREFIX)
    stop = start + self.args['vocab_cutoff']
    # dict.fromkeys de-duplicates while keeping first-seen order, so word ids
    # are reproducible between runs (a plain set() orders strings by their
    # randomised hash).
    wordset = list(dict.fromkeys(
        unit for v in wordvocabs for unit in v._id2unit[start:stop]))

    # The first per-file vocab object is reused as the joint vocab and has
    # its lookup tables overwritten in place.
    wordvocab = wordvocabs[0]
    wordvocab._id2unit = VOCAB_PREFIX + wordset
    wordvocab._unit2id = {w: i for i, w in enumerate(wordvocab._id2unit)}
    print('Constructing a joint word vocabulary of size {} ...'.format(len(wordvocab)))

    uposvocab = WordVocab(data_all, shorthand, idx=1)
    xposvocab = xpos_vocab_factory(data_all, shorthand)
    featsvocab = FeatureVocab(data_all, shorthand, idx=3)
    lemmavocab = WordVocab(data_all, shorthand, cutoff=self.cutoff, idx=4, lower=True)
    vocab = MultiVocab({
        'char': charvocab,
        'word': wordvocab,
        'upos': uposvocab,
        'xpos': xposvocab,
        'feats': featsvocab,
        'lemma': lemmavocab,
    })
    return vocab
def train(args):
    """Train a character language model, saving the best checkpoint.

    The character vocab is loaded from the vocab file if it already exists;
    otherwise it is built from the training data and saved there. After each
    epoch the model is evaluated on ``args['eval_file']``, the learning-rate
    scheduler is stepped with the dev loss, and the model is saved whenever
    that loss improves on the best one seen so far.

    Args:
        args: Dict of settings. Keys read here: ``save_dir``, ``save_name``,
            ``vocab_save_name``, ``shorthand``, ``direction``, ``train_file``,
            ``train_dir``, ``eval_file``, ``cutoff``, ``cuda``, ``lr0``,
            ``momentum``, ``weight_decay``, ``anneal``, ``patience`` and
            ``epochs``. The whole dict is also passed on to the model and to
            the per-epoch train/evaluate helpers.
    """
    if args['save_name'] is not None:
        model_file = args['save_dir'] + '/' + args['save_name']
    else:
        model_file = '{}/{}_{}_charlm.pt'.format(
            args['save_dir'], args['shorthand'], args['direction'])
    if args['vocab_save_name'] is not None:
        vocab_file = args['save_dir'] + '/' + args['vocab_save_name']
    else:
        vocab_file = '{}/{}_vocab.pt'.format(args['save_dir'], args['shorthand'])

    # load train data from train_dir if given, otherwise load from file;
    # the same path serves for vocab building and for every epoch
    train_path = args['train_file'] if args['train_dir'] is None else args['train_dir']

    if os.path.exists(vocab_file):
        logging.info('Loading existing vocab file')
        vocab = {
            'char': CharVocab.load_state_dict(
                torch.load(vocab_file, lambda storage, loc: storage))
        }
    else:
        logging.info('Building and saving vocab')
        vocab = {'char': build_vocab(train_path, cutoff=args['cutoff'])}
        torch.save(vocab['char'].state_dict(), vocab_file)
    print("Training model with vocab size: {}".format(len(vocab['char'])))

    model = CharacterLanguageModel(
        args, vocab, is_forward_lm=args['direction'] == 'forward')
    if args['cuda']:
        model = model.cuda()

    # only parameters that require gradients are handed to the optimizer
    params = [param for param in model.parameters() if param.requires_grad]
    optimizer = torch.optim.SGD(
        params,
        lr=args['lr0'],
        momentum=args['momentum'],
        weight_decay=args['weight_decay'])
    criterion = torch.nn.CrossEntropyLoss()
    scheduler = torch.optim.lr_scheduler.ReduceLROnPlateau(
        optimizer, verbose=True, factor=args['anneal'], patience=args['patience'])

    best_loss = None
    for epoch in range(args['epochs']):
        # Data is (re)loaded at the start of every epoch.
        # NOTE(review): kept inside the loop on purpose -- load_data may
        # return a one-shot iterator; confirm before hoisting.
        train_data = load_data(train_path, vocab, args['direction'])
        dev_data = load_data(args['eval_file'], vocab, args['direction'])
        train_epoch(args, vocab, train_data, model, params, optimizer,
                    criterion, epoch + 1)

        # NOTE(review): the timer starts after train_epoch, so the reported
        # "time elapsed" covers evaluation only.
        start_time = time.time()
        loss = evaluate_epoch(args, vocab, dev_data, model, criterion)
        elapsed = int(time.time() - start_time)
        scheduler.step(loss)
        # The format string is kept as one logical literal (adjacent string
        # literals are concatenated at compile time).
        print(
            "| {:5d}/{:5d} epochs | time elapsed {:6d}s | loss {:5.2f} "
            "| ppl {:8.2f}".format(
                epoch + 1,
                args['epochs'],
                elapsed,
                loss,
                math.exp(loss),
            ))
        if best_loss is None or loss < best_loss:
            best_loss = loss
            model.save(model_file)
            print('new best model saved.')