def build_model():
    """Build the NCE language model according to CLI arguments.

    Global Dependencies:
        - corpus
        - args
    """
    # Unigram noise distribution for noise sampling in NCE.
    noise_dist = build_unigram_noise(torch.FloatTensor(corpus.vocab.idx2count))

    # Reject unsupported index modules up front.
    index_module = args.index_module
    if index_module not in ('linear', 'gru'):
        logger.error('The index module [%s] is not supported yet' % index_module)
        raise NotImplementedError('index module not supported')

    # Wire up the NCE loss module and the matching model variant.
    if index_module == 'linear':
        loss_module = IndexLinear(
            args.nhid,
            ntoken,
            noise=noise_dist,
            noise_ratio=args.noise_ratio,
            norm_term=args.norm_term,
            loss_type=args.loss,
            reduction='none',
        )
        model = RNNModel(
            ntoken,
            args.emsize,
            args.nhid,
            args.nlayers,
            criterion=loss_module,
            dropout=args.dropout,
        )
    else:  # 'gru'
        # IndexGRU only supports a single recurrent layer.
        if args.nlayers != 1:
            logger.warning(
                'Falling into one layer GRU due to Index_GRU supporting')
        loss_module = IndexGRU(
            ntoken,
            args.nhid,
            args.nhid,
            args.dropout,
            noise=noise_dist,
            noise_ratio=args.noise_ratio,
            norm_term=args.norm_term,
        )
        model = GenModel(criterion=loss_module)

    if args.cuda:
        model.cuda()

    logger.info('model definition:\n %s', model)
    return model
def main():
    """Train an LSTM language model on WikiText-2.

    Reports validation/test perplexity every epoch and checkpoints the
    model whenever validation loss improves on the best seen so far.
    """
    args = parse_arguments()
    use_cuda = torch.cuda.is_available()

    print("[!] preparing dataset...")
    TEXT = data.Field()
    train_data, val_data, test_data = datasets.WikiText2.splits(TEXT)
    TEXT.build_vocab(train_data, min_freq=10)
    corpora = (train_data, val_data, test_data)
    train_iter, val_iter, test_iter = data.BPTTIterator.splits(
        corpora, batch_size=args.batch_size, bptt_len=30, repeat=False)
    vocab_size = len(TEXT.vocab)
    print("[TRAIN]:%d\t[VALID]:%d\t[TEST]:%d\t[VOCAB]%d"
          % (len(train_iter), len(val_iter), len(test_iter), vocab_size))

    print("[!] Instantiating models...")
    model = RNNModel('LSTM', ntoken=vocab_size, ninp=600, nhid=600,
                     nlayers=2, dropout=0.5)
    optimizer = optim.Adam(model.parameters(), lr=args.lr)
    if use_cuda:
        model.cuda()
    print(model)

    best_val_loss = None
    for epoch in range(1, args.epochs + 1):
        train(model, optimizer, train_iter, vocab_size, args.grad_clip,
              args.log_interval, use_cuda)
        val_loss = evaluate(model, val_iter, vocab_size, use_cuda)
        print("[Epoch: %d] val-loss:%5.2f | val-pp:%5.2f"
              % (epoch, val_loss, math.exp(val_loss)))

        # Save a checkpoint whenever validation improves (or on first epoch).
        improved = not best_val_loss or val_loss < best_val_loss
        if improved:
            print("[!] saving model")
            if not os.path.isdir(args.save):
                os.makedirs(args.save)
            torch.save(model, './%s/lm_%d.pt' % (args.save, epoch))
            best_val_loss = val_loss

        test_loss = evaluate(model, test_iter, vocab_size, use_cuda)
        print("[Epoch: %d] test-loss:%5.2f | test-pp:%5.2f"
              % (epoch, test_loss, math.exp(test_loss)))
def build_model(args, ntokens: int):
    """Build an RNN language model and its loss function.

    Args:
        args: parsed CLI namespace carrying model hyper-parameters
            (model, emsize, nhid, nlayers, dropout*, wdrop, tied, cuda).
        ntokens: vocabulary size.

    Returns:
        Tuple of (model, criterion) where criterion is CrossEntropyLoss.
    """
    print('INFO: Building model')
    model = RNNModel(args.model, ntokens, args.emsize, args.nhid, args.nlayers,
                     args.dropout, args.dropouth, args.dropouti, args.dropoute,
                     args.wdrop, args.tied)
    if args.cuda:
        print('INFO: Moving model to GPU')
        model.cuda()
    # Fixed: the previous size()[0] * size()[1] formula under-counted any
    # parameter tensor with three or more dimensions; nelement() counts all
    # elements exactly regardless of rank.
    total_params = sum(p.nelement() for p in model.parameters())
    print('INFO: Model total parameters:', total_params)
    criterion = nn.CrossEntropyLoss()
    return model, criterion
def build_model(args, corpus):
    """Build (or resume) an AWD-LSTM model and its split-softmax criterion.

    Args:
        args: parsed CLI namespace (model/emsize/nhid/dropout*/wdrop/tied,
            resume, resume_path, lr, cuda).
        corpus: dataset object exposing a `dictionary` for the vocab size.

    Returns:
        Tuple of (model, criterion); criterion is SplitCrossEntropyLoss with
        bucket splits chosen by vocabulary size.
    """
    criterion = None
    ntokens = len(corpus.dictionary)
    model = RNNModel(args.model, ntokens, args.emsize, args.nhid, args.nlayers,
                     args.dropout, args.dropouth, args.dropouti, args.dropoute,
                     args.wdrop, args.tied)
    ###
    if args.resume:
        logging.info('Resuming model ...')
        model, criterion, optimizer = model_load(args.resume_path)
        optimizer.param_groups[0]['lr'] = args.lr
        # BUG FIX: the original assigned `args.dropoute = args.dropoute`
        # (a no-op self-assignment), so the resumed model kept its old
        # embedding-dropout rate. Apply it to the model like the others.
        model.dropouti, model.dropouth, model.dropout, model.dropoute = \
            args.dropouti, args.dropouth, args.dropout, args.dropoute
        if args.wdrop:
            from weight_drop import WeightDrop
            for rnn in model.rnns:
                # WeightDrop wraps the RNN; otherwise zoneout carries the rate.
                if type(rnn) == WeightDrop:
                    rnn.dropout = args.wdrop
                elif rnn.zoneout > 0:
                    rnn.zoneout = args.wdrop
    ###
    if not criterion:
        splits = []
        if ntokens > 500000:
            # One Billion
            # This produces fairly even matrix mults for the buckets:
            # 0: 11723136, 1: 10854630, 2: 11270961, 3: 11219422
            splits = [4200, 35000, 180000]
        elif ntokens > 75000:
            # WikiText-103
            splits = [2800, 20000, 76000]
        logging.info(f'Using {splits}')
        criterion = SplitCrossEntropyLoss(args.emsize, splits=splits, verbose=False)
    ###
    # Count every trainable element in both model and criterion.
    params = list(model.parameters()) + list(criterion.parameters())
    total_params = sum(x.size()[0] * x.size()[1] if len(x.size()) > 1 else x.size()[0]
                       for x in params if x.size())
    logging.info(f'Args: {args}')
    logging.info(f'Model total parameters: {total_params}')

    if args.cuda:
        model = model.cuda()
        criterion = criterion.cuda()

    return model, criterion
# Build the model, optimizer and loss for adaptive-softmax training.
ntokens = len(corpus.dictionary)
cutoff = [2000]  # adaptive-softmax frequency cutoff buckets
model = RNNModel(args.model, ntokens, args.emsize, args.nhid, args.nlayers,
                 args.dropout, args.rnn_dropout, args.output_dropout,
                 args.tied, adasoft=args.adasoft, cutoff=cutoff)
if torch.cuda.is_available():
    model.cuda()

if args.optim == 'SGD':
    optimizer = torch.optim.SGD(params=model.parameters(), lr=args.lr)
elif args.optim == 'rms':
    optimizer = torch.optim.RMSprop(params=model.parameters(), lr=args.lr,
                                    weight_decay=0.00001)
else:
    # Fixed: was a bare `raise Exception` with no message, which gave the
    # user no hint about the accepted values. ValueError subclasses
    # Exception, so any existing `except Exception` handler still works.
    raise ValueError('unsupported optimizer %r (expected SGD or rms)' % args.optim)

# Choose the loss: adaptive softmax needs the cutoffs plus the tail bucket.
if args.adasoft:
    criterion = AdaptiveLoss([*cutoff, ntokens + 1])
else:
    criterion = nn.CrossEntropyLoss()
# Batch the training corpus for truncated-BPTT training.
train_data = create_batch(corpus.train, batch_size)

""" ----------- Model Creation ------------"""
number_tokens = len(corpus.dictionary)  # Number of unique word in our corpus

# assumes rt / embedding_size / number_hidden / number_layer / dropout / tied
# are defined earlier in the file (CLI or config values) -- TODO confirm
model = RNNModel(rnn_type = rt, ntoken = number_tokens, ninp = embedding_size,
                 nhid = number_hidden, nlayers = number_layer,
                 drop_rate = dropout, tie_weights = tied)
if cuda and torch.cuda.is_available():
    model = model.cuda()

criterion = nn.CrossEntropyLoss()
optimizer = optim.RMSprop(model.parameters(), lr = learning_rate)

""" ----------- Training Code ------------"""

def detach_hidden(h):
    # Recursively detach hidden-state tensors from the autograd graph so
    # gradients do not flow back into earlier batches.
    # detach from distant history
    if type(h) == V:
        return V(h.data)
    else:
        return tuple(detach_hidden(v) for v in h)

def get_batch(source, i, sequence_length):
    # Clamp the slice length so the last batch does not run past the data.
    # NOTE(review): this function appears truncated in this chunk -- the
    # data/target slicing that should use seq_len is not visible here.
    seq_len = min(sequence_length, len(source) - 1 - i)
    # torch.cat([data.data.view(-1).unsqueeze(-1), target.data.unsqueeze(-1)], dim=1)
def run(args):
    """Full train/validate/test driver for the language model.

    Seeds RNGs, loads (or caches) the corpus, builds the model, trains with
    variable-length BPTT, optionally switches to ASGD, and finally reports
    test loss from the best checkpoint.

    Returns:
        Tuple of (np.array of per-epoch validation losses, test loss).
    """
    # Seed all RNGs; warn if CUDA exists but is unused.
    np.random.seed(args.seed)
    torch.manual_seed(args.seed)
    if torch.cuda.is_available():
        if not args.cuda:
            print("WARNING: You have a CUDA device, so you should probably run with --cuda")
        else:
            torch.cuda.manual_seed(args.seed)

    ###############################################################################
    # Load data
    ###############################################################################

    def model_save(fn):
        # Checkpoint both model and optimizer state together.
        with open(fn, 'wb') as f:
            torch.save([model, optimizer], f)

    def model_load(fn):
        # Restores into the enclosing names via `global` (module-level state).
        global model, criterion, optimizer
        with open(fn, 'rb') as f:
            model, optimizer = torch.load(f)

    import os
    import hashlib
    # Cache the tokenized corpus keyed by a hash of the data path.
    fn = 'corpus.{}.data'.format(hashlib.md5(args.data.encode()).hexdigest())
    if os.path.exists(fn):
        print('Loading cached dataset...')
        corpus = torch.load(fn)
    else:
        print('Producing dataset...')
        corpus = data.Corpus(args.data)
        torch.save(corpus, fn)

    # get token frequencies and eos_tokens
    frequencies, eos_tokens = None, None
    if not args.uni_freq:
        frequencies = corpus.frequencies
    if args.reinit_h:
        eos_tokens = corpus.reset_idxs

    # batchify
    eval_batch_size = 1
    test_batch_size = 1
    print(corpus.dictionary)
    if args.reinit_h:
        # Padded batches need an extra pad token when batch_size > 1.
        ntokens = len(corpus.dictionary) + 1 if args.batch_size > 1 else len(corpus.dictionary)
        train_data, seq_lens = batchify_padded(corpus.train, args.batch_size, args,
                                               ntokens, eos_tokens)
    else:
        ntokens = len(corpus.dictionary)
        train_data = batchify(corpus.train, args.batch_size, args)
    val_data = batchify(corpus.valid, eval_batch_size, args)
    test_data = batchify(corpus.test, test_batch_size, args)

    ###############################################################################
    # Build the model
    ###############################################################################

    model = RNNModel(ntokens, args.emsize, args.nhid, args.dropout, args.dropouth,
                     args.dropouti, args.dropoute, args.wdrop, args.nsamples,
                     args.temperature, frequencies, args.no_bias, args.bias_reg,
                     args.dist_fn, args.activation_fn)
    ###
    if args.resume:
        print('Resuming model ...')
        model_load(args.resume)
        optimizer.param_groups[0]['lr'] = args.lr
        # NOTE(review): `args.dropoute = args.dropoute` is a no-op
        # self-assignment -- the resumed model's embedding dropout is never
        # updated; this was probably meant to be `model.dropoute`.
        model.dropouti, model.dropouth, model.dropout, args.dropoute = args.dropouti, args.dropouth, args.dropout, args.dropoute
    ###
    if args.cuda:
        model = model.cuda()
    ###
    # NOTE(review): this count is wrong for parameters with >2 dims;
    # sum(p.nelement()) would be exact.
    params = list(model.parameters())
    total_params = sum(x.size()[0] * x.size()[1] if len(x.size()) > 1 else x.size()[0]
                       for x in params if x.size())
    print('Args:', args)
    print('Model total parameters:', total_params)

    ###############################################################################
    # Training code
    ###############################################################################

    def evaluate(data_source, epoch, batch_size=1):
        # Turn on evaluation mode which disables dropout.
        model.eval()
        # Evaluation is delegated to the model; optionally dump hidden states,
        # embedding weights, and per-token entropy for analysis.
        if args.dump_hiddens:
            loss, entropy, hiddens = model.evaluate(data_source, eos_tokens,
                                                    args.dump_hiddens)
            dump_hiddens(hiddens, 'hiddens_' + str(epoch))
        else:
            loss, entropy = model.evaluate(data_source, eos_tokens)
        if args.dump_words:
            dump_words(model.encoder.weight.detach().cpu().numpy(),
                       'words_' + str(epoch))
        if not args.dump_entropy is None:
            dump(entropy, args.dump_entropy + str(epoch))
        return loss

    def train():
        # Turn on training mode which enables dropout.
        total_loss, avrg_loss = 0, 0
        start_time = time.time()
        ntokens = len(corpus.dictionary)
        batch, i = 0, 0
        hidden = model.init_hidden(args.batch_size)
        while i < train_data.size(0)-1:
            if args.reinit_h:
                # Padded mode: sequence lengths are fixed by the batching.
                seq_len = seq_lens[batch] - 1
            else:
                # Occasionally halve the BPTT window, then jitter it.
                bptt = args.bptt if np.random.random() < 0.95 else args.bptt / 2.
                # Prevent excessively small or negative sequence lengths
                seq_len = max(5, int(np.random.normal(bptt, 5)))
                # prevent negative sequence lengths
                # There's a very small chance that it could select a very long sequence length resulting in OOM
                # seq_len = min(seq_len, args.bptt + 10)

            # Rescale the LR to the actual window so each token contributes
            # equally, restore it after the step.
            lr2 = optimizer.param_groups[0]['lr']
            optimizer.param_groups[0]['lr'] = lr2 * seq_len / args.bptt
            model.train()
            data = get_batch(train_data, i, args, seq_len=seq_len)

            # Starting each batch, we detach the hidden state from how it was previously produced.
            # If we didn't, the model would try backpropagating all the way to start of the dataset.
            reset_hidden = args.reinit_h
            if reset_hidden:
                hidden = model.init_hidden(args.batch_size)
            hidden = repackage_hidden(hidden)
            optimizer.zero_grad()

            #raw_loss = model.train_crossentropy(data, eos_tokens)
            raw_loss, hidden = model(data, hidden)

            loss = raw_loss
            '''
            See what we can do here! We don't need the regularization as it is implicit!
            # Activiation Regularization
            if args.alpha:
                loss = loss + sum(args.alpha * dropped_rnn_h.pow(2).mean() for dropped_rnn_h in dropped_rnn_hs[-1:])
            # Temporal Activation Regularization (slowness)
            if args.beta:
                loss = loss + sum(args.beta * (rnn_h[1:] - rnn_h[:-1]).pow(2).mean() for rnn_h in rnn_hs[-1:])
            '''
            loss.backward()

            # `clip_grad_norm` helps prevent the exploding gradient problem in RNNs / LSTMs.
            if args.clip:
                torch.nn.utils.clip_grad_norm_(params, args.clip)
            optimizer.step()

            total_loss += loss.data
            optimizer.param_groups[0]['lr'] = lr2
            if batch % args.log_interval == 0 and batch > 0:
                cur_loss = total_loss.item() / args.log_interval
                elapsed = time.time() - start_time
                # NOTE(review): the 'ppl' slot prints cur_loss, not
                # math.exp(cur_loss) -- looks unintended; confirm before fixing.
                print('| epoch {:3d} | {:5d}/{:5d} batches | lr {:05.5f} | ms/batch {:5.2f} | '
                      'loss {:5.2f} | ppl {:8.2f} | bpc {:8.3f}'.format(
                          epoch, batch, len(train_data) // args.bptt,
                          optimizer.param_groups[0]['lr'],
                          elapsed * 1000 / args.log_interval, cur_loss, cur_loss,
                          cur_loss / math.log(2)))
                avrg_loss = avrg_loss + total_loss
                total_loss = 0
                start_time = time.time()
            ###
            batch += 1
            i += seq_len + 1
        return avrg_loss / train_data.size(0)

    # Loop over epochs.
    lr = args.lr
    best_val_loss = []
    valid_loss = []
    stored_loss = 100000000

    # At any point you can hit Ctrl + C to break out of training early.
    try:
        optimizer = None
        # Ensure the optimizer is optimizing params, which includes both the
        # model's weights as well as the criterion's weight (i.e. Adaptive Softmax)
        if args.optimizer == 'sgd':
            optimizer = torch.optim.SGD(params, lr=args.lr, weight_decay=args.wdecay)
        if args.optimizer == 'adam':
            optimizer = torch.optim.Adam(params, lr=args.lr, weight_decay=args.wdecay)
        for epoch in range(1, args.epochs+1):
            epoch_start_time = time.time()
            train_loss = train()

            # Log the largest singular value of the recurrent weights
            # (spectral-radius diagnostic).
            _, s, _ = np.linalg.svd(model.rnn.module.weight_hh_l0.cpu().detach().numpy())
            print(s[0])
            #dump(model.decoder.bias.cpu().detach().numpy(), 'bias_' + str(epoch) +'.out')

            # skip to beginning if not in evaluation mode
            if epoch % args.evaluate_every > 0:
                print('-' * 89)
                print('| end of epoch {:3d} | time: {:5.2f}s | train loss {:5.2f} |'.format(
                    epoch, (time.time() - epoch_start_time), train_loss))
                print('-' * 89)
                continue

            # evaluate validation loss
            if 't0' in optimizer.param_groups[0]:
                # ASGD: temporarily swap in the averaged weights ('ax') for
                # evaluation, then restore the raw weights afterwards.
                tmp = {}
                for prm in model.parameters():
                    #if 'ax' in optimizer.state[prm]:
                    tmp[prm] = prm.data.clone()
                    if 'ax' in optimizer.state[prm]:
                        prm.data = optimizer.state[prm]['ax'].clone()

                val_loss2 = evaluate(val_data, epoch)
                valid_loss.append(val_loss2)
                print('-' * 89)
                print('| end of epoch {:3d} | time: {:5.2f}s | valid loss {:5.2f} | '
                      'valid ppl {:8.2f} | valid bpc {:8.3f}'.format(
                          epoch, (time.time() - epoch_start_time), val_loss2,
                          math.exp(val_loss2), val_loss2 / math.log(2)))
                print('-' * 89)

                if val_loss2 < stored_loss:
                    model_save(args.save)
                    print('Saving Averaged!')
                    stored_loss = val_loss2

                for prm in model.parameters():
                    prm.data = tmp[prm].clone()
            else:
                val_loss = evaluate(val_data, epoch, eval_batch_size)
                valid_loss.append(val_loss)
                print('-' * 89)
                print('| end of epoch {:3d} | time: {:5.2f}s | valid loss {:5.2f} | '
                      'valid ppl {:8.2f} | valid bpc {:8.3f}'.format(
                          epoch, (time.time() - epoch_start_time), val_loss,
                          math.exp(val_loss), val_loss / math.log(2)))
                print('-' * 89)

                if val_loss < stored_loss:
                    model_save(args.save)
                    print('Saving model (new best validation)')
                    stored_loss = val_loss

                # Switch to ASGD when validation stops improving for
                # `nonmono` epochs (non-monotone trigger).
                if args.optimizer == 'sgd' and 't0' not in optimizer.param_groups[0] and (len(best_val_loss)>args.nonmono and val_loss > min(best_val_loss[:-args.nonmono])):
                    print('Switching to ASGD')
                    optimizer = torch.optim.ASGD(model.parameters(), lr=args.lr,
                                                 t0=0, lambd=0., weight_decay=args.wdecay)

                # Scheduled LR decay at user-specified epochs.
                if epoch in args.when:
                    print('Saving model before learning rate decreased')
                    model_save('{}.e{}'.format(args.save, epoch))
                    print('Dividing learning rate by 10')
                    optimizer.param_groups[0]['lr'] /= 10.

                best_val_loss.append(val_loss)

    except KeyboardInterrupt:
        print('-' * 89)
        print('Exiting from training early')

    # Load the best saved model.
    model_load(args.save)

    # Run on test data.
    test_loss = evaluate(test_data, args.epochs+1, test_batch_size)
    print('=' * 89)
    print('| End of training | test loss {:5.2f} | test ppl {:8.2f} | test bpc {:8.3f}'.format(
        test_loss, math.exp(test_loss), test_loss / math.log(2)))
    print('=' * 89)

    return np.array(valid_loss), test_loss
tieweights=args.tieweights)
# NOTE(review): the call closed above begins before this chunk; its receiver
# (presumably `LMModel = <ModelClass>(..., tieweights=...)`) is not visible.
LMModel.load_state_dict(LMModel_start.state_dict())  # copy initial weights from LMModel_start
# LMModel = torch.load(args.save).cpu()
model_size = sum(p.nelement() for p in LMModel.parameters())
logging('-' * 30, f_log=f_log)
# NOTE(review): 'tatal' is a typo inside a runtime log message; left
# unchanged here since this edit only touches comments.
logging(f'Model tatal parameters: {model_size}', f_log=f_log)
logging('-' * 30, f_log=f_log)
# print('-' * 30)
# print(f'Model tatal parameters: {model_size}')
# print('-' * 30)
# NOTE(review): `is not 'cpu'` / `is not 'off'` compare object identity, not
# equality -- a latent bug (SyntaxWarning on modern CPython); should be `!=`.
if torch.cuda.is_available() and cuda_device is not 'cpu':
    LMModel = LMModel.cuda(cuda_device)
LMModel_parallel = None
if torch.cuda.is_available() and args.devids is not 'off':
    # Wrap for multi-GPU; dim=1 scatters along the batch axis, which implies
    # a sequence-first (seq, batch, ...) layout -- TODO confirm.
    LMModel_parallel = torch.nn.DataParallel(LMModel, device_ids=device_ids,
                                             output_device=output_device, dim=1)
    # .cuda() is necessary if LMModel was not on any GPU device
    # LMModel_parallel._modules['module'].lstm.flatten_parameters()
if args.optim == 'SGD':
    optimizer = optim.SGD(LMModel.parameters(), lr=args.lr,
                          momentum=args.momentum, weight_decay=args.wd)
    def __init__(self, save_path, seed, batch_size, grad_clip, config='eval'):
        """Set up data, hyper-parameters and the DARTS RNN model.

        Args:
            save_path: checkpoint directory; 'model.pt' inside it is probed
                for resuming.
            seed: RNG seed applied to numpy and torch (CPU and all GPUs).
            batch_size: training (and small) batch size.
            grad_clip: gradient-clipping threshold, stored as args.clip.
            config: 'search' (small net) or 'eval' (large net) preset.
        """
        # Only these five fields differ between the two presets.
        if config == 'search':
            args = {
                'emsize': 300,
                'nhid': 300,
                'nhidlast': 300,
                'dropoute': 0,
                'wdecay': 5e-7
            }
        elif config == 'eval':
            args = {
                'emsize': 850,
                'nhid': 850,
                'nhidlast': 850,
                'dropoute': 0.1,
                'wdecay': 8e-7
            }
        args['config'] = config
        # NOTE(review): hard-coded machine-specific dataset path.
        args['data'] = '/home/liamli4465/darts/data/penn'
        args['lr'] = 20
        args['clip'] = grad_clip
        args['batch_size'] = batch_size
        args['search_batch_size'] = 256 * 4
        args['small_batch_size'] = batch_size
        args['bptt'] = 35
        args['dropout'] = 0.75
        args['dropouth'] = 0.25
        args['dropoutx'] = 0.75
        args['dropouti'] = 0.2
        args['seed'] = seed
        args['nonmono'] = 5
        args['log_interval'] = 50
        args['save'] = save_path
        args['alpha'] = 0
        args['beta'] = 1e-3
        args['max_seq_length_delta'] = 20
        args['unrolled'] = True
        args['gpu'] = 0
        args['cuda'] = True
        args = AttrDict(args)  # attribute-style access: args.lr, args.bptt, ...
        self.args = args
        self.seed = seed

        # Seed every RNG and pin the GPU before any allocation happens.
        np.random.seed(args.seed)
        torch.manual_seed(args.seed)
        torch.cuda.set_device(args.gpu)
        cudnn.benchmark = True
        cudnn.enabled = True
        torch.cuda.manual_seed_all(args.seed)

        corpus = data.Corpus(args.data)
        self.corpus = corpus

        eval_batch_size = 10
        test_batch_size = 1
        self.train_data = batchify(corpus.train, args.batch_size, args)
        self.search_data = batchify(corpus.valid, args.search_batch_size, args)
        self.val_data = batchify(corpus.valid, eval_batch_size, args)
        self.test_data = batchify(corpus.test, test_batch_size, args)

        # Training-loop bookkeeping.
        self.batch = 0
        self.steps = 0
        self.epochs = 0
        self.total_loss = 0
        self.start_time = time.time()

        ntokens = len(corpus.dictionary)
        # if args.continue_train:
        #     model = torch.load(os.path.join(args.save, 'model.pt'))
        # Prefer resuming from a saved checkpoint; fall back to a fresh model
        # with the fixed DARTS genotype on any failure.
        try:
            model = torch.load(os.path.join(args.save, 'model.pt'))
            print('Loaded model from checkpoint')
        except Exception as e:
            print(e)
            model = RNNModel(ntokens, args.emsize, args.nhid, args.nhidlast,
                             args.dropout, args.dropouth, args.dropoutx,
                             args.dropouti, args.dropoute,
                             genotype=genotypes.DARTS)

        size = 0
        for p in model.parameters():
            size += p.nelement()
        logging.info('param size: {}'.format(size))
        logging.info('initial genotype:')
        logging.info(model.rnns[0].genotype)
        total_params = sum(x.data.nelement() for x in model.parameters())
        logging.info('Args: {}'.format(args))
        logging.info('Model total parameters: {}'.format(total_params))

        self.model = model.cuda()
        self.optimizer = torch.optim.SGD(model.parameters(), lr=args.lr,
                                         weight_decay=args.wdecay)
class DartsTrainer():
    """Trains and evaluates a fixed-genotype DARTS RNN language model.

    Built from an `arm` dict carrying 'seed', 'dir' (save path) and
    'genotype'; all other hyper-parameters are hard-coded presets.
    """

    def __init__(self, arm):
        # Default params for eval network
        args = {
            'emsize': 850,
            'nhid': 850,
            'nhidlast': 850,
            'dropoute': 0.1,
            'wdecay': 8e-7
        }
        # NOTE(review): hard-coded machine-specific dataset path.
        args['data'] = '/home/liamli4465/darts/data/penn'
        args['lr'] = 20
        args['clip'] = 0.25
        args['batch_size'] = 64
        args['search_batch_size'] = 256 * 4
        args['small_batch_size'] = 64
        args['bptt'] = 35
        args['dropout'] = 0.75
        args['dropouth'] = 0.25
        args['dropoutx'] = 0.75
        args['dropouti'] = 0.2
        args['seed'] = arm['seed']
        args['nonmono'] = 5
        args['log_interval'] = 50
        args['save'] = arm['dir']
        args['alpha'] = 0
        args['beta'] = 1e-3
        args['max_seq_length_delta'] = 20
        args['unrolled'] = True
        args['gpu'] = 0
        args['cuda'] = True
        args['genotype'] = arm['genotype']
        args = AttrDict(args)  # attribute-style access: args.lr, args.bptt, ...
        self.args = args
        self.epoch = 0

        # Seed every RNG and pin the GPU before any allocation.
        np.random.seed(args.seed)
        torch.manual_seed(args.seed)
        torch.cuda.set_device(args.gpu)
        cudnn.benchmark = True
        cudnn.enabled = True
        torch.cuda.manual_seed_all(args.seed)

        corpus = data.Corpus(args.data)
        self.corpus = corpus

        self.eval_batch_size = 10
        self.test_batch_size = 1

        self.train_data = batchify(corpus.train, args.batch_size, args)
        self.search_data = batchify(corpus.valid, args.search_batch_size, args)
        self.val_data = batchify(corpus.valid, self.eval_batch_size, args)
        self.test_data = batchify(corpus.test, self.test_batch_size, args)

        self.ntokens = len(corpus.dictionary)

    def model_save(self, fn, to_save):
        # Every 150 epochs also keep a numbered snapshot of the incumbent.
        if self.epoch % 150 == 0:
            with open(
                    os.path.join(self.args.save,
                                 "checkpoint-incumbent-%d" % self.epoch),
                    'wb') as f:
                torch.save(to_save, f)
        with open(fn, 'wb') as f:
            torch.save(to_save, f)

    def model_load(self, fn):
        # Restores model, optimizer and both CPU/CUDA RNG states so training
        # resumes deterministically.
        with open(fn, 'rb') as f:
            self.model, self.optimizer, rng_state, cuda_state = torch.load(f)
            torch.set_rng_state(rng_state)
            torch.cuda.set_rng_state(cuda_state)

    def model_resume(self, filename):
        logging.info('Resuming model from %s' % filename)
        self.model_load(filename)
        self.optimizer.param_groups[0]['lr'] = self.args.lr
        # Re-apply the arm's genotype to every recurrent cell.
        for rnn in self.model.rnns:
            rnn.genotype = self.args.genotype

    def train_epochs(self, epochs):
        """Train for `epochs` epochs; returns (0, valid ppl, test ppl).

        On any exception during training, returns (0, 10000, 10000) as a
        sentinel failure result.
        """
        args = self.args
        resume_filename = os.path.join(self.args.save, "checkpoint.incumbent")
        if os.path.exists(resume_filename):
            self.model_resume(resume_filename)
            logging.info('Loaded model from checkpoint')
        else:
            self.model = RNNModel(self.ntokens, args.emsize, args.nhid,
                                  args.nhidlast, args.dropout, args.dropouth,
                                  args.dropoutx, args.dropouti, args.dropoute,
                                  genotype=args.genotype)
            self.optimizer = torch.optim.SGD(self.model.parameters(),
                                             lr=args.lr,
                                             weight_decay=args.wdecay)

        size = 0
        for p in self.model.parameters():
            size += p.nelement()
        logging.info('param size: {}'.format(size))
        logging.info('initial genotype:')
        logging.info(self.model.rnns[0].genotype)
        total_params = sum(x.data.nelement() for x in self.model.parameters())
        logging.info('Args: {}'.format(args))
        logging.info('Model total parameters: {}'.format(total_params))
        self.model = self.model.cuda()

        # Loop over epochs.
        lr = args.lr
        best_val_loss = []
        stored_loss = 100000000

        # At any point you can hit Ctrl + C to break out of training early.
        try:
            for epoch in range(epochs):
                epoch_start_time = time.time()
                self.train()
                if 't0' in self.optimizer.param_groups[0]:
                    # ASGD: evaluate with the averaged weights ('ax'),
                    # restoring the raw weights afterwards.
                    tmp = {}
                    for prm in self.model.parameters():
                        tmp[prm] = prm.data.clone()
                        prm.data = self.optimizer.state[prm]['ax'].clone()

                    val_loss2 = self.evaluate(self.val_data)
                    logging.info('-' * 89)
                    logging.info(
                        '| end of epoch {:3d} | time: {:5.2f}s | valid loss {:5.2f} | '
                        'valid ppl {:8.2f} | valid bpc {:8.3f}'.format(
                            self.epoch, (time.time() - epoch_start_time),
                            val_loss2, math.exp(val_loss2),
                            val_loss2 / math.log(2)))
                    logging.info('-' * 89)

                    if val_loss2 < stored_loss:
                        self.model_save(
                            os.path.join(args.save, 'checkpoint.incumbent'), [
                                self.model, self.optimizer,
                                torch.get_rng_state(),
                                torch.cuda.get_rng_state()
                            ])
                        logging.info('Saving Averaged!')
                        stored_loss = val_loss2

                    for prm in self.model.parameters():
                        prm.data = tmp[prm].clone()
                else:
                    val_loss = self.evaluate(self.val_data, self.eval_batch_size)
                    logging.info('-' * 89)
                    logging.info(
                        '| end of epoch {:3d} | time: {:5.2f}s | valid loss {:5.2f} | '
                        'valid ppl {:8.2f} | valid bpc {:8.3f}'.format(
                            self.epoch, (time.time() - epoch_start_time),
                            val_loss, math.exp(val_loss),
                            val_loss / math.log(2)))
                    logging.info('-' * 89)

                    if val_loss < stored_loss:
                        self.model_save(
                            os.path.join(args.save, 'checkpoint.incumbent'), [
                                self.model, self.optimizer,
                                torch.get_rng_state(),
                                torch.cuda.get_rng_state()
                            ])
                        logging.info('Saving model (new best validation)')
                        stored_loss = val_loss

                    # After epoch 75, switch to ASGD once validation stops
                    # improving for `nonmono` epochs.
                    if (self.epoch > 75 and
                            't0' not in self.optimizer.param_groups[0] and
                            (len(best_val_loss) > args.nonmono and
                             val_loss > min(best_val_loss[:-args.nonmono]))):
                        logging.info('Switching to ASGD')
                        self.optimizer = torch.optim.ASGD(
                            self.model.parameters(),
                            lr=args.lr,
                            t0=0,
                            lambd=0.,
                            weight_decay=args.wdecay)
                    best_val_loss.append(val_loss)
        except Exception as e:
            # NOTE(review): very broad catch -- any bug in training is
            # silently converted into the sentinel result below.
            logging.info('-' * 89)
            logging.info(e)
            logging.info('Exiting from training early')
            return 0, 10000, 10000

        # Load the best saved model.
        self.model_load(os.path.join(args.save, 'checkpoint.incumbent'))

        # Run on test data.
        val_loss = self.evaluate(self.val_data, self.eval_batch_size)
        logging.info(math.exp(val_loss))
        test_loss = self.evaluate(self.test_data, self.test_batch_size)
        logging.info('=' * 89)
        logging.info(
            '| End of training | test loss {:5.2f} | test ppl {:8.2f} | test bpc {:8.3f}'
            .format(test_loss, math.exp(test_loss), test_loss / math.log(2)))
        logging.info('=' * 89)

        return 0, math.exp(val_loss), math.exp(test_loss)

    def train(self):
        """One training epoch over self.train_data with variable-length BPTT.

        The batch is processed in `small_batch_size` slices, each with its
        own hidden state, so gradients accumulate before one optimizer step.
        """
        args = self.args
        corpus = self.corpus
        total_loss = 0
        start_time = time.time()
        # One hidden state per small-batch slice.
        hidden = [
            self.model.init_hidden(args.small_batch_size)
            for _ in range(args.batch_size // args.small_batch_size)
        ]
        batch, i = 0, 0
        while i < self.train_data.size(0) - 1 - 1:
            # Occasionally halve the BPTT window, then jitter it.
            bptt = args.bptt if np.random.random() < 0.95 else args.bptt / 2.
            # Prevent excessively small or negative sequence lengths
            seq_len = max(5, int(np.random.normal(bptt, 5)))
            # There's a very small chance that it could select a very long sequence length resulting in OOM
            seq_len = min(seq_len, args.bptt + args.max_seq_length_delta)

            # Scale LR to the actual window length; restored after the step.
            lr2 = self.optimizer.param_groups[0]['lr']
            self.optimizer.param_groups[0]['lr'] = lr2 * seq_len / args.bptt
            self.model.train()

            data, targets = get_batch(self.train_data, i, args, seq_len=seq_len)

            self.optimizer.zero_grad()

            start, end, s_id = 0, args.small_batch_size, 0
            while start < args.batch_size:
                cur_data, cur_targets = data[:, start: end], targets[:, start: end].contiguous().view(-1)

                # Starting each batch, we detach the hidden state from how it was previously produced.
                # If we didn't, the model would try backpropagating all the way to start of the dataset.
                hidden[s_id] = repackage_hidden(hidden[s_id])

                log_prob, hidden[s_id], rnn_hs, dropped_rnn_hs = self.model(
                    cur_data, hidden[s_id], return_h=True)
                raw_loss = nn.functional.nll_loss(
                    log_prob.view(-1, log_prob.size(2)), cur_targets)

                loss = raw_loss
                # Activiation Regularization
                if args.alpha > 0:
                    loss = loss + sum(
                        args.alpha * dropped_rnn_h.pow(2).mean()
                        for dropped_rnn_h in dropped_rnn_hs[-1:])
                # Temporal Activation Regularization (slowness)
                loss = loss + sum(args.beta * (rnn_h[1:] - rnn_h[:-1]).pow(2).mean()
                                  for rnn_h in rnn_hs[-1:])
                # Weight each slice so the accumulated gradient matches a
                # full-batch step.
                loss *= args.small_batch_size / args.batch_size
                total_loss += raw_loss.data * args.small_batch_size / args.batch_size
                loss.backward()

                s_id += 1
                start = end
                end = start + args.small_batch_size

                gc.collect()

            # `clip_grad_norm` helps prevent the exploding gradient problem in RNNs.
            # NOTE(review): non-in-place `clip_grad_norm` is deprecated in
            # modern PyTorch; clip_grad_norm_ is the current API.
            torch.nn.utils.clip_grad_norm(self.model.parameters(), args.clip)
            self.optimizer.step()

            # total_loss += raw_loss.data
            self.optimizer.param_groups[0]['lr'] = lr2

            # NOTE(review): indexing a 0-dim tensor with [0] and the bare
            # `raise` (no active exception) both fail on modern PyTorch/Python;
            # total_loss.item() plus an explicit exception would be correct.
            if np.isnan(total_loss[0]):
                raise
            #if batch % args.log_interval == 0 and batch > 0:
            #    cur_loss = total_loss[0] / args.log_interval
            #    elapsed = time.time() - start_time
            #    logging.info('| epoch {:3d} | {:5d}/{:5d} batches | lr {:02.2f} | ms/batch {:5.2f} | '
            #            'loss {:5.2f} | ppl {:8.2f}'.format(
            #        self.epoch, batch, len(self.train_data) // args.bptt, self.optimizer.param_groups[0]['lr'],
            #        elapsed * 1000 / args.log_interval, cur_loss, math.exp(cur_loss)))
            #    total_loss = 0
            #    start_time = time.time()

            batch += 1
            i += seq_len
        self.epoch += 1

    def evaluate(self, data_source, batch_size=10):
        # Turn on evaluation mode which disables dropout.
        self.model.eval()
        total_loss = 0
        hidden = self.model.init_hidden(batch_size)
        for i in range(0, data_source.size(0) - 1, self.args.bptt):
            data, targets = get_batch(data_source, i, self.args, evaluation=True)
            targets = targets.view(-1)
            log_prob, hidden = self.model(data, hidden)
            # Length-weighted NLL so uneven final windows average correctly.
            loss = nn.functional.nll_loss(log_prob.view(-1, log_prob.size(2)),
                                          targets).data
            total_loss += loss * len(data)
            hidden = repackage_hidden(hidden)
        # NOTE(review): total_loss is 0-dim here on modern PyTorch;
        # total_loss.item() would be the current-API equivalent of [0].
        return total_loss[0] / len(data_source)
def train():
    """Train the RNN language model with manual SGD and LR decay.

    Loads the corpus from the module-level `train_dir`, trains for
    `config.num_epochs` epochs, decays the learning rate by 4x after each
    epoch, periodically checkpoints to `save_dir`, and prints a generated
    sample after every epoch.

    Relies on module-level names: train_dir, use_cuda, save_dir, model_name,
    Corpus, Config, RNNModel, batchify, get_batch, repackage_hidden,
    get_time_dif, generate.
    """
    # Load data and configure the model.
    print("Loading data...")
    corpus = Corpus(train_dir)
    print(corpus)

    config = Config()
    config.vocab_size = len(corpus.dictionary)
    train_data = batchify(corpus.train, config.batch_size)
    train_len = train_data.size(0)
    seq_len = config.seq_len

    print("Configuring model...")
    model = RNNModel(config)
    if use_cuda:
        model.cuda()
    print(model)

    criterion = nn.CrossEntropyLoss()
    lr = config.learning_rate  # initial learning rate

    start_time = time.time()
    print("Training and generating...")
    for epoch in range(1, config.num_epochs + 1):  # multiple passes over the data
        total_loss = 0.0
        model.train()  # dropout is only active in training mode
        hidden = model.init_hidden(config.batch_size)  # fresh hidden state per epoch
        for ibatch, i in enumerate(range(0, train_len - 1, seq_len)):
            data, targets = get_batch(train_data, i, seq_len)  # fetch one batch
            # Detach the hidden state from the graph built for earlier
            # batches; otherwise backprop would reach the start of the dataset.
            hidden = repackage_hidden(hidden)
            model.zero_grad()

            output, hidden = model(data, hidden)
            loss = criterion(output.view(-1, config.vocab_size), targets)
            loss.backward()

            # `clip_grad_norm_` helps prevent the exploding-gradient problem
            # in RNNs/LSTMs. (Fixed: non-in-place `clip_grad_norm` is
            # deprecated/removed in current PyTorch.)
            torch.nn.utils.clip_grad_norm_(model.parameters(), config.clip)
            for p in model.parameters():  # manual SGD update
                # Fixed: `add_(-lr, grad)` positional-alpha form is deprecated.
                p.data.add_(p.grad.data, alpha=-lr)

            # Fixed: accumulate a Python float instead of a tensor so the
            # old 0-dim `total_loss[0]` indexing (removed API) is not needed.
            total_loss += loss.item()
            if ibatch % config.log_interval == 0 and ibatch > 0:  # periodic status line
                cur_loss = total_loss / config.log_interval
                elapsed = get_time_dif(start_time)
                print(
                    "Epoch {:3d}, {:5d}/{:5d} batches, lr {:2.3f}, loss {:5.2f}, ppl {:8.2f}, time {}"
                    .format(epoch, ibatch, train_len // seq_len, lr, cur_loss,
                            math.exp(cur_loss), elapsed))
                total_loss = 0.0

        lr /= 4.0  # decay the learning rate after each epoch

        # Checkpoint the weights every `save_interval` epochs.
        if epoch % config.save_interval == 0:
            torch.save(model.state_dict(),
                       os.path.join(save_dir, model_name.format(epoch)))

        print(''.join(generate(model, corpus.dictionary.idx2word)))
def build_model(resume):
    """Build the model according to CLI arguments

    Global Dependencies:
        - corpus
        - args

    Args:
        resume: path to a saved model to fine-tune; when non-empty, the
            loaded model is returned with only its ntoken-sized matrices
            left trainable.
    """
    if resume != "":
        model = torch.load(resume)
        # Freeze everything except the vocabulary-sized weight matrices
        # (embedding / output projection), which stay trainable.
        for param in model.parameters():
            param.requires_grad = False
            # BUG FIX: the original tested `param.shape[1] >= 1` before
            # checking the rank, which raises IndexError for any 1-D
            # parameter (e.g. a bias) whose first dim equals ntoken.
            if param.dim() > 1 and param.shape[0] == ntoken:
                param.requires_grad = True
            print(param.shape, param.requires_grad)
        return model

    # noise for noise sampling in NCE
    noise = build_unigram_noise(torch.FloatTensor(corpus.vocab.idx2count))

    norm_term = 'auto' if args.norm_term == -1 else args.norm_term
    # setting up NCELoss modules
    if args.index_module == 'linear':
        criterion = IndexLinear(
            args.nhid,
            ntoken,
            args.trick,
            noise=noise,
            noise_ratio=args.noise_ratio,
            norm_term=norm_term,
            theta=args.theta,
            loss_type=args.loss,
            reduction='none',
            sample_with_replacement=args.sample_with_replacement,
            grouping=args.sample_with_grouping)
        model = RNNModel(
            ntoken,
            args.emsize,
            args.nhid,
            args.nlayers,
            criterion=criterion,
            dropout=args.dropout,
        )
    elif args.index_module == 'gru':
        # IndexGRU only supports a single recurrent layer.
        if args.nlayers != 1:
            logger.warning(
                'Falling into one layer GRU due to Index_GRU supporting')
        nce_criterion = IndexGRU(
            ntoken,
            args.nhid,
            args.nhid,
            args.dropout,
            noise=noise,
            noise_ratio=args.noise_ratio,
            norm_term=norm_term,
        )
        model = GenModel(criterion=nce_criterion, )
    else:
        logger.error('The index module [%s] is not supported yet' %
                     args.index_module)
        raise (NotImplementedError('index module not supported'))

    if args.cuda:
        model.cuda()

    logger.info('model definition:\n %s', model)
    return model
def main():
    ''' Main function: parse CLI options, build data loaders, model,
    optimizer and criterion, then hand off to train(). '''
    parser = argparse.ArgumentParser()
    #parser.add_argument('-data', required=True)
    parser.add_argument('-torch_threads', type=int, default=25)
    parser.add_argument('-epoch', type=int, default=10)
    parser.add_argument('-batch_size', type=int, default=8)
    #parser.add_argument('-d_word_vec', type=int, default=512)
    parser.add_argument('-d_model', type=int, default=8)
    parser.add_argument('-d_inner_hid', type=int, default=8)
    parser.add_argument('-n_warmup_steps', type=int, default=3)
    parser.add_argument('-dropout', type=float, default=0.1)
    parser.add_argument('-embs_share_weight', action='store_true')
    parser.add_argument('-proj_share_weight', action='store_true')
    parser.add_argument('-log', default=None)
    parser.add_argument('-save_model', default='model')
    parser.add_argument('-save_mode', type=str, choices=['all', 'best'], default='best')
    parser.add_argument('-no_cuda', action='store_true')
    # use social network; need features or deepwalk embeddings as initial input
    parser.add_argument('-network', type=int, default=0)
    parser.add_argument('-pos_emb', type=int, default=1)
    parser.add_argument('-warmup', type=int, default=3)  # warmup epochs
    parser.add_argument('-notes', default='')
    parser.add_argument('-data_name', default='twitter')
    opt = parser.parse_args()

    # Fix: honour the parsed -torch_threads option. Previously the thread
    # count was hard-coded to 25 *before* argument parsing, so the CLI
    # option was silently ignored.
    torch.set_num_threads(opt.torch_threads)

    opt.cuda = not opt.no_cuda
    opt.d_word_vec = opt.d_model
    # Convert the 0/1 integer flags to booleans.
    opt.network = opt.network == 1
    opt.pos_emb = opt.pos_emb == 1
    print(opt.notes)

    #========= Preparing DataLoader =========#
    # data=0/1/2 presumably selects the train/valid/test split —
    # TODO confirm against the DataLoader implementation.
    train_data = DataLoader(opt.data_name, data=0, load_dict=True,
                            batch_size=opt.batch_size, cuda=opt.cuda,
                            loadNE=opt.network)
    valid_data = DataLoader(opt.data_name, data=1, batch_size=opt.batch_size,
                            cuda=opt.cuda, loadNE=opt.network)
    test_data = DataLoader(opt.data_name, data=2, batch_size=opt.batch_size,
                           cuda=opt.cuda, loadNE=opt.network)

    opt.user_size = train_data.user_size
    if opt.network:
        # Attach graph structure / pretrained embeddings for the social network.
        opt.net = train_data._adj_list
        opt.net_dict = train_data._adj_dict_list
        opt.embeds = train_data._embeds

    #========= Preparing Model =========#
    decoder = RNNModel('GRUCell', opt)
    RLLearner = RRModel(decoder)

    optimizer = ScheduledOptim(
        optim.Adam(RLLearner.parameters(), betas=(0.9, 0.98), eps=1e-09),
        opt.d_model, opt.n_warmup_steps)

    def get_criterion(user_size):
        ''' With PAD token zero weight '''
        weight = torch.ones(user_size)
        weight[Constants.PAD] = 0
        weight[Constants.EOS] = 1
        # NOTE(review): size_average=False is deprecated in modern PyTorch
        # (use reduction='sum'); kept as-is for the pinned torch version.
        return nn.CrossEntropyLoss(weight, size_average=False)

    crit = get_criterion(train_data.user_size)

    if opt.cuda:
        decoder = decoder.cuda()
        RLLearner = RLLearner.cuda()
        crit = crit.cuda()

    train(RLLearner, train_data, valid_data, test_data, crit, optimizer, opt)
def __init__(self, save_path,seed=111,val_interval = 20, val_times=1,controller = None, batch_size=128, grad_clip=0.1, config='eval'):
    """Set up a DARTS-style language-model evaluation/training harness on PTB.

    Builds a fixed hyperparameter bundle, seeds all RNGs, loads the corpus,
    batchifies train/search/val splits, constructs the RNNModel with the
    DARTS genotype and an SGD optimizer.

    Args:
        save_path: directory used as args.save for checkpoints/logs.
        seed: RNG seed for numpy/torch (CPU and CUDA).
        val_interval: logging interval (stored as args.log_interval).
        val_times: number of validation passes (stored as args.val_times).
        controller: external architecture controller, kept on self (may be None).
        batch_size: train batch size (also small_batch_size).
        grad_clip: gradient clipping norm (stored as args.clip).
        config: configuration tag string, default 'eval'.
    """
    # Fixed hyperparameters; the remaining keys are filled in from the ctor args.
    args = {'emsize':850, 'nhid':850, 'nhidlast':850,
                 'dropoute':0.1, 'wdecay':8e-7}
    args['config'] = config
    args['data'] = '../data/penn'  # hard-coded PTB corpus location
    args['lr'] = 20
    args['clip'] = grad_clip
    args['batch_size'] = batch_size
    args['search_batch_size'] = 256*4
    args['small_batch_size'] = batch_size
    args['bptt'] = 35  # sequence length for truncated BPTT
    args['dropout'] = 0.75
    args['dropouth'] = 0.25
    args['dropoutx'] = 0.75
    args['dropouti'] = 0.2
    args['seed'] = seed
    args['nonmono'] = 5
    args['log_interval'] = val_interval
    args['val_times'] = val_times
    args['save'] = save_path
    args['alpha'] = 0
    args['beta'] = 1e-3
    args['max_seq_length_delta'] = 20
    args['unrolled'] = True
    args['gpu'] = 0
    args['cuda'] = True
    args = AttrDict(args)  # attribute-style access to the dict
    self.args = args
    self.seed = seed
    self.controller = controller

    # Seed every RNG source for reproducibility; requires a CUDA device.
    np.random.seed(args.seed)
    torch.manual_seed(args.seed)
    torch.cuda.set_device(args.gpu)
    cudnn.benchmark = True
    cudnn.enabled=True
    torch.cuda.manual_seed_all(args.seed)

    corpus = data.Corpus(args.data)
    self.corpus = corpus
    eval_batch_size = 64
    test_batch_size = 1  # NOTE(review): assigned but unused below
    args.eval_batch_size = eval_batch_size
    self.train_data = batchify(corpus.train, args.batch_size, args)
    self.search_data = batchify(corpus.valid, args.search_batch_size, args)
    # self.val_data = batchify(corpus.train[464794:], eval_batch_size, args)
    # self.test_data = batchify(corpus.test, test_batch_size, args)
    # raw_data = batchify(corpus.train, batch_size, None)
    # indx = np.arange(14524)
    # random.shuffle(indx)
    # self.train_data = raw_data[indx[0:int(14524/2)],:]
    # self.val_data = raw_data[indx[int(14524/2):],:]

    # Build validation data as overlapping bptt+1-length windows over the
    # valid split (batchified with batch size 1), concatenated along dim 1.
    raw_data = batchify(corpus.valid, 1, None)
    val_data = []
    for i in range(len(raw_data)-1-args.bptt):
        val_data.append(raw_data[i:i+args.bptt+1])
    val_data = torch.cat(val_data,1)
    self.val_data = val_data
    print(self.train_data.shape)
    print(self.search_data.shape)
    print(self.val_data.shape)

    # Bookkeeping counters.
    self.batch = 0
    self.steps = 0
    self.epochs = 0
    self.total_loss = 0
    self.start_time = time.time()

    ntokens = len(corpus.dictionary)
    #if args.continue_train:
    #    model = torch.load(os.path.join(args.save, 'model.pt'))
    # try:
    #     model = torch.load(os.path.join(args.save, 'model.pt'))
    #     print('Loaded model from checkpoint')
    # except Exception as e:
    #     print(e)
    # Build the model with the fixed DARTS genotype.
    model = RNNModel(ntokens, args.emsize, args.nhid, args.nhidlast,
                     args.dropout, args.dropouth, args.dropoutx,
                     args.dropouti, args.dropoute,
                     genotype=genotypes.DARTS)

    # Report parameter counts.
    size = 0
    for p in model.parameters():
        size += p.nelement()
    logging.info('param size: {}'.format(size))
    logging.info('initial genotype:')
    logging.info(model.rnns[0].genotype)
    total_params = sum(x.data.nelement() for x in model.parameters())
    logging.info('Args: {}'.format(args))
    logging.info('Model total parameters: {}'.format(total_params))

    self.model = model.cuda()
    self.optimizer = torch.optim.SGD(model.parameters(), lr=args.lr,
                                     weight_decay=args.wdecay)
else:
    # Log to a file under args.out (the matching `if` branch — presumably
    # stdout logging — is outside this chunk).
    logging.basicConfig(format='%(asctime)s: %(message)s', datefmt='%H:%M:%S',
                        filename=os.path.join(args.out, 'train.log'),
                        level=logging.INFO)
# Point TensorBoard logging at the output directory.
tb.configure(args.out)

# Fixed seeds for reproducibility.
random.seed(1024)
torch.manual_seed(1024)
torch.cuda.manual_seed_all(1024)

# Acoustic model: 123 input features, 62 output labels, 250 hidden units,
# 3 layers — TODO confirm these magic numbers against the dataset/label set.
model = RNNModel(123, 62, 250, 3, args.dropout, bidirectional=args.bi)
if args.init:
    # Warm-start from a saved state dict.
    model.load_state_dict(torch.load(args.init))
else:
    # NOTE(review): torch.nn.init.uniform is deprecated in favor of the
    # in-place uniform_; kept as-is for the pinned torch version.
    for param in model.parameters():
        torch.nn.init.uniform(param, -0.1, 0.1)
if args.cuda:
    model.cuda()

optimizer = torch.optim.SGD(model.parameters(), lr=args.lr, momentum=.9)
criterion = CTCLoss()

# data set
trainset = SequentialLoader('train', args.batch_size)
devset = SequentialLoader('dev', args.batch_size)

# Running counters: training iteration and cross-validation iteration.
tri = cvi = 0

def eval():
    # Evaluation helper (definition continues beyond this chunk).
    global cvi
    losses = []
    tacc = TokenAcc()
def main():
    """Train a word-level RNN/LSTM language model on a tab-separated corpus.

    Parses CLI args, optionally resumes field/model/optimizer state from a
    checkpoint, builds torchtext datasets and iterators, trains with
    per-epoch validation (checkpointing the best model), saves the final
    model to args.save, and reports test-set perplexity.
    """
    # Add ckp
    parser = argparse.ArgumentParser(
        description='PyTorch PennTreeBank RNN/LSTM Language Model')
    parser.add_argument(
        '--data',
        type=str,
        default='/input',  # /input
        help='location of the data corpus')
    parser.add_argument('--checkpoint',
                        type=str,
                        default='',
                        help='model checkpoint to use')
    parser.add_argument(
        '--model',
        type=str,
        default='LSTM',
        help='type of recurrent net (RNN_TANH, RNN_RELU, LSTM, GRU)')
    parser.add_argument('--emsize',
                        type=int,
                        default=200,
                        help='size of word embeddings')
    parser.add_argument('--nhid',
                        type=int,
                        default=200,
                        help='number of hidden units per layer')
    parser.add_argument('--nlayers',
                        type=int,
                        default=2,
                        help='number of layers')
    parser.add_argument('--lr',
                        type=float,
                        default=0.001,
                        help='initial learning rate')
    parser.add_argument('--clip',
                        type=float,
                        default=0.25,
                        help='gradient clipping')
    parser.add_argument('--epochs',
                        type=int,
                        default=40,
                        help='upper epoch limit')
    parser.add_argument('--batch_size',
                        type=int,
                        default=256,
                        metavar='N',
                        help='batch size')
    parser.add_argument('--dropout',
                        type=float,
                        default=0.2,
                        help='dropout applied to layers (0 = no dropout)')
    parser.add_argument('--tied',
                        action='store_true',
                        help='tie the word embedding and softmax weights')
    parser.add_argument('--seed', type=int, default=1111, help='random seed')
    parser.add_argument('--cuda', action='store_true', help='use CUDA')
    parser.add_argument('--log-interval',
                        type=int,
                        default=200,
                        metavar='N',
                        help='report interval')
    parser.add_argument(
        '--save',
        type=str,
        default='/output/model.pt',  # /output
        help='path to save the final model')
    args = parser.parse_args()

    # Set the random seed manually for reproducibility.
    torch.manual_seed(args.seed)
    if torch.cuda.is_available():
        if not args.cuda:
            print(
                "WARNING: You have a CUDA device, so you should probably run with --cuda"
            )
        else:
            torch.cuda.manual_seed(args.seed)

    # Load checkpoint (field vocabulary, model weights, optimizer, epoch).
    build_vocab = False
    if args.checkpoint != '' and os.path.exists(args.checkpoint):
        print(f'Loading field from {args.checkpoint}')
        save_dict = torch.load(args.checkpoint)
        field = save_dict['field']
        start_epoch = save_dict['start_epoch']
    else:
        save_dict = None
        field = Field(tokenize=split_tokenize, init_token='<init>')
        build_vocab = True
        start_epoch = 0

    ###############################################################################
    # Load data
    ###############################################################################
    train_data, val_data, test_data = TabularDataset.splits(
        path=args.data,
        train='train.txt',
        validation='valid.txt',
        test='test.txt',
        format='tsv',
        fields=[('text', field)])
    print(train_data, len(train_data), val_data, len(val_data), test_data,
          len(test_data))
    if build_vocab:
        # Temporarily set eos_token so '<eos>' enters the vocab, but unset it
        # afterwards because targets append eos manually (see make_target).
        field.eos_token = '<eos>'
        field.build_vocab(train_data, val_data, min_freq=1000)
        field.eos_token = None
    eos_id = field.vocab.stoi['<eos>']
    pad_id = field.vocab.stoi[field.pad_token]

    train_iter = BucketIterator(train_data,
                                args.batch_size,
                                train=True,
                                repeat=False,
                                device='cuda:0' if args.cuda else 'cpu:0')
    val_iter = Iterator(val_data,
                        args.batch_size,
                        repeat=False,
                        device='cuda:0' if args.cuda else 'cpu:0')
    test_iter = Iterator(test_data,
                         args.batch_size,
                         repeat=False,
                         device='cuda:0' if args.cuda else 'cpu:0')
    print(train_iter, len(train_iter), val_iter, len(val_iter), test_iter,
          len(test_iter))

    ###############################################################################
    # Build the model
    ###############################################################################
    ntokens = len(field.vocab)
    model = RNNModel(args.model, ntokens, args.emsize, args.nhid, args.nlayers,
                     args.dropout, args.tied)
    if save_dict is not None:
        model.load_state_dict(save_dict['model'])
    if args.cuda:
        model.cuda()
    else:
        model.cpu()
    print(model)
    if save_dict:
        opt = save_dict['optimizer']
    else:
        opt = torch.optim.Adam(model.parameters(), lr=args.lr)
    if args.checkpoint:
        # Write an initial checkpoint so a crash before epoch 1 is resumable.
        torch.save(
            dict(field=field,
                 model=model.state_dict(),
                 optimizer=opt,
                 start_epoch=start_epoch), args.checkpoint)

    ###############################################################################
    # Training code
    ###############################################################################
    criterion = torch.nn.CrossEntropyLoss(ignore_index=pad_id)

    def make_target(text):
        # Targets are the input shifted by one step, with '<eos>' appended.
        batch_size = text.size()[1]
        eos_vector = torch.full((1, batch_size),
                                eos_id,
                                dtype=text.dtype,
                                device='cuda:0' if args.cuda else 'cpu:0')
        target = torch.cat((text[1:], eos_vector), dim=0)
        return target

    def compute_loss(output, text):
        # Flatten (seq, batch, vocab) logits and targets for cross-entropy.
        output_flat = output.view(-1, ntokens)
        target = make_target(text)
        target_flat = target.view(-1)
        return criterion(output_flat, target_flat)

    def evaluate(data_source):
        # Turn on evaluation mode which disables dropout.
        with torch.no_grad():
            model.eval()
            total_loss = 0
            for batch in data_source:
                output, hidden = model(batch.text)
                loss = compute_loss(output, batch.text)
                total_loss += loss.item()
            return total_loss / len(data_source)

    def train():
        # Turn on training mode which enables dropout.
        model.train()
        total_loss = 0
        start_time = time.time()
        for i, batch in enumerate(train_iter):
            model.zero_grad()
            output, hidden = model(batch.text)
            # (Removed a dead `target = make_target(batch.text)` here —
            # compute_loss builds the target itself.)
            loss = compute_loss(output, batch.text)
            loss.backward()

            # `clip_grad_norm` helps prevent the exploding gradient problem in RNNs / LSTMs.
            torch.nn.utils.clip_grad_norm_(model.parameters(), args.clip)
            opt.step()

            total_loss += loss.item()
            if i % args.log_interval == 0 and i > 0:
                cur_loss = total_loss / args.log_interval
                elapsed = time.time() - start_time
                print(
                    '| epoch {:3d} | {:5d}/{:5d} batches | ms/batch {:5.2f} | '
                    'loss {:5.2f} | ppl {:8.2f}'.format(
                        epoch, i, len(train_iter),
                        elapsed * 1000 / args.log_interval, cur_loss,
                        math.exp(cur_loss)))
                total_loss = 0
                start_time = time.time()

    # Loop over epochs.
    best_val_loss = None
    # At any point you can hit Ctrl + C to break out of training early.
    try:
        for epoch in range(start_epoch, args.epochs):
            epoch_start_time = time.time()
            train()
            val_loss = evaluate(val_iter)
            print('-' * 89)
            print(
                '| end of epoch {:3d} | time: {:5.2f}s | valid loss {:5.2f} | '
                'valid ppl {:8.2f}'.format(epoch,
                                           (time.time() - epoch_start_time),
                                           val_loss, math.exp(val_loss)))
            print('-' * 89)
            # Save the model if the validation loss is the best we've seen so far.
            if not best_val_loss or val_loss < best_val_loss:
                if args.checkpoint:
                    torch.save(
                        dict(field=field,
                             model=model.state_dict(),
                             optimizer=opt,
                             start_epoch=epoch), args.checkpoint)
                best_val_loss = val_loss
    except KeyboardInterrupt:
        print('-' * 89)
        print('Exiting from training early')

    # Final export: vocab + weights + architecture settings for inference.
    torch.save(
        dict(vocab=field.vocab.itos,
             model=model.state_dict(),
             settings=dict(rnn_type=args.model,
                           emsize=args.emsize,
                           nhid=args.nhid,
                           nlayers=args.nlayers)), args.save)

    # Load the best saved model.
    #with open(args.save, 'rb') as f:
    #    save_dict = torch.load(f)
    #    field = save_dict['field']
    #    if save_dict is not None:
    #        model.load_state_dict(save_dict['model'])
    #
    #    if args.cuda:
    #        model.cuda()
    #    else:
    #        model.cpu()

    # Run on test data.
    test_loss = evaluate(test_iter)
    print('=' * 89)
    print('| End of training | test loss {:5.2f} | test ppl {:8.2f}'.format(
        test_loss, math.exp(test_loss)))
    print('=' * 89)
# Word embedding layer; `ntokens`, `args`, `net`, `world_size`, `nsampled`
# and `train_corpus` are defined earlier in the file (outside this chunk).
encoder = nn.Embedding(ntokens, args.emsize, sparse=False)
util.initialize(encoder.weight)

# Optional weight tying between embedding and output projection.
twht = None
if args.tied:
    if args.nhid != args.emsize and not args.proj:
        raise ValueError(
            'When using the tied flag, hidden must be equal to embedding size')
    twht = encoder.weight

# Input dimension of the sampled-softmax decoder: embedding size when a
# projection layer is used, otherwise the hidden size.
D = args.emsize if args.proj else args.nhid
ss = SampledSoftmax(ntokens, nsampled, D, tied_weight=twht)

net.add_module("encoder", encoder)
net.add_module("decoder", ss)
net.cuda()

# Wrap in DistributedDataParallel when running multi-process, re-exposing
# init_hidden which DDP does not forward automatically.
tmp_net = net
if world_size >= 1:
    tmp_net = DDP(net)
    tmp_net.init_hidden = net.init_hidden
net = tmp_net

# Batch size and LR are scaled linearly with args.scale (linear scaling rule).
print("Batch Size:", args.batch_size * args.scale, "Initial LR:",
      args.lr * args.scale)

criterion = nn.CrossEntropyLoss()
optimizer = Adam(net.parameters(), args.lr * args.scale, betas=(0.9, 0.999))
# Linear LR decay over the full training run, down to min_lr.
scheduler = LinearLR(optimizer,
                     base_lr=args.lr * args.scale,
                     max_iters=train_corpus.batch_num * args.epochs,
                     last_iter=-1,
                     min_lr=1e-8)