def main():
    """Load a multi-pair dataset, build or restore the NMT model/optimizer, and train.

    Reads configuration from the module-level ``opt`` (argparse namespace) and
    mutates ``opt.start_epoch`` when resuming from a checkpoint.
    """
    print("Loading data from '%s'" % opt.data)
    dataset = torch.load(opt.data)
    print("Done")

    # When resuming, reuse the dictionaries stored in the checkpoint so that
    # vocabulary indices stay consistent with the saved weights.
    dict_checkpoint = (opt.train_from if opt.train_from
                       else opt.train_from_state_dict)
    if dict_checkpoint:
        print('Loading dicts from checkpoint at %s' % dict_checkpoint)
        checkpoint = torch.load(dict_checkpoint)
        dataset['dicts'] = checkpoint['dicts']

    dicts = dataset['dicts']
    nSets = dicts['nSets']
    print(' * Vocabulary sizes: ')
    for lang in dicts['langs']:
        print(' * ' + lang + ' = %d' % dicts['vocabs'][lang].size())

    # One train/valid Dataset per language pair (set).
    trainSets = dict()
    validSets = dict()
    # NOTE(review): `xrange` is Python-2 only — confirm this file targets py2.
    for i in xrange(nSets):
        trainSets[i] = onmt.Dataset(dataset['train']['src'][i], dataset['train']['tgt'][i],
                                    opt.batch_size, opt.gpus)
        validSets[i] = onmt.Dataset(dataset['valid']['src'][i], dataset['valid']['tgt'][i],
                                    opt.batch_size, opt.gpus)
        print(' * number of training sentences for set %d: %d' %
              (i, len(dataset['train']['src'][i])))

    print(' * maximum batch size. %d' % opt.batch_size)
    #~
    print('Building model...')

    encoder = onmt.Models.Encoder(opt, dicts['src'])
    decoder = onmt.Models.Decoder(opt, dicts['tgt'], nSets)
    generator = onmt.Models.Generator(opt, dicts['tgt'])
    model = onmt.Models.NMTModel(encoder, decoder)
    #~
    if opt.train_from:
        print('Loading model from checkpoint at %s' % opt.train_from)
        chk_model = checkpoint['model']
        generator_state_dict = chk_model.generator.state_dict()
        # Drop generator weights from the model state dict; the generator is
        # loaded separately just below.
        model_state_dict = {k: v for k, v in chk_model.state_dict().items()
                            if 'generator' not in k}
        model.load_state_dict(model_state_dict)
        generator.load_state_dict(generator_state_dict)
        opt.start_epoch = checkpoint['epoch'] + 1

    if opt.train_from_state_dict:
        print('Loading model from checkpoint at %s' % opt.train_from_state_dict)
        model.load_state_dict(checkpoint['model'])
        generator.load_state_dict(checkpoint['generator'])
        opt.start_epoch = int(math.floor(checkpoint['epoch'] + 1))

    if len(opt.gpus) >= 1:
        model.cuda()
        generator.cuda()
    else:
        model.cpu()
        generator.cpu()

    if len(opt.gpus) > 1:
        # Batches are sequence-major, hence dim=1 for the model split.
        model = nn.DataParallel(model, device_ids=opt.gpus, dim=1)
        generator = nn.DataParallel(generator, device_ids=opt.gpus, dim=0)

    model.generator = generator

    if opt.share_embedding:
        model.shareEmbedding(dicts)

    # Fresh start (or a checkpoint without optimizer state): initialize
    # parameters uniformly and create a new optimizer.
    if (not opt.train_from_state_dict and not opt.train_from) or checkpoint['optim'] is None:
        for p in model.parameters():
            p.data.uniform_(-opt.param_init, opt.param_init)
        #~
        #~ encoder.load_pretrained_vectors(opt)
        #~ decoder.load_pretrained_vectors(opt)
        optim = onmt.Optim(opt.optim, opt.learning_rate, opt.max_grad_norm,
                           lr_decay=opt.learning_rate_decay,
                           start_decay_at=opt.start_decay_at)
    else:
        print('Loading optimizer from checkpoint:')
        optim = checkpoint['optim']
        print(optim)

    optim.set_parameters(model.parameters())
    # The learning rate from the command line overrides any checkpointed one.
    optim.set_learning_rate(opt.learning_rate)

    #~ if opt.train_from or opt.train_from_state_dict:
    #~     optim.optimizer.load_state_dict(
    #~         checkpoint['optim'].optimizer.state_dict())

    if opt.train_from or opt.train_from_state_dict:
        del checkpoint  # to save memory

    nParams = sum([p.nelement() for p in model.parameters()])
    print('* number of parameters: %d' % nParams)

    trainModel(model, trainSets, validSets, dataset, optim)
def main():
    """Load a single-pair dataset, build or restore the seq2seq model, and train.

    Reads configuration from the module-level ``opt``; mutates
    ``opt.start_epoch`` when resuming.
    """
    print("Loading data from '%s'" % opt.data)
    dataset = torch.load(opt.data)

    # When resuming, reuse the checkpoint's dictionaries so vocabulary
    # indices match the saved weights.
    dict_checkpoint = (opt.train_from if opt.train_from
                       else opt.train_from_state_dict)
    if dict_checkpoint:
        print('Loading dicts from checkpoint at %s' % dict_checkpoint)
        checkpoint = torch.load(dict_checkpoint)
        dataset['dicts'] = checkpoint['dicts']

    trainData = onmt.Dataset(dataset['train']['src'], dataset['train']['tgt'],
                             opt.batch_size, opt.gpus)
    # Validation batch size is capped by the validation set size;
    # volatile=True disables autograd history (pre-0.4 PyTorch idiom).
    validData = onmt.Dataset(dataset['valid']['src'], dataset['valid']['tgt'],
                             min(opt.batch_size, len(dataset['valid']['src'])),
                             opt.gpus, volatile=True)

    dicts = dataset['dicts']
    print(' * vocabulary size. source = %d; target = %d' %
          (dicts['src'].size(), dicts['tgt'].size()))
    print(' * number of training sentences. %d' %
          len(dataset['train']['src']))
    print(' * maximum batch size. %d' % opt.batch_size)

    print('Building model...')

    encoder = onmt.Models.Encoder(opt, dicts['src'])
    decoder = onmt.Models.Decoder(opt, dicts['tgt'])
    generator = nn.Sequential(nn.Linear(opt.rnn_size, dicts['tgt'].size()),
                              nn.LogSoftmax())

    model = onmt.Models.NMTModel(encoder, decoder)

    if opt.train_from:
        print('Loading model from checkpoint at %s' % opt.train_from)
        # NOTE(review): `chk_model.items()` implies checkpoint['model'] is a
        # state dict here (not a module) — confirm against the saving code.
        chk_model = checkpoint['model']
        generator_state_dict = checkpoint['generator']
        #generator_state_dict = chk_model.generator.state_dict()
        # Generator weights are excluded here and loaded separately below.
        model_state_dict = {k: v for k, v in chk_model.items()
                            if 'generator' not in k}
        model.load_state_dict(model_state_dict)
        generator.load_state_dict(generator_state_dict)
        opt.start_epoch = checkpoint['epoch'] + 1

    if opt.train_from_state_dict:
        print('Loading model from checkpoint at %s' % opt.train_from_state_dict)
        model.load_state_dict(checkpoint['model'])
        generator.load_state_dict(checkpoint['generator'])
        opt.start_epoch = checkpoint['epoch'] + 1

    if len(opt.gpus) >= 1:
        model.cuda()
        generator.cuda()
    else:
        model.cpu()
        generator.cpu()

    if len(opt.gpus) > 1:
        # Sequence-major batches: split the model on dim=1, generator on dim=0.
        model = nn.DataParallel(model, device_ids=opt.gpus, dim=1)
        generator = nn.DataParallel(generator, device_ids=opt.gpus, dim=0)

    model.generator = generator

    # Fresh start: uniform init, optional pretrained embeddings, new optimizer.
    if not opt.train_from_state_dict and not opt.train_from:
        for p in model.parameters():
            p.data.uniform_(-opt.param_init, opt.param_init)
        encoder.load_pretrained_vectors(opt)
        decoder.load_pretrained_vectors(opt)
        optim = onmt.Optim(opt.optim, opt.learning_rate, opt.max_grad_norm,
                           lr_decay=opt.learning_rate_decay,
                           start_decay_at=opt.start_decay_at)
    else:
        print('Loading optimizer from checkpoint:')
        optim = checkpoint['optim']
        print(optim)

    optim.set_parameters(model.parameters())

    # Restore the inner optimizer state (moments etc.) when resuming.
    if opt.train_from or opt.train_from_state_dict:
        optim.optimizer.load_state_dict(
            checkpoint['optim'].optimizer.state_dict())

    nParams = sum([p.nelement() for p in model.parameters()])
    print('* number of parameters: %d' % nParams)

    trainModel(model, trainData, validData, dataset, optim)
def main():
    """Load the dataset (text or image source), build or restore the model and
    optimizer, and launch training.

    Reads configuration from the module-level ``opt``; mutates
    ``opt.start_epoch`` when resuming.

    Raises:
        ValueError: if ``opt.encoder_type`` is neither "text" nor "img".
    """
    print("Loading data from '%s'" % opt.data)
    dataset = torch.load(opt.data)

    # When resuming, reuse the checkpoint's dictionaries so vocabulary
    # indices match the saved weights.
    dict_checkpoint = (opt.train_from if opt.train_from
                       else opt.train_from_state_dict)
    if dict_checkpoint:
        print('Loading dicts from checkpoint at %s' % dict_checkpoint)
        checkpoint = torch.load(dict_checkpoint)
        dataset['dicts'] = checkpoint['dicts']

    trainData = onmt.Dataset(dataset['train']['src'], dataset['train']['tgt'],
                             opt.batch_size, opt.gpus,
                             data_type=dataset.get("type", "text"),
                             srcFeatures=dataset['train'].get('src_features'),
                             tgtFeatures=dataset['train'].get('tgt_features'),
                             alignment=dataset['train'].get('alignments'))
    validData = onmt.Dataset(dataset['valid']['src'], dataset['valid']['tgt'],
                             opt.batch_size, opt.gpus, volatile=True,
                             data_type=dataset.get("type", "text"),
                             srcFeatures=dataset['valid'].get('src_features'),
                             tgtFeatures=dataset['valid'].get('tgt_features'),
                             alignment=dataset['valid'].get('alignments'))

    dicts = dataset['dicts']
    print(' * vocabulary size. source = %d; target = %d' %
          (dicts['src'].size(), dicts['tgt'].size()))
    if 'src_features' in dicts:
        for j in range(len(dicts['src_features'])):
            print(' * src feature %d size = %d' %
                  (j, dicts['src_features'][j].size()))

    # (removed a redundant re-assignment `dicts = dataset['dicts']` that the
    # original repeated here — it was a no-op)
    print(' * number of training sentences. %d' %
          len(dataset['train']['src']))
    print(' * maximum batch size. %d' % opt.batch_size)

    print('Building model...')

    if opt.encoder_type == "text":
        encoder = onmt.Models.Encoder(opt, dicts['src'],
                                      dicts.get('src_features', None))
    elif opt.encoder_type == "img":
        encoder = onmt.modules.ImageEncoder(opt)
        assert ("type" not in dataset or dataset["type"] == "img")
    else:
        # BUG FIX: the original only printed a warning here and fell through,
        # which crashed later with a NameError on `encoder`. Fail fast instead.
        raise ValueError("Unsupported encoder type %s" % opt.encoder_type)

    decoder = onmt.Models.Decoder(opt, dicts['tgt'])

    if opt.copy_attn:
        generator = onmt.modules.CopyGenerator(opt, dicts['src'], dicts['tgt'])
    else:
        generator = nn.Sequential(nn.Linear(opt.rnn_size, dicts['tgt'].size()),
                                  nn.LogSoftmax())
        if opt.share_decoder_embeddings:
            # Tie the output projection to the decoder's input embeddings.
            generator[0].weight = decoder.word_lut.weight

    model = onmt.Models.NMTModel(encoder, decoder, len(opt.gpus) > 1)

    if opt.train_from:
        print('Loading model from checkpoint at %s' % opt.train_from)
        chk_model = checkpoint['model']
        generator_state_dict = chk_model.generator.state_dict()
        # Generator weights are excluded here and loaded separately below.
        model_state_dict = {k: v for k, v in chk_model.state_dict().items()
                            if 'generator' not in k}
        model.load_state_dict(model_state_dict)
        generator.load_state_dict(generator_state_dict)
        opt.start_epoch = checkpoint['epoch'] + 1

    if opt.train_from_state_dict:
        print('Loading model from checkpoint at %s' % opt.train_from_state_dict)
        model.load_state_dict(checkpoint['model'])
        generator.load_state_dict(checkpoint['generator'])
        opt.start_epoch = checkpoint['epoch'] + 1

    if len(opt.gpus) >= 1:
        model.cuda()
        generator.cuda()
    else:
        model.cpu()
        generator.cpu()

    if len(opt.gpus) > 1:
        print('Multi gpu training ', opt.gpus)
        # Sequence-major batches: split the model on dim=1, generator on dim=0.
        model = nn.DataParallel(model, device_ids=opt.gpus, dim=1)
        generator = nn.DataParallel(generator, device_ids=opt.gpus, dim=0)

    model.generator = generator

    # Fresh start: uniform init, optional pretrained embeddings, new optimizer.
    if not opt.train_from_state_dict and not opt.train_from:
        if opt.param_init != 0.0:
            print('Intializing params')
            for p in model.parameters():
                p.data.uniform_(-opt.param_init, opt.param_init)
        encoder.embeddings.load_pretrained_vectors(opt.pre_word_vecs_enc)
        decoder.embeddings.load_pretrained_vectors(opt.pre_word_vecs_dec)
        optim = onmt.Optim(opt.optim, opt.learning_rate, opt.max_grad_norm,
                           lr_decay=opt.learning_rate_decay,
                           start_decay_at=opt.start_decay_at,
                           opt=opt)
    else:
        print('Loading optimizer from checkpoint:')
        optim = checkpoint['optim']
        print(optim)

    optim.set_parameters(model.parameters())

    # Restore the inner optimizer state (moments etc.) when resuming.
    if opt.train_from or opt.train_from_state_dict:
        optim.optimizer.load_state_dict(
            checkpoint['optim'].optimizer.state_dict())

    nParams = sum([p.nelement() for p in model.parameters()])
    print('* number of parameters: %d' % nParams)

    trainModel(model, trainData, validData, dataset, optim)
def main():
    """Set up multi-pair training with shard-based data loading, optionally in
    adaptation or reinforcement-learning mode, and run the chosen trainer.

    Reads configuration from the module-level ``opt`` and rewrites
    ``opt.adapt_src`` / ``opt.adapt_tgt`` from language names to integer IDs.
    """
    dataset = dict()
    print("Loading dicts from '%s'" % opt.data + "/dicts_info.pt")
    dataset['dicts'] = torch.load(opt.data + "/dicts_info.pt")

    pairIDs = list()
    if len(opt.adapt_src) > 0 and len(opt.adapt_tgt) > 0:
        # find the source and target ID of the pair we need to adapt
        srcID = dataset['dicts']['srcLangs'].index(opt.adapt_src)
        tgtID = dataset['dicts']['tgtLangs'].index(opt.adapt_tgt)
        setIDs = dataset['dicts']['setIDs']
        # find the pair ID that we need to adapt
        for i, sid in enumerate(setIDs):
            if sid[0] == srcID and sid[1] == tgtID:
                pairIDs.append(i)
        if len(pairIDs) == 0:
            pairIDs = None
    else:
        srcID = None
        tgtID = None
        pairIDs = None

    # convert string to IDs for easier manipulation
    opt.adapt_src = srcID
    opt.adapt_tgt = tgtID
    opt.pairIDs = pairIDs

    dict_checkpoint = opt.train_from_state_dict
    if dict_checkpoint:
        print('Loading dicts from checkpoint at %s' % dict_checkpoint)
        # map_location keeps checkpoint tensors on CPU regardless of where
        # they were saved.
        checkpoint = torch.load(dict_checkpoint,
                                map_location=lambda storage, loc: storage)
        #~ dataset['dicts'] = checkpoint['dicts']
    else:
        checkpoint = None

    dicts = dataset['dicts']

    dataset['valid'] = torch.load(opt.data + "/valid.pt")
    valid_set = dataset['valid']
    #~
    print("Loading training data from '%s'" % opt.data + "/train.pt.*")
    # Training shards are loaded lazily by the MultiShardLoader below, so the
    # in-memory 'train' entry stays empty here.
    dataset['train'] = dict()  #~ torch.load(opt.data + "/train.pt.0")
    print("Done")

    nSets = dicts['nSets']
    setIDs = dicts['setIDs']
    print(' * Vocabulary sizes: ')
    for lang in dicts['langs']:
        print(' * ' + lang + ' = %d' % dicts['vocabs'][lang].size())

    # A wrapper to manage data loading
    trainLoader = onmt.MultiShardLoader(opt, dicts)

    trainSets = dict()
    validSets = dict()
    # NOTE(review): `xrange` is Python-2 only — confirm this file targets py2.
    for i in xrange(nSets):
        #~ trainSets[i] = onmt.Dataset(dataset['train']['src'][i], dataset['train']['tgt'][i],
        #~                             opt.batch_size, opt.gpus)
        validSets[i] = onmt.Dataset(valid_set['src'][i], valid_set['tgt'][i],
                                    opt.batch_size, opt.gpus)
        #~ print(' * number of training sentences for set %d: %d' %
        #~       (i, len(dataset['train']['src'][i])))

    print('[INFO] * maximum batch size. %d' % opt.batch_size)

    print('[INFO] Building model...')
    model, generator = build_model(opt, dicts, nSets)

    if opt.train_from_state_dict:
        print('[INFO] Loading model from checkpoint at %s'
              % opt.train_from_state_dict)
        # Critic weights are split out of the model state dict and stashed
        # back into the checkpoint for the A2C trainer to pick up later.
        model_state_dict = {k: v for k, v in checkpoint['model'].items()
                            if 'critic' not in k}
        checkpoint['critic'] = {k: v for k, v in checkpoint['model'].items()
                                if 'critic' in k}
        model.load_state_dict(model_state_dict)
        generator.load_state_dict(checkpoint['generator'])

    if len(opt.gpus) >= 1:
        model.cuda()
        generator.cuda()
    else:
        model.cpu()
        generator.cpu()

    model.generator = generator

    # Fresh start: uniform parameter init.
    if not opt.train_from_state_dict:
        for p in model.parameters():
            p.data.uniform_(-opt.param_init, opt.param_init)

    optim = onmt.Optim(
        opt.optim, opt.learning_rate, opt.max_grad_norm,
        lr_decay=opt.learning_rate_decay,
        start_decay_at=opt.start_decay_at
    )

    nParams = sum([p.nelement() for p in model.parameters()])
    print('[INFO] * number of parameters: %d' % nParams)

    evaluator = Evaluator(model, dataset, opt, cuda=(len(opt.gpus) >= 1))

    # Pick a trainer: self-critical RL, actor-critic RL, or plain
    # cross-entropy training.
    if opt.reinforce:
        if opt.critic == 'self':
            trainer = SCSTTrainer(model, trainLoader, validSets, dataset,
                                  optim, evaluator, opt)
        else:
            from onmt.ModelConstructor import build_critic
            from onmt.trainer.ActorCriticTrainer import A2CTrainer
            critic = build_critic(opt, dicts)
            model.critic = critic
            trainer = A2CTrainer(model, trainLoader, validSets, dataset,
                                 optim, evaluator, opt)
            #~ raise NotImplementedError
    else:
        trainer = XETrainer(model, trainLoader, validSets, dataset,
                            optim, evaluator, opt)

    trainer.run(checkpoint=checkpoint)
def main():
    """Load torchtext-style train/valid/vocab artifacts, build the model via
    ModelConstructor, set up the optimizer, and train.

    Reads configuration from the module-level ``opt``; when resuming, model
    hyper-parameters come from the checkpoint's saved ``opt`` instead.
    """
    print("Loading data from '%s'" % opt.data)
    train = torch.load(opt.data + '.train.pt')
    valid = torch.load(opt.data + '.valid.pt')

    fields = onmt.IO.ONMTDataset.load_fields(
        torch.load(opt.data + '.vocab.pt'))
    # Keep only the fields actually present in the examples.
    fields = dict([(k, f) for (k, f) in fields.items()
                   if k in train.examples[0].__dict__])
    train.fields = fields
    valid.fields = fields

    # TODO: account for target features. Also, why does fields need to
    # have the structure it does?
    src_features = [fields["src_feat_"+str(j)]
                    for j in range(train.nfeatures)]

    model_opt = opt
    checkpoint = None
    if opt.train_from:
        print('Loading dicts from checkpoint at %s' % opt.train_from)
        checkpoint = torch.load(opt.train_from,
                                map_location=lambda storage, loc: storage)
        fields = onmt.IO.ONMTDataset.load_fields(checkpoint['vocab'])
        model_opt = checkpoint["opt"]
        # NOTE(review): `src_features` (and train.fields/valid.fields) still
        # reference the pre-checkpoint fields at this point — confirm this is
        # intentional.

    print(' * vocabulary size. source = %d; target = %d' %
          (len(fields['src'].vocab), len(fields['tgt'].vocab)))
    for j, feat in enumerate(src_features):
        print(' * src feature %d size = %d' % (j, len(feat.vocab)))

    print(' * number of training sentences. %d' % len(train))
    print(' * maximum batch size. %d' % opt.batch_size)

    print('Building model...')
    model = onmt.ModelConstructor.make_base_model(model_opt, fields,
                                                  use_gpu(opt), checkpoint)
    if len(opt.gpuid) > 1:
        print('Multi gpu training ', opt.gpuid)
        model = nn.DataParallel(model, device_ids=opt.gpuid, dim=1)
    print(model)

    # Load model from checkpoint or initialize, create optim
    if opt.train_from:
        print('Loading model from checkpoint at %s' % opt.train_from)
        # I don't like reassigning attributes of opt: it's not clear
        opt.start_epoch = checkpoint['epoch'] + 1
        print('Loading optimizer from checkpoint:')
        optim = checkpoint['optim']
        # Restore the inner optimizer state (moments etc.).
        optim.optimizer.load_state_dict(
            checkpoint['optim'].optimizer.state_dict())
    else:
        if opt.param_init != 0.0:
            print('Intializing params')
            for p in model.parameters():
                p.data.uniform_(-opt.param_init, opt.param_init)
        model.encoder.embeddings.load_pretrained_vectors(
            opt.pre_word_vecs_enc, opt.fix_word_vecs_enc)
        model.decoder.embeddings.load_pretrained_vectors(
            opt.pre_word_vecs_dec, opt.fix_word_vecs_dec)
        # what members of opt does Optim need?
        optim = onmt.Optim(
            opt.optim, opt.learning_rate, opt.max_grad_norm,
            lr_decay=opt.learning_rate_decay,
            start_decay_at=opt.start_decay_at,
            opt=opt
        )

    optim.set_parameters(model.parameters())

    tally_parameters(model)
    check_model_path()

    train_model(model, train, valid, fields, optim)
def __init__(self, device, train_data, valid_data, dicts, opt, setup_optimizer=True):
    """Initialize a (possibly distributed) GPU trainer: process group, data,
    model, loss function, grad scaler, optimizer and DDP wrapping.

    :param device: int (GPU id); doubles as the distributed rank on a
        single node.
    :param train_data: training data handle (deep-copied on non-zero ranks).
    :param valid_data: validation data handle.
    :param dicts: vocabulary dictionaries; dicts['tgt'] sizes the loss.
    :param opt: argparse namespace with all training options (mutated:
        node_rank/nodes are forced to single-node values).
    :param setup_optimizer: when False, skip optimizer construction (e.g.
        for evaluation-only use).
    """
    self.device = device
    # Force single-node settings; multi-node is not supported here.
    opt.node_rank = 0
    opt.nodes = 1
    self.world_size = len(opt.gpus)

    # in the case of single node distributed, it should equal self.device
    self.rank = self.device

    # make a group to later use with self.all_reduce
    self.group = dist.group.WORLD

    self.print("[INFO] Training Options:", opt)
    if self.world_size > 1:
        dist.init_process_group(backend='nccl', init_method='env://',
                                world_size=self.world_size, rank=self.rank)

    self.model = None

    if self.rank == 0:
        self.train_data = train_data
        self.valid_data = valid_data
    else:
        # Do we really need to deepcopy the data instances
        # (which could cause memory leak easily)
        self.train_data = copy.deepcopy(train_data)
        self.valid_data = copy.deepcopy(valid_data)

    self.dicts = dicts
    self.opt = opt
    self.cuda = (len(opt.gpus) >= 1 and opt.gpus[0] >= 0)
    assert self.cuda, "[ERROR] Training is only available on GPUs."

    self.start_time = 0

    # setting up models and others
    if opt.lfv_multilingual:
        from onmt.models.speech_recognizer.lid_loss import CrossEntropyLIDLoss
        lid_loss = CrossEntropyLIDLoss(opt.n_languages,
                                       opt.label_smoothing,
                                       opt.fast_xentropy)
        # NOTE(review): self.loss_function is not assigned until further
        # below — this call looks like it would raise AttributeError when
        # lfv_multilingual is set; confirm ordering.
        self.loss_function.add_loss_function(lid_loss, 'lid_loss')

    torch.manual_seed(self.opt.seed)

    # note: we must start creating models after ccreating the processes
    # for some reason passing a pre-created model to a process creates
    # a "pickle" error
    if not opt.fusion:

        if self.is_main():
            print("[INFO] Building models .... ", flush=True)
        model = build_model(opt, dicts)

        """ Building the loss function """
        if opt.ctc_loss > 0.0:
            from onmt.speech.ctc_loss import CTC
            self.ctc_loss_function = CTC(0.0, reduce=True)

        if opt.nce:
            from onmt.modules.nce.nce_loss import NCELoss
            loss_function = NCELoss(opt.model_size, dicts['tgt'].size(),
                                    noise_ratio=opt.nce_noise,
                                    logz=9,
                                    label_smoothing=opt.label_smoothing)
        else:
            loss_function = NMTLossFunc(opt.model_size, dicts['tgt'].size(),
                                        label_smoothing=opt.label_smoothing,
                                        mirror=opt.mirror_loss,
                                        fast_xentropy=opt.fast_xentropy)

        # This function replaces modules with the more optimized counterparts
        # so that it can run faster
        # Currently exp with LayerNorm
        if not opt.memory_profiling:
            # distributed is required to convert BatchNorm to SyncBatchNorm
            # for DDP
            optimize_model(model, distributed=(self.world_size > 1))

        init_model_parameters(model, opt)
        self.model = model
        self.loss_function = loss_function
        self.grad_scaler = torch.cuda.amp.GradScaler()

    if opt.load_from:
        checkpoint = torch.load(opt.load_from,
                                map_location=lambda storage, loc: storage)
        self.model.load_state_dict(checkpoint['model'])
        if 'scaler' in checkpoint and checkpoint['scaler'] is not None:
            self.grad_scaler.load_state_dict(checkpoint['scaler'])

    if self.cuda:
        torch.cuda.set_device(self.device)
        self.loss_function = self.loss_function.cuda(device=self.device)
        self.model = self.model.cuda(device=self.device)
        if opt.ctc_loss > 0.0:
            self.ctc_loss_function = self.ctc_loss_function.cuda(device=self.device)

    # Ensure that the distributed copies have the same initial parameters
    # Manual seed may not work the same for different GPU models.
    # if self.world_size > 1:
    #     params = [p for p in self.model.parameters()]
    #
    #     with torch.no_grad():
    #         if not self.is_main():
    #             # zero everything except for the main model
    #             for p in params:
    #                 p.zero_()
    #         else:
    #             for p in params:
    #                 p.add_(0)
    #
    # # run all_reduce to ensure that all models have exactly the same parameters
    # if self.world_size > 1:
    #     params = [p for p in self.model.parameters()]
    #     all_reduce_and_rescale_tensors(params, 1)

    if setup_optimizer:

        self.optim = onmt.Optim(opt)
        self.optim.set_parameters(self.model.parameters())

        if self.is_main():
            print("[INFO] Optimizer: ", self.optim.optimizer)

        if opt.load_from:
            if 'optim' in checkpoint and checkpoint['optim'] is not None \
                    and not opt.reset_optim:
                self.optim.load_state_dict(checkpoint['optim'])

    if self.world_size > 1:
        # find_unused_parameters may be required for dropped layer
        # (parameters that are not connected to any particular graph)
        find_unused_parameters = True

        self.model = torch.nn.parallel.DistributedDataParallel(
            self.model,
            device_ids=[self.rank],
            output_device=self.rank,
            find_unused_parameters=find_unused_parameters)

    print("[INFO] Process %d ready." % self.rank, flush=True)
def build_optim(model, text_model, speech_model, checkpoint):
    """Create or restore the main, speech and adversarial optimizers.

    Hyper-parameters are read from the module-level ``opt``.

    :param model: main model; its parameters go to the main optimizer.
    :param text_model: model whose encoder is trained by both the main and
        the adversarial optimizer.
    :param speech_model: optional speech model; when falsy, no speech
        optimizer is built and ``None`` is returned in its place
        (BUG FIX: the original hit a NameError on ``speech_optim`` at the
        return in that case).
    :param checkpoint: checkpoint dict, possibly ``None`` or missing keys.
    :return: tuple ``(optim, adv_optim, speech_optim)``.
    """

    def _new_optim(optim_name, lr):
        # All three optimizers share the same schedule/clipping settings.
        return onmt.Optim(
            optim_name, lr, opt.max_grad_norm,
            lr_decay=opt.learning_rate_decay,
            start_decay_at=opt.start_decay_at,
            beta1=opt.adam_beta1,
            beta2=opt.adam_beta2,
            adagrad_accum=opt.adagrad_accumulator_init,
            decay_method=opt.decay_method,
            warmup_steps=opt.warmup_steps,
            model_size=opt.rnn_size)

    # --- main optimizer --------------------------------------------------
    if opt.train_from:
        print('Loading optimizer from checkpoint.')
        optim = checkpoint['optim']
        # Restore the inner optimizer state (moments etc.).
        optim.optimizer.load_state_dict(
            checkpoint['optim'].optimizer.state_dict())
    else:
        # what members of opt does Optim need?
        optim = _new_optim(opt.optim, opt.learning_rate)

    optim.set_parameters(model.parameters())
    optim.set_parameters(text_model.encoder.parameters())

    # --- speech optimizer ------------------------------------------------
    speech_optim = None  # BUG FIX: was undefined when speech_model is falsy
    if speech_model:
        optim.set_parameters(speech_model.decoder.parameters())
        optim.set_parameters(speech_model.globalEncoder.parameters())
        try:
            # BUG FIX: the original assigned the checkpoint's speech optimizer
            # to `optim`, clobbering the main optimizer, then crashed on the
            # undefined name `speech_optim` — silently swallowed by a bare
            # `except:` that rebuilt the speech optimizer from scratch.
            speech_optim = checkpoint['speech_optim']
            speech_optim.optimizer.load_state_dict(
                checkpoint['speech_optim'].optimizer.state_dict())
            print('Loading speech optimizer from checkpoint.')
        except (TypeError, KeyError, AttributeError):
            # No checkpoint, or it has no usable speech optimizer state.
            speech_optim = _new_optim(opt.speech_optim,
                                      opt.speech_learning_rate)
        speech_optim.set_parameters(speech_model.parameters())

    # --- adversarial optimizer -------------------------------------------
    try:
        adv_optim = checkpoint['adv_optim']
        adv_optim.optimizer.load_state_dict(
            checkpoint['adv_optim'].optimizer.state_dict())
    except (TypeError, KeyError, AttributeError):
        # No checkpoint, or it has no usable adversarial optimizer state.
        adv_optim = _new_optim(opt.adv_optim, opt.adv_learning_rate)
        if not opt.feature_match:
            adv_optim.set_parameters(model.encoder.parameters())
            adv_optim.set_parameters(text_model.encoder.parameters())
        elif opt.gen_label == 0.1:
            # move text to match speech
            print('gen_label = 0.1: adv training only modifies text encodings')
            adv_optim.set_parameters(text_model.encoder.parameters())
        else:
            # move speech to match text
            adv_optim.set_parameters(text_model.encoder.parameters())  # get rid of this later
            adv_optim.set_parameters(model.encoder.parameters())

    return optim, adv_optim, speech_optim
def main():
    """Load data, construct the ConvNet (fresh or from a checkpoint), build the
    optimizer, and launch training.

    Configuration comes from the module-level ``opt``; ``opt.start_epoch`` is
    updated when resuming from a checkpoint.
    """
    print("Loading data from '%s'" % opt.data)
    bundle = torch.load(opt.data)

    # Resuming from either checkpoint flavour replaces the dictionaries so
    # vocabulary indices stay aligned with the saved weights.
    resume_path = opt.train_from or opt.train_from_state_dict
    if resume_path:
        print('Loading dicts from checkpoint at %s' % resume_path)
        saved_state = torch.load(resume_path)
        bundle['dicts'] = saved_state['dicts']

    trainData, validData = init_dataloaders(bundle, opt)

    # Newer preprocessed bundles (with a "settings" entry) store the source
    # dict as a plain mapping; older ones expose a Dict object.
    src_dict = bundle['dicts']['src']
    vocabulary_size = (src_dict['kwargs']['vocab_size']
                       if "settings" in bundle
                       else src_dict.size())

    print(' * vocabulary size. source = %d;' % vocabulary_size)
    print(' * number of training sentences. %d' % len(bundle['train']['src']))
    print(' * maximum batch size. %d' % opt.batch_size)

    print('Building model...')
    model = onmt.CNNModels.ConvNet(opt, vocabulary_size)

    if opt.train_from:
        print('Loading model from checkpoint at %s' % opt.train_from)
        saved_model = saved_state['model']
        # Keep every weight except the generator's.
        filtered_weights = {k: v
                            for k, v in saved_model.state_dict().items()
                            if 'generator' not in k}
        model.load_state_dict(filtered_weights)
        opt.start_epoch = saved_state['epoch'] + 1

    if opt.train_from_state_dict:
        print('Loading model from checkpoint at %s' % opt.train_from_state_dict)
        model.load_state_dict(saved_state['model'])
        opt.start_epoch = saved_state['epoch'] + 1

    if len(opt.gpus) >= 1:
        model.cuda()
    else:
        model.cpu()

    if len(opt.gpus) > 1:
        model = nn.DataParallel(model, device_ids=opt.gpus, dim=1)

    starting_fresh = not (opt.train_from or opt.train_from_state_dict)
    if starting_fresh:
        # Uniform parameter init, optional pretrained embeddings, new optimizer.
        for p in model.parameters():
            p.data.uniform_(-opt.param_init, opt.param_init)
        model.load_pretrained_vectors(opt)
        optim = onmt.Optim(
            opt.optim, opt.learning_rate, opt.max_grad_norm,
            lr_decay=opt.learning_rate_decay,
            start_decay_at=opt.start_decay_at
        )
    else:
        print('Loading optimizer from checkpoint:')
        optim = saved_state['optim']
        print(optim)

    optim.set_parameters(model.parameters())

    if not starting_fresh:
        # Restore the inner optimizer state (moments etc.).
        optim.optimizer.load_state_dict(
            saved_state['optim'].optimizer.state_dict())

    nParams = sum(p.nelement() for p in model.parameters())
    print('* number of parameters: %d' % nParams)

    trainModel(model, trainData, validData, bundle, optim, opt)
def main():
    """Load a multi-pair dataset, build or restore model/optimizer, resolve the
    optional adaptation pair, and run the selected trainer (RL or XE).

    Reads configuration from the module-level ``opt``; rewrites
    ``opt.adapt_src`` / ``opt.adapt_tgt`` from language names to integer IDs
    and sets ``opt.pairID`` and ``opt.start_epoch``.
    """
    print("Loading data from '%s'" % opt.data)
    dataset = torch.load(opt.data)
    print("Done")

    # When resuming, reuse the checkpoint's dictionaries so vocabulary
    # indices match the saved weights.
    dict_checkpoint = (opt.train_from if opt.train_from
                       else opt.train_from_state_dict)
    if dict_checkpoint:
        print('Loading dicts from checkpoint at %s' % dict_checkpoint)
        checkpoint = torch.load(dict_checkpoint)
        dataset['dicts'] = checkpoint['dicts']

    dicts = dataset['dicts']
    nSets = dicts['nSets']
    print(' * Vocabulary sizes: ')
    for lang in dicts['langs']:
        print(' * ' + lang + ' = %d' % dicts['vocabs'][lang].size())

    # One train/valid Dataset per language pair (set).
    trainSets = dict()
    validSets = dict()
    # NOTE(review): `xrange` is Python-2 only — confirm this file targets py2.
    for i in xrange(nSets):
        trainSets[i] = onmt.Dataset(dataset['train']['src'][i], dataset['train']['tgt'][i],
                                    opt.batch_size, opt.gpus)
        validSets[i] = onmt.Dataset(dataset['valid']['src'][i], dataset['valid']['tgt'][i],
                                    opt.batch_size, opt.gpus)
        print(' * number of training sentences for set %d: %d' %
              (i, len(dataset['train']['src'][i])))

    print(' * maximum batch size. %d' % opt.batch_size)

    print('Building model...')

    encoder = onmt.Models.Encoder(opt, dicts['src'])
    decoder = onmt.Models.Decoder(opt, dicts['tgt'], nSets)
    generator = onmt.Models.Generator(opt, dicts['tgt'])
    model = onmt.Models.NMTModel(encoder, decoder)

    if opt.train_from:
        print('Loading model from checkpoint at %s' % opt.train_from)
        chk_model = checkpoint['model']
        generator_state_dict = chk_model.generator.state_dict()
        # Generator weights are excluded here and loaded separately below.
        model_state_dict = {k: v for k, v in chk_model.state_dict().items()
                            if 'generator' not in k}
        model.load_state_dict(model_state_dict)
        generator.load_state_dict(generator_state_dict)
        opt.start_epoch = checkpoint['epoch'] + 1

    if opt.train_from_state_dict:
        print('Loading model from checkpoint at %s' % opt.train_from_state_dict)
        model.load_state_dict(checkpoint['model'])
        generator.load_state_dict(checkpoint['generator'])
        opt.start_epoch = int(math.floor(checkpoint['epoch'] + 1))

    if len(opt.gpus) >= 1:
        model.cuda()
        generator.cuda()
    else:
        model.cpu()
        generator.cpu()

    if len(opt.gpus) > 1:
        # Sequence-major batches: split the model on dim=1, generator on dim=0.
        model = nn.DataParallel(model, device_ids=opt.gpus, dim=1)
        generator = nn.DataParallel(generator, device_ids=opt.gpus, dim=0)

    model.generator = generator

    if opt.share_embedding:
        model.shareEmbedding(dicts)

    # Three optimizer cases: fresh start; resume with optimizer state;
    # resume but rebuild the optimizer (reset_optim or no saved state).
    if not opt.train_from_state_dict and not opt.train_from:
        for p in model.parameters():
            p.data.uniform_(-opt.param_init, opt.param_init)
        optim = onmt.Optim(opt.optim, opt.learning_rate, opt.max_grad_norm,
                           lr_decay=opt.learning_rate_decay,
                           start_decay_at=opt.start_decay_at)
    elif not opt.reset_optim and 'optim' in checkpoint:
        print('Loading optimizer from checkpoint:')
        optim = checkpoint['optim']
    else:
        optim = onmt.Optim(opt.optim, opt.learning_rate, opt.max_grad_norm,
                           lr_decay=opt.learning_rate_decay,
                           start_decay_at=opt.start_decay_at)

    optim.set_parameters(model.parameters())
    # The learning rate from the command line overrides any checkpointed one.
    optim.set_learning_rate(opt.learning_rate)

    #~ if opt.train_from or opt.train_from_state_dict:
    #~     optim.optimizer.load_state_dict(
    #~         checkpoint['optim'].optimizer.state_dict())

    if opt.train_from or opt.train_from_state_dict:
        del checkpoint  # to save memory

    nParams = sum([p.nelement() for p in model.parameters()])
    print('* number of parameters: %d' % nParams)

    if len(opt.adapt_src) > 0 and len(opt.adapt_tgt) > 0:
        # find the source and target ID of the pair we need to adapt
        srcID = dataset['dicts']['srcLangs'].index(opt.adapt_src)
        tgtID = dataset['dicts']['tgtLangs'].index(opt.adapt_tgt)
        setIDs = dataset['dicts']['setIDs']
        # find the pair ID that we need to adapt
        pairID = -1
        for i, sid in enumerate(setIDs):
            if sid[0] == srcID and sid[1] == tgtID:
                pairID = i
                break
        if pairID == -1:
            pairID = None
    else:
        srcID = None
        tgtID = None
        pairID = None

    # convert string to IDs for easier manipulation
    opt.adapt_src = srcID
    opt.adapt_tgt = tgtID
    opt.pairID = pairID

    evaluator = Evaluator(model, dataset, opt, cuda=(len(opt.gpus) >= 1))

    # Self-critical RL training or plain cross-entropy training.
    if opt.reinforce:
        trainer = SCSTTrainer(model, trainSets, validSets, dataset,
                              optim, evaluator, opt)
    else:
        trainer = XETrainer(model, trainSets, validSets, dataset,
                            optim, evaluator, opt)

    trainer.run()
def main():
    """Build (or reload whole-object from checkpoint) the latent-variable NMT
    model and train it.

    Unlike the other variants, resuming here restores the entire pickled model
    and optimizer objects rather than state dicts.
    """
    print("Loading data from '%s'" % opt.data)
    dataset = torch.load(opt.data)

    trainData = onmt.Dataset(dataset['train']['src'], dataset['train']['tgt'],
                             opt.batch_size, opt.cuda)
    validData = onmt.Dataset(dataset['valid']['src'], dataset['valid']['tgt'],
                             opt.batch_size, opt.cuda)

    dicts = dataset['dicts']
    print(' * vocabulary size. source = %d; target = %d' %
          (dicts['src'].size(), dicts['tgt'].size()))
    print(' * number of training sentences. %d' %
          len(dataset['train']['src']))
    print(' * maximum batch size. %d' % opt.batch_size)

    print('Building model...')

    if opt.train_from is None:
        encoder = onmt.Models.Encoder(opt, dicts['src'])
        decoder = onmt.Models.Decoder(opt, dicts['tgt'])
        decoderlatent = onmt.Models.DecoderLatent(opt)
        encoderlatent = onmt.Models.EncoderLatent(opt)
        lengthnet = onmt.Models.LengthNet(opt)
        generator = nn.Sequential(nn.Linear(opt.rnn_size, dicts['tgt'].size()),
                                  nn.LogSoftmax())
        # NOTE(review): `opt.cuda > 1` is used as the multi-GPU test here
        # (elsewhere the code uses len(opt.gpus) > 1) — confirm opt.cuda is
        # a GPU count and not a boolean flag.
        if opt.cuda > 1:
            generator = nn.DataParallel(generator, device_ids=opt.gpus)
        model = onmt.Models.NMTModel(encoder, lengthnet, decoderlatent,
                                     encoderlatent, decoder, generator, opt)
        if opt.cuda > 1:
            model = nn.DataParallel(model, device_ids=opt.gpus)
        if opt.cuda:
            model.cuda()
        else:
            model.cpu()

        #model.generator = generator

        # Uniform parameter init for a fresh model.
        for p in model.parameters():
            p.data.uniform_(-opt.param_init, opt.param_init)

        # NOTE(review): this Optim signature takes parameters first, unlike
        # the other variants in this file — confirm against onmt.Optim.
        optim = onmt.Optim(model.parameters(), opt.optim, opt.learning_rate,
                           opt.max_grad_norm,
                           lr_decay=opt.learning_rate_decay,
                           start_decay_at=opt.start_decay_at)
    else:
        # Resume: the checkpoint stores the full model/optimizer objects.
        print('Loading from checkpoint at %s' % opt.train_from)
        checkpoint = torch.load(opt.train_from)
        model = checkpoint['model']
        if opt.cuda:
            model.cuda()
        else:
            model.cpu()
        optim = checkpoint['optim']
        opt.start_epoch = checkpoint['epoch'] + 1

    nParams = sum([p.nelement() for p in model.parameters()])
    print('* number of parameters: %d' % nParams)

    trainModel(model, trainData, validData, dataset, optim)
def __init__(self, device, train_data, valid_data, dicts, opt, setup_optimizer=True):
    """Set up a (possibly distributed) trainer on one GPU.

    Builds the model and loss function, moves them to ``device``, optionally
    creates the optimizer, initializes NVIDIA apex AMP, restores a checkpoint
    when ``opt.load_from`` is set, and wraps the model in apex DDP for
    multi-GPU runs.

    :param device: int (GPU id); also used as the distributed rank
    :param train_data: training data instance (deep-copied on non-zero ranks)
    :param valid_data: validation data instance
    :param dicts: vocabulary dictionaries (must contain 'tgt')
    :param opt: parsed option namespace
    :param setup_optimizer: when False, skip optimizer creation entirely
    """
    self.device = device
    # Single-node setup is hard-coded.
    opt.node_rank = 0
    opt.nodes = 1
    self.world_size = len(opt.gpus)
    # in the case of single node distributed, it should equal self.device
    self.rank = self.device
    # make a group to later use with self.all_reduce
    self.group = dist.group.WORLD
    self.print("[INFO] Training Options:", opt)
    if self.world_size > 1:
        dist.init_process_group(backend='nccl', init_method='env://',
                                world_size=self.world_size, rank=self.rank)
    self.model = None
    if self.rank == 0:
        self.train_data = train_data
        self.valid_data = valid_data
    else:
        # Do we really need to deepcopy the data instances (which could cause memory leak easily)
        self.train_data = copy.deepcopy(train_data)
        self.valid_data = copy.deepcopy(valid_data)
    self.dicts = dicts
    self.opt = opt
    self.cuda = (len(opt.gpus) >= 1 and opt.gpus[0] >= 0)
    assert self.cuda, "[ERROR] Training is only available on GPUs."
    self.start_time = 0
    # setting up models and others
    if opt.lfv_multilingual:
        from onmt.models.speech_recognizer.lid_loss import CrossEntropyLIDLoss
        lid_loss = CrossEntropyLIDLoss(opt.n_languages, opt.label_smoothing, opt.fast_xentropy)
        # NOTE(review): self.loss_function is only assigned further below
        # (inside `if not opt.fusion`), so this line raises AttributeError
        # whenever lfv_multilingual is set — confirm and reorder.
        self.loss_function.add_loss_function(lid_loss, 'lid_loss')
    torch.manual_seed(self.opt.seed)
    # note: we must start creating models after creating the processes
    # for some reason passing a pre-created model to a process creates a "pickle" error
    if not opt.fusion:
        if self.is_main():
            print("[INFO] Building models .... ", flush=True)
        model = build_model(opt, dicts)
        """ Building the loss function """
        if opt.ctc_loss > 0.0:
            from onmt.speech.ctc_loss import CTC
            self.ctc_loss_function = CTC(0.0, reduce=True)
        if opt.nce:
            from onmt.modules.nce.nce_loss import NCELoss
            loss_function = NCELoss(opt.model_size, dicts['tgt'].size(),
                                    noise_ratio=opt.nce_noise,
                                    logz=9,
                                    label_smoothing=opt.label_smoothing)
        else:
            loss_function = NMTLossFunc(
                opt.model_size, dicts['tgt'].size(),
                label_smoothing=opt.label_smoothing,
                mirror=opt.mirror_loss,
                fast_xentropy=opt.fast_xentropy)
        # This function replaces modules with the more optimized counterparts so that it can run faster
        # Currently exp with LayerNorm
        if not opt.memory_profiling:
            # distributed is required to convert BatchNorm to SyncBatchNorm for DDP
            optimize_model(model, distributed=(self.world_size > 1))
        init_model_parameters(model, opt)
        self.model = model
        self.loss_function = loss_function
    # self.grad_scaler = torch.cuda.amp.GradScaler()
    if self.cuda:
        torch.cuda.set_device(self.device)
        self.loss_function = self.loss_function.cuda(device=self.device)
        self.model = self.model.cuda(device=self.device)
        if opt.ctc_loss > 0.0:
            self.ctc_loss_function = self.ctc_loss_function.cuda(
                device=self.device)
    if opt.load_from:
        # map_location keeps checkpoint tensors on CPU until explicitly moved.
        checkpoint = torch.load(opt.load_from,
                                map_location=lambda storage, loc: storage)
    if setup_optimizer:
        self.optim = onmt.Optim(opt)
        self.optim.set_parameters(self.model.parameters())
        if self.is_main():
            print("[INFO] Optimizer: ", self.optim.optimizer)
        if opt.load_from:
            if 'optim' in checkpoint and checkpoint[
                    'optim'] is not None and not opt.reset_optim:
                self.optim.load_state_dict(checkpoint['optim'])
    # Apex AMP opt level selection: O0 = fp32, O1 = mixed, O2 = "almost fp16".
    if not self.opt.fp16:
        opt_level = "O0"
        keep_batchnorm_fp32 = False
    elif self.opt.fp16_mixed:
        opt_level = "O1"
        keep_batchnorm_fp32 = None
    else:
        opt_level = "O2"
        keep_batchnorm_fp32 = False
    self.opt_level = opt_level
    if self.cuda:
        # NOTE(review): `verbosity=1 if self.opt.verbose else 1` is 1 either
        # way — the else branch was probably meant to be 0.
        self.model, self.optim.optimizer = amp.initialize(
            self.model, self.optim.optimizer,
            opt_level=opt_level,
            keep_batchnorm_fp32=keep_batchnorm_fp32,
            loss_scale="dynamic",
            verbosity=1 if self.opt.verbose else 1)
    if opt.load_from:
        self.model.load_state_dict(checkpoint['model'])
        # NOTE(review): `prec_opt` is not defined anywhere in this method —
        # presumably the checkpoint's saved opt; verify it is set elsewhere
        # or this path raises NameError.
        if prec_opt is not None and hasattr(prec_opt, "fp16_mixed"):
            # Only load amp information if the mode is the same
            # Maybe its better to change between optimization mode?
            if opt.fp16_mixed == prec_opt.fp16_mixed and opt.fp16 == prec_opt.fp16:
                if 'amp' in checkpoint:
                    try:
                        amp.load_state_dict(checkpoint['amp'])
                    except Exception:
                        # loading the amp state can fail
                        pass
    if self.world_size > 1:
        # find_unused_parameters may be required for dropped layer (parameters that are not connected to
        # any particular graph)
        # find_unused_parameters = True
        self.model = DDP(self.model, delay_allreduce=True, gradient_average=False)
    print("[INFO] Process %d ready." % self.rank, flush=True)
def train(opt, dataset):
    """Build and train a VAE-style NMT model (shared src/tgt vocabulary).

    :param opt: parsed option namespace; mutated (``opt.cuda``,
        ``opt.start_epoch``) and replaced wholesale by the checkpointed opt
        when resuming.
    :param dataset: dict with 'train'/'valid' src+tgt tensors and 'dicts'.
    :return: best validation LM negative log-likelihood from ``trainModel``.
    """
    if torch.cuda.is_available() and not opt.gpus:
        print("WARNING: You have a CUDA device, so you should probably run with -gpus 0")
    if opt.gpus:
        cuda.set_device(opt.gpus[0])
        opt.cuda = True
    else:
        opt.cuda = False
    ckpt_path = opt.train_from
    if ckpt_path:
        print('Loading dicts from checkpoint at %s' % ckpt_path)
        checkpoint = torch.load(ckpt_path)
        # Resuming replaces the whole option namespace with the saved one.
        opt = checkpoint['opt']
    print("Loading data from '%s'" % opt.data)
    if ckpt_path:
        dataset['dicts'] = checkpoint['dicts']
    # Make sure the checkpoint directory exists before training starts.
    model_dir = os.path.dirname(opt.save_model)
    if not os.path.isdir(model_dir):
        os.mkdir(model_dir)
    trainData = onmt.Dataset(dataset['train']['src'], dataset['train']['tgt'],
                             opt.batch_size, opt.gpus)
    validData = onmt.Dataset(dataset['valid']['src'], dataset['valid']['tgt'],
                             opt.batch_size, opt.gpus, volatile=True)
    dicts = dataset['dicts']
    print(' * vocabulary size. source = %d; target = %d' %
          (dicts['src'].size(), dicts['tgt'].size()))
    print(' * number of training sentences. %d' %
          len(dataset['train']['src']))
    print(' * maximum batch size. %d' % opt.batch_size)
    print('Building model...')
    # Encoder and decoder share one embedding table, so vocabs must match.
    assert dicts['src'].size() == dicts['tgt'].size()
    word_lut = nn.Embedding(dicts['src'].size(),
                            opt.word_vec_size,
                            padding_idx=onmt.Constants.PAD)
    generator = nn.Sequential(
        nn.Linear(opt.rnn_size, dicts['tgt'].size()),
        nn.LogSoftmax())
    encoder = onmt.Models.Encoder(opt, word_lut)
    decoder = onmt.Models.Decoder(opt, word_lut, generator)
    model = onmt.Models.NMTModel(encoder, decoder, opt)
    if ckpt_path:
        print('Loading model from checkpoint at %s' % ckpt_path)
        model.load_state_dict(checkpoint['model'])
        opt.start_epoch = checkpoint['epoch'] + 1
    if len(opt.gpus) >= 1:
        model.cuda()
    else:
        model.cpu()
    if len(opt.gpus) > 1:
        model = nn.DataParallel(model, device_ids=opt.gpus, dim=1)
    if not ckpt_path:
        # Fresh start: uniform init, optional pretrained embeddings, new optimizer.
        for p in model.parameters():
            p.data.uniform_(-opt.param_init, opt.param_init)
        encoder.load_pretrained_vectors(opt)
        decoder.load_pretrained_vectors(opt)
        optim = onmt.Optim(
            opt.optim, opt.learning_rate, opt.max_grad_norm,
            lr_decay=opt.learning_rate_decay,
            start_decay_at=opt.start_decay_at
        )
        optim.set_parameters(model.parameters())
    else:
        print('Loading optimizer from checkpoint:')
        optim = checkpoint['optim']
        optim.set_parameters(model.parameters())
        # Restore the inner torch optimizer state (momenta etc.).
        optim.optimizer.load_state_dict(checkpoint['optim'].optimizer.state_dict())
    if ckpt_path:
        stats = checkpoint['stats']
    else:
        # BUG FIX: the original literal read `'valid_lm_nll', 'step': []`,
        # which is a syntax error ('valid_lm_nll' had no value). Give it its
        # own empty history list like every other tracked statistic.
        stats = {'train_loss': [], 'train_KLD': [], 'train_KLD_obj': [],
                 'train_accuracy': [], 'kl_rate': [], 'valid_loss': [],
                 'valid_KLD': [], 'valid_accuracy': [],
                 'valid_lm_nll': [], 'step': []}
    nParams = sum([p.nelement() for p in model.parameters()])
    print('* number of parameters: %d' % nParams)
    best_valid_lm_nll = trainModel(model, trainData, validData, dataset,
                                   optim, stats, opt)
    return best_valid_lm_nll
def main():
    """Train an NMT model with optional RAML (importance-sampled) training,
    then report the best/per-epoch BLEU scores collected by ``trainModel``.

    Uses the module-level ``opt`` namespace and the module-level score
    accumulators (``max_valid``, ``max_test``, ``max_epoch``, ``scores``)
    that ``trainModel`` fills in.
    """
    print("Loading data from '%s'" % opt.data)
    dataset = torch.load(opt.data)
    dict_checkpoint = opt.train_from if opt.train_from else opt.train_from_state_dict
    if dict_checkpoint:
        print('Loading dicts from checkpoint at %s' % dict_checkpoint)
        checkpoint = torch.load(dict_checkpoint)
        dataset['dicts'] = checkpoint['dicts']
    trainData = onmt.Dataset(dataset['train']['src'], dataset['train']['tgt'],
                             opt.batch_size, opt.cuda)
    validData = onmt.Dataset(dataset['valid']['src'], dataset['valid']['tgt'],
                             opt.batch_size, opt.cuda, volatile=True)
    if opt.raml_alpha:
        # Wrap the training set in an importance-sampling dataset that
        # perturbs targets by Hamming distance at temperature tau.
        print("Use RAML(alpha) ...")
        print("tau: {}".format(opt.tau))
        print("alpha: {}".format(opt.alpha))
        sampler = onmt.HammingDistanceSampler(
            temperature=opt.tau, max_len=55, voc_min=4,
            voc_max=dataset['dicts']['tgt'].size() - 4)
        trainData = onmt.ISDataset(trainData, sampler)
    dicts = dataset['dicts']
    print(' * vocabulary size. source = %d; target = %d' %
          (dicts['src'].size(), dicts['tgt'].size()))
    print(' * number of training sentences. %d' %
          len(dataset['train']['src']))
    print(' * maximum batch size. %d' % opt.batch_size)
    print('Building model...')
    encoder = onmt.Models.Encoder(opt, dicts['src'])
    decoder = onmt.Models.Decoder(opt, dicts['tgt'])
    generator = nn.Sequential(nn.Linear(opt.rnn_size, dicts['tgt'].size()),
                              nn.LogSoftmax())
    model = onmt.Models.NMTModel(encoder, decoder)
    if opt.train_from:
        print('Loading model from checkpoint at %s' % opt.train_from)
        chk_model = checkpoint['model']
        generator_state_dict = checkpoint['generator']
        # BUG FIX: the original iterated `chk_model` directly
        # (`for k, v in chk_model`), which fails on a model object; the
        # sibling variants in this file take the state dict first and drop
        # generator entries from it.
        model_state_dict = {k: v for k, v in chk_model.state_dict().items()
                            if 'generator' not in k}
        model.load_state_dict(model_state_dict)
        generator.load_state_dict(generator_state_dict)
        opt.start_epoch = checkpoint['epoch'] + 1
    if opt.train_from_state_dict:
        print('Loading model from checkpoint at %s' % opt.train_from_state_dict)
        model.load_state_dict(checkpoint['model'])
        generator.load_state_dict(checkpoint['generator'])
        opt.start_epoch = checkpoint['epoch'] + 1
    if opt.cuda:
        model.cuda()
        generator.cuda()
    else:
        model.cpu()
        generator.cpu()
    model.generator = generator
    if not opt.train_from_state_dict and not opt.train_from:
        # Fresh start: uniform init, pretrained embeddings, new optimizer.
        for p in model.parameters():
            p.data.uniform_(-opt.param_init, opt.param_init)
        encoder.load_pretrained_vectors(opt)
        decoder.load_pretrained_vectors(opt)
        optim = onmt.Optim(opt.optim, opt.learning_rate, opt.max_grad_norm,
                           lr_decay=opt.learning_rate_decay,
                           start_decay_at=opt.start_decay_at)
    else:
        # Resume: reuse the checkpointed Optim but refresh its schedule
        # from the current command-line options.
        print('Loading optimizer from checkpoint:')
        optim = checkpoint['optim']
        optim.lr = opt.learning_rate
        optim.start_decay_at = opt.start_decay_at
        optim.lr_decay = opt.learning_rate_decay
        optim.start_decay = False
        print(optim)
    optim.set_parameters(model.parameters())
    if opt.train_from or opt.train_from_state_dict:
        # Restore the inner torch optimizer state (momenta etc.).
        optim.optimizer.load_state_dict(
            checkpoint['optim'].optimizer.state_dict())
    nParams = sum([p.nelement() for p in model.parameters()])
    print('* number of parameters: %d' % nParams)
    trainModel(model, trainData, validData, dataset, optim)
    # NOTE(review): max_valid/max_test/max_epoch/scores are module globals
    # presumably populated by trainModel — verify they exist at file scope.
    print("\nBest: Valid BLEU: {}, Test BLEU: {} @epoch {}\n".format(
        max_valid[0], max_test[0], max_epoch[0]))
    print("Epoch, Valid BLEU, Test BLEU")
    print("-" * 30)
    for score in scores:
        epoch, valid_bleu, test_bleu = score
        print("{}: {}, {}".format(epoch, valid_bleu, test_bleu))
def main():
    """Train an NMT model that emits *continuous* target embeddings instead of
    a softmax over the vocabulary: the generator maps RNN states to an
    embedding-sized vector, compared against fixed, L2-normalized target
    (word / unigram / ngram) embedding tables.
    """
    print("Loading data from '%s'" % opt.data)
    dataset = torch.load(opt.data)
    dict_checkpoint = opt.train_from if opt.train_from else None
    if dict_checkpoint:
        print('Loading dicts from checkpoint at %s' % dict_checkpoint)
        checkpoint = torch.load(dict_checkpoint)
        dataset['dicts'] = checkpoint['dicts']
    # This fork's Dataset also carries unigram targets and alignments.
    trainData = onmt.Dataset(dataset['train']['src'], dataset['train']['tgt'],
                             dataset['train']['tgt_uni'],
                             dataset['train']['align'],
                             opt.batch_size, opt.gpus)
    validData = onmt.Dataset(dataset['valid']['src'], dataset['valid']['tgt'],
                             dataset['valid']['tgt_uni'],
                             dataset['valid']['align'],
                             opt.batch_size, opt.gpus, volatile=True)
    dicts = dataset['dicts']
    print(' * vocabulary size. source = %d; target = %d' %
          (dicts['src'].size(), dicts['tgt'].size()))
    print(' * number of training sentences. %d' %
          len(dataset['train']['src']))
    print(' * maximum batch size. %d' % opt.batch_size)
    print('Building model...')
    encoder = onmt.Models.Encoder(opt, dicts['src'], opt.fix_src_emb, use_cov=True)
    decoder = onmt.Models.Decoder(opt, dicts['tgt'], opt.tie_emb)
    output_dim = opt.output_emb_size
    if not opt.nonlin_gen:
        generator = nn.Sequential(nn.Linear(opt.rnn_size, output_dim))
    else:
        #add a non-linear layer before generating the continuous vector
        generator = nn.Sequential(nn.Linear(opt.rnn_size, output_dim),
                                  nn.ReLU(),
                                  nn.Linear(output_dim, output_dim))
    #output is just an embedding
    target_embeddings = nn.Embedding(dicts['tgt'].size(), opt.output_emb_size)
    target_uni_embeddings = nn.Embedding(dicts['tgt'].size_uni(), opt.output_emb_size)
    target_ngram_embeddings = nn.Embedding(dicts['tgt'].size_ngram(), opt.output_emb_size)
    #normalize the embeddings (row-wise L2; clamp avoids division by zero)
    norm = dicts['tgt'].embeddings.norm(p=2, dim=1, keepdim=True).clamp(min=1e-12)
    target_embeddings.weight.data.copy_(dicts['tgt'].embeddings.div(norm))
    norm = dicts['tgt'].unigram_embeddings.norm(p=2, dim=1, keepdim=True).clamp(min=1e-12)
    target_uni_embeddings.weight.data.copy_(
        dicts['tgt'].unigram_embeddings.div(norm))
    norm = dicts['tgt'].ngram_embeddings.norm(p=2, dim=1, keepdim=True).clamp(min=1e-12)
    target_ngram_embeddings.weight.data.copy_(
        dicts['tgt'].ngram_embeddings.div(norm))
    #target embeddings are fixed and not trained
    target_embeddings.weight.requires_grad = False
    target_uni_embeddings.weight.requires_grad = False
    target_ngram_embeddings.weight.requires_grad = False
    # elif opt.loss != "maxmargin":
    # with max-margin loss, the target embeddings can be fine-tuned as well.
    # target_embeddings.weight.requires_grad=False
    model = onmt.Models.NMTModel(encoder, decoder)
    if opt.train_from:
        # Checkpoint stores encoder/decoder/generator state dicts separately;
        # re-prefix the keys to load them into the combined NMTModel.
        print('Loading model from checkpoint at %s' % opt.train_from)
        generator_state_dict = checkpoint['generator']
        encoder_state_dict = [('encoder.' + k, v)
                              for k, v in checkpoint['encoder'].items()]
        decoder_state_dict = [('decoder.' + k, v)
                              for k, v in checkpoint['decoder'].items()]
        model_state_dict = dict(encoder_state_dict + decoder_state_dict)
        model.load_state_dict(model_state_dict, strict=False)
        generator.load_state_dict(generator_state_dict)
        if not opt.train_anew:
            #load from
            opt.start_epoch = checkpoint['epoch'] + 1
    if len(opt.gpus) >= 1:
        model.cuda()
        generator.cuda()
        target_embeddings.cuda()
        target_uni_embeddings.cuda()
        target_ngram_embeddings.cuda()
    else:
        model.cpu()
        generator.cpu()
        target_embeddings.cpu()
        target_uni_embeddings.cpu()
        target_ngram_embeddings.cpu()
    if len(opt.gpus) > 1:
        model = nn.DataParallel(model, device_ids=opt.gpus, dim=1)
        generator = nn.DataParallel(generator, device_ids=opt.gpus, dim=0)
    model.generator = generator
    if not opt.train_from:
        # Fresh start: init params, load pretrained vectors, optionally tie
        # and/or fix embeddings, and create a new optimizer.
        for p in model.parameters():
            p.data.uniform_(-opt.param_init, opt.param_init)
        encoder.load_pretrained_vectors(opt)
        decoder.load_pretrained_vectors(opt)
        if opt.tie_emb:
            decoder.tie_embeddings(target_embeddings)
        if opt.fix_src_emb:
            #fix and normalize the source embeddings
            source_embeddings = nn.Embedding(dicts['src'].size(),
                                             opt.output_emb_size)
            norm = dicts['src'].embeddings.norm(p=2, dim=1, keepdim=True).clamp(min=1e-12)
            source_embeddings.weight.data.copy_(
                dicts['src'].embeddings.div(norm))
            #turn this off to initialize embeddings as well as make them trainable
            source_embeddings.weight.requires_grad = False
            if len(opt.gpus) >= 1:
                source_embeddings.cuda()
            else:
                source_embeddings.cpu()
            encoder.fix_embeddings(source_embeddings)
        optim = onmt.Optim(opt.optim, opt.learning_rate, opt.max_grad_norm,
                           lr_decay=opt.learning_rate_decay,
                           start_decay_at=opt.start_decay_at)
    elif opt.train_anew:
        #restart optimizer, sometimes useful for training with
        optim = onmt.Optim(opt.optim, opt.learning_rate, opt.max_grad_norm,
                           lr_decay=opt.learning_rate_decay,
                           start_decay_at=opt.start_decay_at)
    else:
        print('Loading optimizer from checkpoint:')
        optim = checkpoint['optim']
        print(optim)
    optim.set_parameters(model.parameters())
    if opt.train_from and not opt.train_anew:
        # Restore the inner torch optimizer state (momenta etc.).
        optim.optimizer.load_state_dict(
            checkpoint['optim'].optimizer.state_dict())
    # Only trainable parameters are counted (embedding tables are frozen).
    nParams = sum(
        [p.nelement() for p in model.parameters() if p.requires_grad])
    print('* number of trainable parameters: %d' % nParams)
    trainModel(model, trainData, validData, dataset, target_embeddings,
               target_uni_embeddings, target_ngram_embeddings, optim)
def main():
    """Train a hierarchical dialogue model (``memories.hier_model.HierModel``)
    on either key/act-annotated data or plain src/tgt pairs.

    :return: on the normal path, the tuple
        ``(low_ppl, best_e, trn_ppls, val_ppls, checkpoint, opt, nParams)``;
        when ``opt.gather_net_data`` is set, whatever ``gather_data`` returns.
    """
    if torch.cuda.is_available() and not opt.gpus:
        print(
            "WARNING: You have a CUDA device, so you should probably run with -gpus 0"
        )
    if opt.gpus:
        cuda.set_device(opt.gpus[0])
    print(opt)
    if opt.seed > 0:
        torch.manual_seed(opt.seed)
    print("Loading data from '%s'" % opt.data)
    dataset = torch.load(opt.data)
    dict_checkpoint = (opt.train_from
                       if opt.train_from else opt.train_from_state_dict)
    if dict_checkpoint:
        print('Loading dicts from checkpoint at %s' % dict_checkpoint)
        checkpoint = torch.load(dict_checkpoint)
        dataset['dicts'] = checkpoint['dicts']
    if opt.keys or opt.acts:
        # Key/act-annotated data uses the specialized Key_Dataset.
        trainData = memories.Key_Dataset(dataset['train'], opt.batch_size,
                                         opt.gpus, opt.context_size)
        validData = memories.Key_Dataset(dataset['valid'], opt.batch_size,
                                         opt.gpus, opt.context_size,
                                         volatile=True)
        nr_train_points = len(dataset['train']['src_utts'])
    else:
        trainData = memories.Dataset(dataset['train']['src'],
                                     dataset['train']['tgt'], opt.batch_size,
                                     opt.gpus, opt.context_size)
        validData = memories.Dataset(dataset['valid']['src'],
                                     dataset['valid']['tgt'], opt.batch_size,
                                     opt.gpus, opt.context_size,
                                     volatile=True)
        nr_train_points = len(dataset['train']['src'])
    dicts = dataset['dicts']
    print(' * vocabulary size. source = %d; target = %d' %
          (dicts['src'].size(), dicts['tgt'].size()))
    print(' * number of training sentences. %d' % nr_train_points)
    print(' * maximum batch size. %d' % opt.batch_size)
    print('Building model...')
    model = memories.hier_model.HierModel(opt, dicts)
    generator = nn.Sequential(
        nn.Linear(opt.word_vec_size, dicts['tgt'].size()), nn.LogSoftmax())
    if opt.train_from:
        print('Loading model from checkpoint at %s' % opt.train_from)
        chk_model = checkpoint['model']
        # generator_state_dict = chk_model.generator.state_dict()
        # Drop generator entries from the state dict before loading.
        model_state_dict = {
            k: v
            for k, v in chk_model.state_dict().items() if 'generator' not in k
        }
        model.load_state_dict(model_state_dict)
        # generator.load_state_dict(generator_state_dict)
        opt.start_epoch = checkpoint['epoch'] + 1
    if opt.train_from_state_dict:
        print('Loading model from checkpoint at %s' %
              opt.train_from_state_dict)
        model.load_state_dict(checkpoint['model'])
        generator.load_state_dict(checkpoint['generator'])
        opt.start_epoch = checkpoint['epoch'] + 1
    if len(opt.gpus) >= 1:
        model.cuda()
        generator.cuda()
    else:
        model.cpu()
        generator.cpu()
    if len(opt.gpus) > 1:
        # dim=1 splits on the batch dimension of (len, batch) inputs;
        # the generator sees flattened (batch*len, feat), hence dim=0.
        model = nn.DataParallel(model, device_ids=opt.gpus, dim=1)
        generator = nn.DataParallel(generator, device_ids=opt.gpus, dim=0)
    model.generator = generator
    if not opt.train_from_state_dict and not opt.train_from:
        for p in model.parameters():
            p.data.uniform_(-opt.param_init, opt.param_init)
        # encoder.load_pretrained_vectors(opt)
        # decoder.load_pretrained_vectors(opt)
        optim = onmt.Optim(opt.optim, opt.learning_rate, opt.max_grad_norm,
                           lr_decay=opt.learning_rate_decay,
                           start_decay_at=opt.start_decay_at)
    else:
        print('Loading optimizer from checkpoint:')
        optim = checkpoint['optim']
        print(optim)
    optim.set_parameters(model.parameters())
    if opt.train_from or opt.train_from_state_dict:
        # Restore the inner torch optimizer state (momenta etc.).
        optim.optimizer.load_state_dict(
            checkpoint['optim'].optimizer.state_dict())
    nParams = sum([p.nelement() for p in model.parameters()])
    print('* number of parameters: %d' % nParams)
    if opt.gather_net_data:
        # , opt.n_samples)
        return gather_data(model, validData, dataset['dicts'])
    low_ppl, best_e, trn_ppls, val_ppls, checkpoint = trainModel(
        model, trainData, validData, dataset, optim)
    return low_ppl, best_e, trn_ppls, val_ppls, checkpoint, opt, nParams
def main():
    """Python 2 training driver for a multi-style/multi-register NMT setup with
    a discriminator: builds one or more NMT models (optionally one encoder per
    register bin), trains them on mixed bucket iterators, then runs
    translation + evaluation for the configured task.

    Relies on module globals: ``opt``, ``onmt``, ``translate``,
    ``evaluate_file``, ``trainModel`` and ``exp_path``.
    """
    data = "../../data_%s/%s/%s-train.pt" % (opt.task, opt.data, opt.data)
    print("Loading data from '%s'" % data)
    if opt.label_smooth:
        assert opt.num_rb_bin == 2
    dataset = torch.load(data)
    # Length-threshold tables used by the bucket iterators, either separate
    # per side or one shared table.
    if opt.separate_threshold:
        print dataset["src_threshold"]
        print dataset["tgt_threshold"]
        threshold = {
            "src": dataset["src_threshold"][opt.num_rb_bin],
            "tgt": dataset["tgt_threshold"][opt.num_rb_bin]
        }
    else:
        if opt.num_rb_bin > 0:
            single_threshold = dataset['all_threshold'][opt.num_rb_bin]
        else:
            single_threshold = [0]
        threshold = {"src": single_threshold, "tgt": single_threshold}
    print threshold
    dicts = dataset['dicts']
    # Keep an untruncated copy before optionally subsampling the train split.
    ori_datasets = copy.deepcopy(dataset)
    if opt.parallel_ratio is not None:
        parallel_len = l = int(
            len(dataset['train']['src']) * opt.parallel_ratio)
        dataset['train']['src'] = dataset['train']['src'][:l]
        print dataset['train']['src'][-1]
        dataset['train']['tgt'] = dataset['train']['tgt'][:l]
        dataset['train']['src_rb'] = dataset['train']['src_rb'][:l]
        dataset['train']['tgt_rb'] = dataset['train']['tgt_rb'][:l]
    else:
        parallel_len = None
    if opt.separate_encoder == 0:
        # Single shared encoder: one forward iterator, one valid iterator.
        forward_data = onmt.BucketIterator(dataset['train']['src'],
                                           dataset['train']['tgt'],
                                           dataset['train']['src_rb'],
                                           dataset['train']['tgt_rb'], opt,
                                           threshold)
        valid_data = onmt.BucketIterator(dataset['valid']['src'],
                                         dataset['valid']['tgt'],
                                         dataset['valid']['src_rb'],
                                         dataset['valid']['tgt_rb'], opt,
                                         threshold)
        valid_datas = [valid_data]
        valid_weight = [1.]
        valid_probability = [1.]
        train_datas = [forward_data]
        probability = [1.]
        weights = [1.]
        print len(forward_data)
    else:
        # Separate encoders: build one (train, valid) iterator pair per
        # source-register filter value (0 then 1), toggled via
        # opt.filter_src_rb, which the BucketIterator reads.
        opt.filter_src_rb = 0
        forward_data = onmt.BucketIterator(dataset['train']['src'],
                                           dataset['train']['tgt'],
                                           dataset['train']['src_rb'],
                                           dataset['train']['tgt_rb'], opt,
                                           threshold)
        #print len(forward_data)
        valid_data = onmt.BucketIterator(dataset['valid']['src'],
                                         dataset['valid']['tgt'],
                                         dataset['valid']['src_rb'],
                                         dataset['valid']['tgt_rb'], opt,
                                         threshold)
        valid_datas = [valid_data]
        valid_weight = [1.]
        valid_probability = [1.]
        train_datas = [forward_data]
        probability = [1.]
        weights = [1.]
        opt.filter_src_rb = 1
        forward_data = onmt.BucketIterator(dataset['train']['src'],
                                           dataset['train']['tgt'],
                                           dataset['train']['src_rb'],
                                           dataset['train']['tgt_rb'], opt,
                                           threshold)
        valid_data = onmt.BucketIterator(dataset['valid']['src'],
                                         dataset['valid']['tgt'],
                                         dataset['valid']['src_rb'],
                                         dataset['valid']['tgt_rb'], opt,
                                         threshold)
        valid_datas += [valid_data]
        valid_weight += [1.]
        valid_probability += [1.]
        train_datas += [forward_data]
        probability += [1.]
        weights += [1.]
        opt.filter_src_rb = None
    if not opt.no_tgt_to_src:
        # Add the reverse-direction (tgt->src) data and rebalance sampling.
        backwardData = onmt.BucketIterator(dataset['train_bi']['src'],
                                           dataset['train_bi']['tgt'],
                                           dataset['train_bi']['src_rb'],
                                           dataset['train_bi']['tgt_rb'], opt,
                                           threshold)
        train_datas.append(backwardData)
        weights.append(1.)
        probability = [0.5, 0.5]
    trainData = onmt.mixed_iterator(train_datas, probability)
    validData = onmt.mixed_iterator(valid_datas, valid_probability)
    print(' * vocabulary size. source = %d; target = %d' %
          (dicts['src'].size(), dicts['tgt'].size()))
    print(' * number of training sentences. %d' %
          len(dataset['train']['src']))
    print(' * maximum batch size. %d' % opt.batch_size)
    print('Building model...')
    if opt.train_from is None:
        # Decoder/generator/discriminator are shared across all models;
        # only the encoder may be duplicated per register bin.
        decoder = onmt.Models.Decoder(opt, dicts['tgt'],
                                      attn_type=opt.attn_type)
        generator = nn.Sequential(nn.Linear(opt.rnn_size, dicts['tgt'].size()),
                                  nn.LogSoftmax())
        if opt.cuda > 1:
            generator = nn.DataParallel(generator, device_ids=opt.gpus)
        discriminator = onmt.Models.Discriminator(opt)
        if not opt.separate_encoder:
            encoder = onmt.Models.Encoder(opt, dicts['src'])
            models = [
                onmt.Models.NMTModel(encoder, decoder, generator,
                                     discriminator, opt)
            ]
        else:
            models = []
            for i in range(opt.num_rb_bin):
                encoder = onmt.Models.Encoder(opt, dicts['src'])
                models += [
                    onmt.Models.NMTModel(encoder, decoder, generator,
                                         discriminator, opt)
                ]
        optims = []
        for model_single in models:
            if opt.cuda > 1:
                model_single = nn.DataParallel(model_single,
                                               device_ids=opt.gpus)
            if opt.cuda:
                model_single.cuda()
            else:
                model_single.cpu()
            model_single.generator = generator
            for p in model_single.get_seq2seq_parameters():
                p.data.uniform_(-opt.param_init, opt.param_init)
            for p in model_single.get_disc_parameters():
                if opt.non_linear == "relu":
                    # NOTE(review): this sets `opt.adv_para_init` but the next
                    # line reads `opt.adv_param_init` — almost certainly a
                    # typo, so the ReLU-specific init value is never used.
                    opt.adv_para_init = 2. / opt.disc_size
                p.data.uniform_(-opt.adv_param_init, opt.adv_param_init)
            optim_single = onmt.Optim(
                model_single.parameters(),
                model_single.get_seq2seq_parameters(),
                model_single.get_disc_parameters(),
                model_single.get_encoder_parameters(),
                opt.optim,
                opt.learning_rate,
                opt.max_grad_norm,
                lr_decay=opt.learning_rate_decay,
                start_decay_at=opt.start_decay_at,
                adam_momentum=opt.adam_momentum,
            )
            optims += [optim_single]
    else:
        # NOTE(review): this resume path never populates `models`/`optims`,
        # yet both are used below — resuming would raise NameError; verify.
        print('Loading from checkpoint at %s' % opt.train_from)
        checkpoint = torch.load(opt.train_from)
        model_single = checkpoint['model']
        if opt.cuda:
            model_single.cuda()
        else:
            model_single.cpu()
        optim_single = checkpoint['optim']
        opt.start_epoch = checkpoint['epoch'] + 1
    nParams = sum([
        p.nelement() for model_single in models
        for p in model_single.parameters()
    ])
    print('* number of parameters: %d' % nParams)
    trainModel(models, trainData, validData, dataset, optims, dicts, weights,
               valid_weight, threshold)
    # Post-training: translate the test set(s) and score the output.
    if opt.task == "MT":
        translate.main([
            "-task", opt.task, "-data", opt.data, "-model",
            "%s/model.pt" % exp_path, "-replace_unk", "-gpus",
            str(opt.gpus[0]), "-output",
            "%s/test_no_unk.txt" % exp_path, "-verbose"
        ])
        evaluate_file.main([
            "-task", opt.task, "-data", opt.data, "-outputs",
            "%s/test_no_unk.txt" % exp_path
        ])
    elif opt.task == "Multi-MT":
        for test_set in ["test"]:
            for language_pair in dataset["language_pairs"]:
                line = language_pair.split("-")
                S_lang = line[0]
                T_lang = line[1]
                print "test_set", test_set + "_" + language_pair
                if opt.filter_src_rb is None or opt.filter_src_rb == dataset[
                        "src_language_mapping"][S_lang]:
                    translate.main([
                        "-task", opt.task, "-data", opt.data, "-model",
                        "%s/model.pt" % exp_path, "-replace_unk", "-gpus",
                        str(opt.gpus[0]), "-output",
                        "%s/%s_%s_no_unk.txt" %
                        (exp_path, test_set, language_pair), "-verbose",
                        "-language_pair", language_pair, "-test_set",
                        test_set, "-bpe"
                    ])
                    evaluate_file.main([
                        "-task", opt.task, "-data", opt.data, "-outputs",
                        "%s/%s_%s_no_unk.txt" %
                        (exp_path, test_set, language_pair),
                        "-language_pair", language_pair, "-test_set", test_set
                    ])
                else:
                    print "BLEU 0.0, SARI 0.00, R1 0.00, R2 0.00, RL 0.00, FK_O 0.0, acc 0.00"
    else:
        # Other tasks: evaluate once per register bin, then unconstrained.
        for i in range(opt.num_rb_bin):
            translate.main([
                "-task", opt.task, "-data", opt.data, "-model",
                "%s/model.pt" % exp_path, "-replace_unk", "-gpus",
                str(opt.gpus[0]), "-output",
                "%s/test_no_unk.txt" % exp_path, "-verbose", "-tgt_rb_all",
                str(i)
            ])
            evaluate_file.main([
                "-task", opt.task, "-data", opt.data, "-outputs",
                "%s/test_no_unk.txt" % exp_path, "-single_rb",
                str(i)
            ])
            print "all rb", i
        translate.main([
            "-task", opt.task, "-data", opt.data, "-model",
            "%s/model.pt" % exp_path, "-replace_unk", "-gpus",
            str(opt.gpus[0]), "-output",
            "%s/test_no_unk.txt" % exp_path, "-verbose"
        ])
        evaluate_file.main([
            "-task", opt.task, "-data", opt.data, "-outputs",
            "%s/test_no_unk.txt" % exp_path
        ])
def main():
    """Train a decoder-only model conditioned on a frozen pretrained encoder
    and an emotion classifier (EmoGRU): builds decoder/generator/classifier
    heads, optionally restores them from a checkpoint, and trains.

    Uses the module-level ``opt`` namespace.
    """
    print("Loading data from '%s'" % opt.data)
    dataset = torch.load(opt.data)
    dict_checkpoint = opt.train_from if opt.train_from else opt.train_from_state_dict
    if dict_checkpoint:
        print('Loading dicts from checkpoint at %s' % dict_checkpoint)
        checkpoint = torch.load(dict_checkpoint)
        dataset['dicts'] = checkpoint['dicts']
    trainData = onmt.Dataset(dataset['train']['src'], dataset['train']['tgt'],
                             opt.batch_size, opt.gpus)
    validData = onmt.Dataset(dataset['valid']['src'], dataset['valid']['tgt'],
                             opt.batch_size, opt.gpus, volatile=True)
    dicts = dataset['dicts']
    print(' * vocabulary size. source = %d; target = %d' %
          (dicts['src'].size(), dicts['tgt'].size()))
    print(' * number of training sentences. %d' %
          len(dataset['train']['src']))
    print(' * maximum batch size. %d' % opt.batch_size)
    print('Loading Encoder Model ...')
    # map_location keeps checkpoint tensors on CPU until explicitly moved.
    enc_check = torch.load(opt.encoder_model,
                           map_location=lambda storage, loc: storage)
    m_opt = enc_check['opt']
    src_dict = enc_check['dicts']['src']
    encoder = onmt.Models.Encoder(m_opt, src_dict)
    encoder.load_state_dict(enc_check['encoder'])
    print('Loading CNN Classifier Model ...')
    class_check = torch.load(opt.classifier_model,
                             map_location=lambda storage, loc: storage)
    class_opt = class_check['opt']
    class_dict = class_check['vocabulary']
    class_model = emoModel.EmoGRU(class_opt["vocab_inp_size"],
                                  class_opt["embedding_dim"],
                                  class_opt["units"], opt.batch_size,
                                  class_opt["target_size"])
    # class_model = onmt.CNNModels.ConvNet(class_opt, class_dict)
    class_model.load_state_dict(class_check['model'])
    print('Building model...')
    decoder = onmt.Models_decoder.Decoder(opt, dicts['tgt'])
    generator = nn.Sequential(nn.Linear(opt.rnn_size, dicts['tgt'].size()),
                              nn.LogSoftmax())
    class_input = nn.Sequential(nn.Linear(opt.rnn_size, class_dict.size()))
    # BUG FIX: the model must exist before any checkpoint state can be loaded
    # into it. The original constructed it only *after* the `opt.train_from`
    # branch below, so that branch crashed with NameError on `model`.
    model = onmt.Models_decoder.DecoderModel(decoder)
    if opt.train_from:
        print('Loading model from checkpoint at %s' % opt.train_from)
        chk_model = checkpoint['model']
        generator_state_dict = chk_model.generator.state_dict()
        # Drop generator entries: the generator is restored separately.
        model_state_dict = {
            k: v
            for k, v in chk_model.state_dict().items() if 'generator' not in k
        }
        model.load_state_dict(model_state_dict)
        generator.load_state_dict(generator_state_dict)
        opt.start_epoch = checkpoint['epoch'] + 1
    if opt.train_from_state_dict:
        print('Loading model from checkpoint at %s' %
              opt.train_from_state_dict)
        # State-dict checkpoints store decoder and generator separately;
        # `model` wraps the same decoder object, so loading into `decoder`
        # updates the model too.
        decoder.load_state_dict(checkpoint['decoder'])
        generator.load_state_dict(checkpoint['generator'])
        opt.start_epoch = checkpoint['epoch'] + 1
    if len(opt.gpus) >= 1:
        encoder.cuda()
        model.cuda()
        class_model.cuda()
        generator.cuda()
        class_input.cuda()
    else:
        encoder.cpu()
        model.cpu()
        class_model.cpu()
        generator.cpu()
        class_input.cpu()
    if len(opt.gpus) > 1:
        encoder = nn.DataParallel(encoder, device_ids=opt.gpus, dim=1)
        model = nn.DataParallel(model, device_ids=opt.gpus, dim=1)
        generator = nn.DataParallel(generator, device_ids=opt.gpus, dim=0)
        class_input = nn.DataParallel(class_input, device_ids=opt.gpus, dim=0)
    if not opt.train_from_state_dict and not opt.train_from:
        # Fresh start: uniform init, pretrained decoder embeddings, new optimizer.
        for p in model.parameters():
            p.data.uniform_(-opt.param_init, opt.param_init)
        decoder.load_pretrained_vectors(opt)
        optim = onmt.Optim(opt.optim, opt.learning_rate, opt.max_grad_norm,
                           lr_decay=opt.learning_rate_decay,
                           start_decay_at=opt.start_decay_at)
    else:
        print('Loading optimizer from checkpoint:')
        optim = checkpoint['optim']
        print(optim)
    optim.set_parameters(model.parameters())
    # Attach frozen/auxiliary modules after the optimizer captured its
    # parameter list, so they are not optimized.
    model.encoder = encoder
    model.generator = generator
    model.class_input = class_input
    model.class_model = class_model
    if opt.train_from or opt.train_from_state_dict:
        # Restore the inner torch optimizer state (momenta etc.).
        optim.optimizer.load_state_dict(
            checkpoint['optim'].optimizer.state_dict())
    nParams = sum([p.nelement() for p in model.parameters()])
    print('* number of parameters: %d' % nParams)
    trainModel(model, trainData, validData, dataset, optim)
def main(): print("Loading data from '%s'" % opt.data) dataset = torch.load(opt.data) dict_checkpoint = opt.train_from if opt.train_from else opt.train_from_state_dict if dict_checkpoint: print('Loading dicts from checkpoint at %s' % dict_checkpoint) checkpoint = torch.load(dict_checkpoint) dataset['dicts'] = checkpoint['dicts'] old_opt = checkpoint['opt'] cur_opt = old_opt if dict_checkpoint else opt trainData = onmt.Dataset(dataset['train']['src'], dataset['train']['tgt'], opt.batch_size, opt.gpus) validData = onmt.Dataset(dataset['valid']['src'], dataset['valid']['tgt'], opt.batch_size, opt.gpus, volatile=True) dicts = dataset['dicts'] print(' * vocabulary size. source = %d; target = %d' % (dicts['src'].size(), dicts['tgt'].size())) print(' * number of training sentences. %d' % len(dataset['train']['src'])) print(' * maximum batch size. %d' % opt.batch_size) print('Building model...') encoder = onmt.Models.Encoder(cur_opt, dicts['src']) decoder = onmt.Models.DecoderWithMultiAttn(cur_opt, dicts['tgt']) # decoder = onmt.Models.Decoder(cur_opt, dicts['tgt']) generator = nn.Sequential(nn.Linear(cur_opt.rnn_size, dicts['tgt'].size()), nn.LogSoftmax()) if opt.sync_decode_emb: generator.weight = decoder.word_lut.weight model = onmt.Models.NMTModel(encoder, decoder, cur_opt.attn_use_emb) if opt.train_from: print('Loading model from checkpoint at %s' % opt.train_from) chk_model = checkpoint['model'] generator_state_dict = chk_model.generator.state_dict() model_state_dict = { k: v for k, v in chk_model.state_dict().items() if 'generator' not in k } model.load_state_dict(model_state_dict) generator.load_state_dict(generator_state_dict) opt.start_epoch = checkpoint['epoch'] + 1 if opt.train_from_state_dict: print('Loading model from checkpoint at %s' % opt.train_from_state_dict) model.load_state_dict(checkpoint['model']) generator.load_state_dict(checkpoint['generator']) opt.start_epoch = checkpoint['epoch'] + 1 if len(opt.gpus) >= 1: model.cuda() generator.cuda() else: 
model.cpu() generator.cpu() if len(opt.gpus) > 1: model = nn.DataParallel(model, device_ids=opt.gpus, dim=1) generator = nn.DataParallel(generator, device_ids=opt.gpus, dim=0) model.generator = generator if not opt.train_from_state_dict and not opt.train_from: for p in model.parameters(): p.data.uniform_(-opt.param_init, opt.param_init) encoder.load_pretrained_vectors(opt) decoder.load_pretrained_vectors(opt) optim = onmt.Optim(opt.optim, opt.learning_rate, opt.max_grad_norm, momentum=opt.momentum, lr_decay=opt.learning_rate_decay, start_decay_at=opt.start_decay_at) else: print('Loading optimizer from checkpoint:') optim = checkpoint['optim'][0] if opt.learning_rate != parser.get_default('learning_rate'): optim.setLearningRate(opt.learning_rate) if opt.start_decay_at > parser.get_default('start_decay_at'): optim.setStartDecay(opt.start_decay_at) if opt.optim != optim.method: print "Change optim method", optim.method, ' -> ', opt.optim optim.setMethod(opt.optim) print(optim) optim.set_parameters(model.parameters()) if (opt.train_from or opt.train_from_state_dict) and \ (opt.optim == old_opt.optim): # print old_opt.optim # print checkpoint['optim'][1] optim.optimizer.load_state_dict(checkpoint['optim'][1]) nParams = sum([p.nelement() for p in model.parameters()]) print('* number of parameters: %d' % nParams) trainModel(model, trainData, validData, dataset, optim)
def _build_optimizer(self):
    """Construct a fresh onmt.Optim configured from this object's args."""
    return onmt.Optim(self.args)
def main():
    """Load preprocessed data/vocab, build or restore the model via
    make_base_model, set up the optimizer, and start training.

    NOTE(review): formatting reconstructed from a whitespace-mangled paste;
    nesting follows the code's own if/else markers.
    """
    global opt
    print("Loading data from '%s'" % opt.data)
    train = torch.load(opt.data + '.train.pt')
    fields = onmt.IO.ONMTDataset.load_fields(
        torch.load(opt.data + '.vocab.pt'))
    valid = torch.load(opt.data + '.valid.pt')
    # Keep only the fields actually present on the training examples.
    fields = dict([(k, f) for (k, f) in fields.items()
                   if k in train.examples[0].__dict__])
    train.fields = fields
    valid.fields = fields
    src_features = [fields["src_feat_" + str(j)]
                    for j in range(train.nfeatures)]

    # Default: build a fresh model with the CLI options; a checkpoint (if
    # given) overrides both the vocab fields and the model options.
    model_opt = opt
    checkpoint = None
    dict_checkpoint = opt.train_from
    if dict_checkpoint:
        print('Loading dicts from checkpoint at %s' % dict_checkpoint)
        # map_location keeps checkpoint tensors on CPU at load time.
        checkpoint = torch.load(dict_checkpoint,
                                map_location=lambda storage, loc: storage)
        fields = onmt.IO.ONMTDataset.load_fields(checkpoint['vocab'])
        model_opt = checkpoint["opt"]

    print(' * vocabulary size. source = %d; target = %d' %
          (len(fields['src'].vocab), len(fields['tgt'].vocab)))
    for j, feat in enumerate(src_features):
        print(' * src feature %d size = %d' % (j, len(feat.vocab)))
    print(' * number of training sentences. %d' % len(train))
    print(' * maximum batch size. %d' % opt.batch_size)

    print('Building model...')
    model = onmt.Models.make_base_model(opt, model_opt, fields, checkpoint)
    print(model)

    if opt.train_from:
        print('Loading model from checkpoint at %s' % opt.train_from)
        opt.start_epoch = checkpoint['epoch'] + 1

    if len(opt.gpuid) > 1:
        print('Multi gpu training ', opt.gpuid)
        # dim=1: batches are time-major, so scatter across the batch dim.
        model = nn.DataParallel(model, device_ids=opt.gpuid, dim=1)
        # generator = nn.DataParallel(generator, device_ids=opt.gpuid, dim=0)

    if not opt.train_from:
        if opt.param_init != 0.0:
            print('Intializing params')
            for p in model.parameters():
                p.data.uniform_(-opt.param_init, opt.param_init)
        model.encoder.embeddings.load_pretrained_vectors(opt.pre_word_vecs_enc)
        model.decoder.embeddings.load_pretrained_vectors(opt.pre_word_vecs_dec)
        optim = onmt.Optim(
            opt.optim, opt.learning_rate, opt.max_grad_norm,
            lr_decay=opt.learning_rate_decay,
            start_decay_at=opt.start_decay_at,
            opt=opt
        )
    else:
        print('Loading optimizer from checkpoint:')
        optim = checkpoint['optim']
        print(optim)

    if opt.train_from:
        # Re-load the optimizer internals after set_parameters below would
        # otherwise rebuild them empty.
        optim.optimizer.load_state_dict(
            checkpoint['optim'].optimizer.state_dict())
    optim.set_parameters(model.parameters())

    n_params = sum([p.nelement() for p in model.parameters()])
    print('* number of parameters: %d' % n_params)
    # Per-component parameter counts (anything not encoder/decoder is listed).
    enc = 0
    dec = 0
    for name, param in model.named_parameters():
        if 'encoder' in name:
            enc += param.nelement()
        elif 'decoder' in name:
            dec += param.nelement()
        else:
            print(name, param.nelement())
    print('encoder: ', enc)
    print('decoder: ', dec)

    check_model_path()
    train_model(model, train, valid, fields, optim)
def __init__(self, model, loss_function, train_data, valid_data, dicts, opt,
             setup_optimizer=True):
    """Trainer setup: optional LID/CTC auxiliary losses, CUDA placement,
    optimizer creation and apex-amp initialization.

    NOTE(review): formatting reconstructed from a whitespace-mangled paste;
    the fp16/amp section is nested under `setup_optimizer` — confirm against
    the original file.
    """
    super().__init__(model, loss_function, train_data, valid_data, dicts, opt)

    if opt.lfv_multilingual:
        # Auxiliary language-ID loss for multilingual training.
        from onmt.models.speech_recognizer.lid_loss import CrossEntropyLIDLoss
        lid_loss = CrossEntropyLIDLoss(opt.n_languages, opt.label_smoothing,
                                       opt.fast_xentropy)
        self.loss_function.add_loss_function(lid_loss, 'lid_loss')

    self.n_gpus = len(self.opt.gpus)

    if opt.ctc_loss != 0:
        from onmt.speech.ctc_loss import CTC
        self.ctc_loss_function = CTC(dicts['tgt'].size(), opt.model_size,
                                     0.0, reduce=True)

    if self.cuda:
        torch.cuda.set_device(self.opt.gpus[0])
        if self.opt.seed >= 0:
            torch.manual_seed(self.opt.seed)
        self.loss_function = self.loss_function.cuda()
        self.model = self.model.cuda()
        if opt.ctc_loss > 0.0:
            self.ctc_loss_function = self.ctc_loss_function.cuda()

    if setup_optimizer:
        self.optim = onmt.Optim(opt)
        self.optim.set_parameters(self.model.parameters())

        # Map the fp16 flags onto apex-amp optimization levels.
        if not self.opt.fp16:
            opt_level = "O0"
            keep_batchnorm_fp32 = False
        elif self.opt.fp16_mixed:
            opt_level = "O1"
            keep_batchnorm_fp32 = None
        else:
            opt_level = "O2"
            keep_batchnorm_fp32 = False

        if self.cuda:
            self.model, self.optim.optimizer = amp.initialize(
                self.model, self.optim.optimizer,
                opt_level=opt_level,
                keep_batchnorm_fp32=keep_batchnorm_fp32,
                loss_scale="dynamic",
                verbosity=1 if self.opt.verbose else 0)

    # An ugly hack to switch between align right and align left
    if hasattr(self.model, 'relative'):
        if self.model.relative:
            self.train_data.src_align_right = True
            self.train_data.tgt_align_right = False
            self.valid_data.src_align_right = True
            self.valid_data.tgt_align_right = False
def __init__(self, device, train_data, valid_data, dicts, opt,
             setup_optimizer=True):
    """Single-node distributed (NCCL) trainer setup.

    Initializes the process group, builds the model and loss function inside
    this process (pre-built models cannot be pickled across processes),
    synchronizes initial parameters across ranks, then sets up the optimizer,
    apex-amp, and DDP wrapping.

    :param device: int (GPU id); doubles as the process rank on a single node.
    :param train_data:
    :param valid_data:
    :param dicts:
    :param opt:
    :param setup_optimizer: skip optimizer/amp setup when False.

    NOTE(review): formatting reconstructed from a whitespace-mangled paste.
    """
    # self.model = model
    # self.model = model
    # self.loss_function = loss_function
    self.device = device
    opt.node_rank = 0
    opt.nodes = 1
    self.world_size = len(opt.gpus)

    # in the case of single node distributed, it should equal self.device
    self.rank = self.device

    # make a group to later use with dist.all_reduce
    self.group = dist.group.WORLD

    self.print("[INFO] Training Options:", opt)
    dist.init_process_group(backend='nccl', init_method='env://',
                            world_size=self.world_size, rank=self.rank)

    self.model = None

    # Non-main ranks get deep copies so dataset state is not shared.
    if self.rank == 0:
        self.train_data = train_data
        self.valid_data = valid_data
    else:
        self.train_data = copy.deepcopy(train_data)
        self.valid_data = copy.deepcopy(valid_data)

    self.dicts = dicts
    self.opt = opt
    self.cuda = (len(opt.gpus) >= 1 and opt.gpus[0] >= 0)
    assert self.cuda, "[ERROR] Training is only available on GPUs."

    self.start_time = 0

    # setting up models and others
    if opt.lfv_multilingual:
        from onmt.models.speech_recognizer.lid_loss import CrossEntropyLIDLoss
        lid_loss = CrossEntropyLIDLoss(opt.n_languages, opt.label_smoothing,
                                       opt.fast_xentropy)
        self.loss_function.add_loss_function(lid_loss, 'lid_loss')

    torch.manual_seed(self.opt.seed)

    # note: we must start creating models after ccreating the processes
    # for some reason passing a pre-created model to a process creates a
    # "pickle" error
    if not opt.fusion:
        if self.is_main():
            print("BUILDING MODEL .... ", flush=True)
        model = build_model(opt, dicts)

        """ Building the loss function """
        if opt.ctc_loss != 0:
            loss_function = NMTAndCTCLossFunc(dicts['tgt'].size(),
                                              label_smoothing=opt.label_smoothing,
                                              ctc_weight=opt.ctc_loss)
        elif opt.nce:
            from onmt.modules.nce.nce_loss import NCELoss
            loss_function = NCELoss(opt.model_size, dicts['tgt'].size(),
                                    noise_ratio=opt.nce_noise,
                                    logz=9,
                                    label_smoothing=opt.label_smoothing)
        else:
            loss_function = NMTLossFunc(opt.model_size, dicts['tgt'].size(),
                                        label_smoothing=opt.label_smoothing,
                                        mirror=opt.mirror_loss,
                                        fast_xentropy=opt.fast_xentropy)

        # This function replaces modules with the more optimized counterparts
        # so that it can run faster. Currently exp with LayerNorm
        if not opt.memory_profiling:
            # distributed is required to convert BatchNorm to SyncBatchNorm
            # for DDP
            optimize_model(model, distributed=(self.world_size > 1))
            # optimize_model(model)
        init_model_parameters(model, opt)
        self.model = model
        self.loss_function = loss_function

    if self.cuda:
        torch.cuda.set_device(self.device)
        self.loss_function = self.loss_function.cuda(device=self.device)
        self.model = self.model.cuda(device=self.device)

    # Ensure that the distributed copies have the same initial parameters.
    # Manual seed may not work the same for different GPU models: zero out
    # every non-main rank, then all-reduce so every rank holds rank 0's
    # parameters.
    if self.world_size > 1:
        params = [p for p in self.model.parameters()]

        with torch.no_grad():
            if not self.is_main():
                for p in params:
                    p.zero_()
            else:
                for p in params:
                    p.add_(0)

    if self.world_size > 1:
        params = [p for p in self.model.parameters()]
        all_reduce_and_rescale_tensors(params, 1)

    if setup_optimizer:
        self.optim = onmt.Optim(opt)
        self.optim.set_parameters(self.model.parameters())

        if self.is_main():
            print("[INFO] Optimizer: ", self.optim.optimizer)

        # Map the fp16 flags onto apex-amp optimization levels.
        if not self.opt.fp16:
            opt_level = "O0"
            keep_batchnorm_fp32 = False
        elif self.opt.fp16_mixed:
            opt_level = "O1"
            keep_batchnorm_fp32 = None
        else:
            opt_level = "O2"
            keep_batchnorm_fp32 = False

        if self.cuda:
            self.model, self.optim.optimizer = amp.initialize(
                self.model, self.optim.optimizer,
                opt_level=opt_level,
                keep_batchnorm_fp32=keep_batchnorm_fp32,
                loss_scale="dynamic",
                verbosity=1 if self.opt.verbose else 0)

        # wrap the model into DDP after initializing by amp
        if self.world_size > 1:
            """
            delay_allreduce is required to avoid allreduce error during
            backward pass
            """
            self.model = DDP(self.model, delay_allreduce=True,
                             gradient_average=False)

            # torch DDP is more likely to work with the official amp autocast
            # self.model = torch.nn.parallel.DistributedDataParallel(
            #     self.model, device_ids=[self.rank],
            #     output_device=self.rank,
            #     find_unused_parameters=True)

    print("[INFO] Process %d ready." % self.rank, flush=True)
def main():
    """Load the dataset, build or restore model + sampled-vocab generator,
    set up the optimizer, and start training.

    Relies on the module-level `opt` namespace (parsed CLI options).
    NOTE(review): formatting reconstructed from a whitespace-mangled paste.
    """
    print("Loading data from '%s'" % opt.data)
    dataset = torch.load(opt.data)

    # When resuming, dicts come from the checkpoint instead of the dataset.
    dict_checkpoint = opt.train_from if opt.train_from else opt.train_from_state_dict
    if dict_checkpoint:
        print('Loading dicts from checkpoint at %s' % dict_checkpoint)
        # map_location keeps checkpoint tensors on CPU at load time.
        checkpoint = torch.load(dict_checkpoint,
                                map_location=lambda storage, loc: storage)
        dataset['dicts'] = checkpoint['dicts']
    dicts = dataset['dicts']

    trainData = onmt.Dataset(dataset['train']['src'], dataset['train']['tgt'],
                             opt.batch_size, opt.gpus,
                             sample_size=opt.sample_vocab,
                             tgtVocab_size=dicts['tgt'].size())
    validData = onmt.Dataset(dataset['valid']['src'], dataset['valid']['tgt'],
                             opt.batch_size, opt.gpus, volatile=True)

    print(' * vocabulary size. source = %d; target = %d' %
          (dicts['src'].size(), dicts['tgt'].size()))
    print(' * number of training sentences. %d' %
          len(dataset['train']['src']))
    print(' * maximum batch size. %d' % opt.batch_size)

    print('Building model...')
    encoder = onmt.Models.Encoder(opt, dicts['src'])
    decoder = onmt.Models.Decoder(opt, dicts['tgt'])
    generator = onmt.Generator(opt, dicts['tgt'].size(), decoder.word_lut,
                               desc=opt.desc)
    model = onmt.Models.NMTModel(encoder, decoder)

    if opt.train_from:
        print('Loading model from checkpoint at %s' % opt.train_from)
        chk_model = checkpoint['model']
        generator_state_dict = chk_model.generator.state_dict()
        # Strip generator parameters; the generator is restored separately.
        model_state_dict = {k: v for k, v in chk_model.state_dict().items()
                            if 'generator' not in k}
        model.load_state_dict(model_state_dict)
        generator.load_state_dict(generator_state_dict)
        opt.start_epoch = checkpoint['epoch'] + 1

    if opt.train_from_state_dict:
        print('Loading model from checkpoint at %s' % opt.train_from_state_dict)
        model.load_state_dict(checkpoint['model'])
        generator.load_state_dict(checkpoint['generator'])
        opt.start_epoch = checkpoint['epoch'] + 1

    if len(opt.gpus) >= 1:
        model.cuda()
        generator.cuda()
    else:
        model.cpu()
        generator.cpu()

    if len(opt.gpus) > 1:
        # dim=1: batches are time-major, so scatter across the batch dim.
        model = nn.DataParallel(model, device_ids=opt.gpus, dim=1)
        generator = nn.DataParallel(generator, device_ids=opt.gpus, dim=0)

    model.generator = generator

    if not opt.train_from_state_dict and not opt.train_from:
        for p in model.parameters():
            p.data.uniform_(-opt.param_init, opt.param_init)
        encoder.load_pretrained_vectors(opt)
        decoder.load_pretrained_vectors(opt)
        optim = onmt.Optim(
            opt.optim, opt.learning_rate, opt.max_grad_norm,
            lr_decay=opt.learning_rate_decay,
            start_decay_at=opt.start_decay_at
        )
    else:
        print('Loading optimizer from checkpoint:')
        optim = checkpoint['optim']
        print(optim)

    if opt.train_from or opt.train_from_state_dict:
        # Restore optimizer internals saved in the checkpoint.
        optim.optimizer.load_state_dict(
            checkpoint['optim'].optimizer.state_dict())
    optim.set_parameters(model.parameters())

    nParams = sum([p.nelement() for p in model.parameters()])
    if opt.generator in ['simple', 'tie']:
        print('* number of parameters: %d' % nParams)
    else:
        # Sampled-vocab generators: exclude the full output projection,
        # which is not actually used, from the reported count.
        linshape = model.generator.linear.weight.shape
        not_used = linshape[0] * linshape[1]
        nParams = nParams - not_used
        print('* number of parameters: %d' % nParams)
    trainModel(model, trainData, validData, dataset, optim)
# NOTE(review): mid-function fragment — the enclosing `def` lies outside this
# chunk; indentation below is reconstructed.
if len(opt.gpus) >= 1:
    model.cuda()
    vocab_dist_gen.cuda()
    final_dist_gen.cuda()

# Pointer-generator style heads attached to the model.
model.vocab_dist_gen = vocab_dist_gen
model.final_dist_gen = final_dist_gen

if not opt.train_from_state_dict and not opt.train_from:
    for p in model.parameters():
        p.data.uniform_(-opt.param_init, opt.param_init)
    encoder.load_pretrained_vectors(opt)
    decoder.load_pretrained_vectors(opt)
    optim = onmt.Optim(opt.optim, opt.learning_rate, opt.max_grad_norm,
                       lr_decay=opt.learning_rate_decay,
                       start_decay_at=opt.start_decay_at)

optim.set_parameters(model.parameters())

nParams = sum([p.nelement() for p in model.parameters()])
print('* number of parameters: %d' % nParams)

# trainModel(model, trainData, validData, dataset, optim)
criterion = NMTCriterion(dataset['dicts']['tgt'].size())

# shuffle mini batch order
# batchOrder = torch.randperm(len(trainData))

# Running accumulators for the (inlined) training loop that follows.
total_loss, total_words, total_num_correct = 0, 0, 0
report_loss, report_tgt_words, report_src_words, report_num_correct = 0, 0, 0, 0
batchIdx = 1500
def main():
    """Load data (redis or torch-serialized), build or restore the model,
    and hand off training to a MultiprocessingTrainer.

    Relies on the module-level `opt` namespace (parsed CLI options).
    NOTE(review): formatting reconstructed from a whitespace-mangled paste.
    """
    # Set up the Crayon logging server.
    if opt.log_server != "":
        from pycrayon import CrayonClient
        cc = CrayonClient(hostname=opt.log_server)

        experiments = cc.get_experiment_names()
        print(experiments)
        if opt.experiment_name in experiments:
            cc.remove_experiment(opt.experiment_name)
        opt.experiment_name = cc.create_experiment(opt.experiment_name)

    print("Loading data from '%s'" % opt.data)
    dataset = torch.load(opt.data)

    dict_checkpoint = (opt.train_from if opt.train_from
                       else opt.train_from_state_dict)
    if dict_checkpoint:
        print('Loading dicts from checkpoint at %s' % dict_checkpoint)
        # map_location keeps checkpoint tensors on CPU at load time.
        checkpoint = torch.load(dict_checkpoint,
                                map_location=lambda storage, loc: storage)
        # dataset['dicts'] = checkpoint['dicts']

    if opt.redis:
        # Streaming datasets backed by a redis server.
        trainData = onmt.RedisDataset("train", opt.batch_size, False,
                                      reverse=opt.reverse, port=opt.port,
                                      db=opt.db, r2l=opt.r2l)
        validData = onmt.RedisDataset('valid', opt.batch_size, False,
                                      volatile=True, reverse=opt.reverse,
                                      port=opt.port, r2l=opt.r2l, db=opt.db)
    else:
        trainData = onmt.Dataset(dataset['train']['src'], dataset['train']['tgt'],
                                 opt.batch_size, False,
                                 data_type=dataset.get("type", "text"),
                                 srcFeatures=dataset['train'].get('src_features'),
                                 tgtFeatures=dataset['train'].get('tgt_features'),
                                 alignment=dataset['train'].get('alignments'))
        validData = onmt.Dataset(dataset['valid']['src'], dataset['valid']['tgt'],
                                 opt.batch_size, False, volatile=True,
                                 data_type=dataset.get("type", "text"),
                                 srcFeatures=dataset['valid'].get('src_features'),
                                 tgtFeatures=dataset['valid'].get('tgt_features'),
                                 alignment=dataset['valid'].get('alignments'))

    dicts = dataset['dicts']
    if opt.reverse:
        # Train the reverse direction: swap source/target vocabularies.
        dicts['src'], dicts['tgt'] = dicts['tgt'], dicts['src']
        dicts['src_features'], dicts['tgt_features'] = \
            dicts['tgt_features'], dicts['src_features']

    print(' * vocabulary size. source = %d; target = %d' %
          (dicts['src'].size(), dicts['tgt'].size()))
    # if 'src_features' in dicts:
    #     for j in range(len(dicts['src_features'])):
    #         print(' * src feature %d size = %d' %
    #               (j, dicts['src_features'][j].size()))
    # print(' * number of training sentences. %d' %
    #       len(dataset['train']['src']))
    print(' * maximum batch size. %d' % opt.batch_size)

    print('Building model...')

    if opt.encoder_type == "text":
        encoder = onmt.Models.Encoder(opt, dicts['src'],
                                      dicts.get('src_features', None))
    elif opt.encoder_type == "img":
        encoder = onmt.modules.ImageEncoder(opt)
        assert("type" not in dataset or dataset["type"] == "img")
    else:
        print("Unsupported encoder type %s" % (opt.encoder_type))

    decoder = onmt.Models.Decoder(opt, dicts['tgt'])

    if opt.copy_attn:
        generator = onmt.modules.CopyGenerator(opt, dicts['src'], dicts['tgt'])
    else:
        generator = nn.Sequential(
            nn.Linear(opt.rnn_size, dicts['tgt'].size()),
            nn.LogSoftmax())
        if opt.share_decoder_embeddings:
            generator[0].weight = decoder.embeddings.word_lut.weight

    model = onmt.Models.NMTModel(encoder, decoder, len(opt.gpus) > 1)

    if opt.train_from:
        print('Loading model from checkpoint at %s' % opt.train_from)
        chk_model = checkpoint['model']
        generator_state_dict = chk_model.generator.state_dict()
        # Strip generator parameters; the generator is restored separately.
        model_state_dict = {k: v for k, v in chk_model.state_dict().items()
                            if 'generator' not in k}
        model.load_state_dict(model_state_dict)
        generator.load_state_dict(generator_state_dict)
        opt.start_epoch = checkpoint['epoch'] + 1

    if opt.train_from_state_dict:
        print('Loading model from checkpoint at %s' % opt.train_from_state_dict)
        model.load_state_dict(checkpoint['model'])
        generator.load_state_dict(checkpoint['generator'])
        opt.start_epoch = checkpoint['epoch'] + 1

    # Model stays on CPU here; GPU placement is handled by the
    # MultiprocessingTrainer below.
    model.cpu()
    generator.cpu()

    model.generator = generator

    if not opt.train_from_state_dict and not opt.train_from:
        if opt.param_init != 0.0:
            print('Intializing params')
            for p in model.parameters():
                p.data.uniform_(-opt.param_init, opt.param_init)
        encoder.embeddings.load_pretrained_vectors(opt.pre_word_vecs_enc)
        decoder.embeddings.load_pretrained_vectors(opt.pre_word_vecs_dec)
        optim = onmt.Optim(
            opt.optim, opt.learning_rate, opt.max_grad_norm,
            lr_decay=opt.learning_rate_decay,
            start_decay_at=opt.start_decay_at,
            opt=opt
        )
    else:
        print('Loading optimizer from checkpoint:')
        optim = checkpoint['optim']
        print(optim)

    if opt.train_from or opt.train_from_state_dict:
        optim.optimizer.load_state_dict(
            checkpoint['optim'].optimizer.state_dict())

    print('Multi gpu training ', opt.gpus)
    trainer = MultiprocessingTrainer(opt, model, optim, device_ids=opt.gpus)

    nParams = sum([p.nelement() for p in model.parameters()])
    print('* number of parameters: %d' % nParams)
    # Per-component parameter counts (anything not encoder/decoder is listed).
    enc = 0
    dec = 0
    for name, param in model.named_parameters():
        if 'encoder' in name:
            enc += param.nelement()
        elif 'decoder' in name:
            dec += param.nelement()
        else:
            print(name, param.nelement())
    print('encoder: ', enc)
    print('decoder: ', dec)

    trainModel(trainer, trainData, validData, dataset)
# NOTE(review): the three lines below are the tail of report_func — its `def`
# line falls outside this chunk; indentation is reconstructed.
    report_stats.output(epoch, batch + 1, num_batches, start_time)
    # Reset the rolling statistics after each report.
    report_stats = onmt.Statistics()
    return report_stats


if __name__ == "__main__":
    # Parse only --config_file here; remaining args go to Configurable.
    argparser = argparse.ArgumentParser()
    argparser.add_argument('--config_file', default='default.cfg')
    args, extra_args = argparser.parse_known_args()
    opt = Configurable(args.config_file, extra_args)

    model = ADVModel(opt)
    optim = onmt.Optim(
        opt.optim, opt.learning_rate, opt.max_grad_norm,
        lr_decay=opt.learning_rate_decay,
        start_decay_at=opt.start_decay_at,
        beta1=opt.adam_beta1,
        beta2=opt.adam_beta2,
        adagrad_accum=opt.adagrad_accumulator_init,
        decay_method=opt.decay_method,
        warmup_steps=opt.warmup_steps,
        model_size=opt.rnn_size)
    optim.set_parameters(model.named_parameters())

    tgt_vocab = Vocab(opt.tgt_vocab)
    # Same loss compute is used for both train and validation.
    loss_compute = onmt.Loss.NMTLossCompute(model.generator, tgt_vocab).cuda()
    trainer = onmt.Trainer(model, loss_compute, loss_compute, optim)

    train_set = Data_Loader(opt.train_file, opt.batch_size)
    valid_set = Data_Loader(opt.dev_file, opt.batch_size)
    for epoch in xrange(opt.max_epoch):
        train_stats = trainer.train(train_set, epoch, report_func)
        print('Train perplexity: %g' % train_stats.ppl())
        print('Train accuracy: %g' % train_stats.accuracy())
def build_optim(model, checkpoint):
    """Create the optimizer for `model`, or restore it from `checkpoint`
    when resuming (opt.train_from set and opt.train_part is None).

    Restoring happens in two stages because optim.set_parameters() rebuilds
    the underlying torch optimizer with empty state. Relies on the
    module-level `opt` namespace.
    """
    saved_optimizer_state_dict = None

    if opt.train_from and opt.train_part is None:  # != "context":
        print('Loading optimizer from checkpoint.')
        optim = checkpoint['optim']
        # We need to save a copy of optim.optimizer.state_dict() for setting
        # the optimizer state later on in Stage 2 in this method, since
        # the method optim.set_parameters(model.parameters()) will overwrite
        # optim.optimizer, and with it the values stored in
        # optim.optimizer.state_dict()
        saved_optimizer_state_dict = optim.optimizer.state_dict()
    else:
        print('Making optimizer for training.')
        optim = onmt.Optim(
            opt.optim, opt.learning_rate, opt.max_grad_norm,
            lr_decay=opt.learning_rate_decay,
            start_decay_at=opt.start_decay_at,
            beta1=opt.adam_beta1,
            beta2=opt.adam_beta2,
            adagrad_accum=opt.adagrad_accumulator_init,
            decay_method=opt.decay_method,
            warmup_steps=opt.warmup_steps,
            model_size=opt.rnn_size)

    # Stage 1:
    # Essentially optim.set_parameters (re-)creates and optimizer using
    # model.paramters() as parameters that will be stored in the
    # optim.optimizer.param_groups field of the torch optimizer class.
    # Importantly, this method does not yet load the optimizer state, as
    # essentially it builds a new optimizer with empty optimizer state and
    # parameters from the model.
    optim.set_parameters(model.named_parameters())

    print("Stage 1: Keys after executing optim.set_parameters" +
          "(model.parameters())")
    show_optimizer_state(optim)

    if opt.train_from and opt.train_part is None:  # != "context":
        # Stage 2: In this stage, which is only performed when loading an
        # optimizer from a checkpoint, we load the saved_optimizer_state_dict
        # into the re-created optimizer, to set the optim.optimizer.state
        # field, which was previously empty. For this, we use the optimizer
        # state saved in the "saved_optimizer_state_dict" variable for
        # this purpose.
        # See also: https://github.com/pytorch/pytorch/issues/2830
        optim.optimizer.load_state_dict(saved_optimizer_state_dict)
        # Convert back the state values to cuda type if applicable
        if use_gpu(opt):
            for state in optim.optimizer.state.values():
                for k, v in state.items():
                    if torch.is_tensor(v):
                        state[k] = v.cuda()
        print(
            "Stage 2: Keys after executing optim.optimizer.load_state_dict" +
            "(saved_optimizer_state_dict)")
        show_optimizer_state(optim)

        # We want to make sure that indeed we have a non-empty optimizer state
        # when we loaded an existing model. This should be at least the case
        # for Adam, which saves "exp_avg" and "exp_avg_sq" state
        # (Exponential moving average of gradient and squared gradient values)
        if (optim.method == 'adam') and (len(optim.optimizer.state) < 1):
            raise RuntimeError(
                "Error: loaded Adam optimizer from existing model" +
                " but optimizer state is empty")

    return optim
def main():
    """Load data (HDF5 or torch-serialized), build or restore the model,
    load optional fertility supervision, and start training.

    Relies on the module-level `opt` namespace (parsed CLI options).
    NOTE(review): formatting reconstructed from a whitespace-mangled paste;
    the `'src_features' in dicts` block is placed at function level — confirm
    against the original file.
    """
    print("Loading data from '%s'" % opt.data)

    dict_checkpoint = (opt.train_from if opt.train_from
                       else opt.train_from_state_dict)

    if opt.data_type == 'h5':
        # HDF5-backed dataset with externally stored alignments.
        alignments = torch.load(opt.data_alignment)
        # alignments = None
        dicts = torch.load(opt.dict)['dicts']
        dataset = h5py.File(opt.data)
        trainData = onmt.Dataset_h5(
            dataset, 'train', opt.batch_size, opt.gpus,
            data_type="text",
            srcFeatures=None,
            tgtFeatures=None,
            alignment=alignments['train'] if alignments else None)
        validData = onmt.Dataset_h5(
            dataset, 'valid', opt.batch_size, opt.gpus,
            volatile=True,
            data_type="text",
            srcFeatures=None,
            tgtFeatures=None,
            alignment=alignments['valid'] if alignments else None)
        print(' ***************************************************')
        print(' *** vocabulary size. source = %d; target = %d' %
              (dicts['src'].size(), dicts['tgt'].size()))
        print(' *** number of training sentences. %d' %
              dataset['train_src_label'].shape[0])
        print(' *** maximum batch size. %d' % opt.batch_size)
        print(' *** maximum number of batch size. %d ' % len(trainData))
    else:
        # map_location keeps serialized tensors on CPU at load time.
        dataset = torch.load(opt.data,
                             map_location=lambda storage, loc: storage)
        if dict_checkpoint:
            print('Loading dicts from checkpoint at %s' % dict_checkpoint)
            checkpoint = torch.load(dict_checkpoint,
                                    map_location=lambda storage, loc: storage)
            dataset['dicts'] = checkpoint['dicts']
        trainData = onmt.Dataset(
            dataset['train']['src'], dataset['train']['tgt'],
            opt.batch_size, opt.gpus,
            data_type=dataset.get("type", "text"),
            srcFeatures=dataset['train'].get('src_features'),
            tgtFeatures=dataset['train'].get('tgt_features'),
            alignment=dataset['train'].get('alignments'))
        validData = onmt.Dataset(
            dataset['valid']['src'], dataset['valid']['tgt'],
            opt.batch_size, opt.gpus,
            volatile=True,
            data_type=dataset.get("type", "text"),
            srcFeatures=dataset['valid'].get('src_features'),
            tgtFeatures=dataset['valid'].get('tgt_features'),
            alignment=dataset['valid'].get('alignments'))
        dicts = dataset['dicts']
        print(' ***************************************************')
        print(' *** vocabulary size. source = %d; target = %d' %
              (dicts['src'].size(), dicts['tgt'].size()))
        print(' *** number of training sentences. %d' %
              len(dataset['train']['src']))
        print(' *** maximum batch size. %d' % opt.batch_size)

    if 'src_features' in dicts:
        for j in range(len(dicts['src_features'])):
            print(' * src feature %d size = %d' %
                  (j, dicts['src_features'][j].size()))

    print('Building model...')

    if opt.encoder_type == "text":
        encoder = onmt.Models.Encoder(opt, dicts['src'],
                                      dicts.get('src_features', None))
    elif opt.encoder_type == "img":
        encoder = onmt.modules.ImageEncoder(opt)
        assert ("type" not in dataset or dataset["type"] == "img")
    else:
        print("Unsupported encoder type %s" % (opt.encoder_type))

    decoder = onmt.Models.Decoder(opt, dicts['tgt'])
    model = onmt.Models.NMTModel(encoder, decoder, len(opt.gpus) > 1)

    if opt.copy_attn:
        generator = onmt.modules.CopyGenerator(opt, dicts['src'], dicts['tgt'])
    else:
        generator = nn.Sequential(
            nn.Linear(opt.rnn_size, dicts['tgt'].size()),
            nn.LogSoftmax())
        if opt.share_decoder_embeddings:
            generator[0].weight = decoder.embeddings.word_lut.weight

    if opt.train_from:
        print('Loading model from checkpoint at %s' % opt.train_from)
        model_state_dict = checkpoint['model']
        generator_state_dict = checkpoint['generator']
        model.load_state_dict(model_state_dict)
        generator.load_state_dict(generator_state_dict)
        opt.start_epoch = checkpoint['epoch'] + 1

    if opt.train_from_state_dict:
        print('Loading model from checkpoint at %s' % opt.train_from_state_dict)
        model.load_state_dict(checkpoint['model'])
        generator.load_state_dict(checkpoint['generator'])
        opt.start_epoch = checkpoint['epoch'] + 1

    if len(opt.gpus) >= 1:
        model.cuda()
        generator.cuda()
    else:
        model.cpu()
        generator.cpu()

    if len(opt.gpus) > 1:
        print('Multi gpu training ', opt.gpus)
        # dim=1: batches are time-major, so scatter across the batch dim.
        model = nn.DataParallel(model, device_ids=opt.gpus, dim=1)
        generator = nn.DataParallel(generator, device_ids=opt.gpus, dim=0)

    model.generator = generator

    if not opt.train_from_state_dict and not opt.train_from:
        if opt.param_init != 0.0:
            print('Intializing params')
            for p in model.parameters():
                p.data.uniform_(-opt.param_init, opt.param_init)
                # if p.data.dim()>1:
                #     init.xavier_uniform(p.data)
                # else:
                #     init.constant(p.data, 0.0)
        encoder.embeddings.load_pretrained_vectors(opt.pre_word_vecs_enc)
        decoder.embeddings.load_pretrained_vectors(opt.pre_word_vecs_dec)
        optim = onmt.Optim(opt.optim, opt.learning_rate, opt.max_grad_norm,
                           lr_decay=opt.learning_rate_decay,
                           start_decay_at=opt.start_decay_at,
                           opt=opt)
    else:
        print('Loading optimizer from checkpoint:')
        optim = checkpoint['optim']
        print(optim)

    optim.set_parameters(model.parameters())

    if opt.guided_fertility:
        # Fertility priors derived from external word alignments.
        print('Getting fertilities from external alignments..')
        fert_dict = evaluation.get_fert_dict(opt.guided_fertility,
                                             opt.guided_fertility_source_file,
                                             dicts["src"])
    else:
        fert_dict = None
    if opt.supervised_fertility:
        print("Retrieving fertilities for all training sentences...")
        fert_sents = evaluation.get_fertility(
            opt.supervised_fertility,
            opt.supervised_fertility_source_file,
            dicts["src"])
    else:
        fert_sents = None

    if opt.train_from or opt.train_from_state_dict:
        # Restore optimizer internals after set_parameters rebuilt them.
        optim.optimizer.load_state_dict(
            checkpoint['optim'].optimizer.state_dict())

    nParams = sum([p.nelement() for p in model.parameters()])
    print('* number of parameters: %d' % nParams)

    trainModel(model, trainData, validData, dataset, dicts, optim,
               fert_dict, fert_sents)
def __init__(self, model, lat_dis, loss_function, train_data, valid_data,
             dicts, opt, setup_optimizer=True):
    """Adversarial autoencoder trainer setup: places the autoencoder and the
    latent discriminator (plus their losses) on GPU, then builds one
    optimizer per model and initializes both with apex amp.

    :param model: the autoencoder.
    :param lat_dis: the latent discriminator.
    :param loss_function: (autoencoder loss, latent-discriminator loss) pair.
    NOTE(review): formatting reconstructed from a whitespace-mangled paste.
    """
    self.train_data = train_data
    self.valid_data = valid_data

    self.dicts = dicts
    self.opt = opt
    self.cuda = (len(opt.gpus) >= 1 and opt.gpus[0] >= 0)

    self.start_time = 0
    self.n_gpus = len(self.opt.gpus)

    self.loss_function_ae, self.loss_lat_dis = loss_function
    self.model_ae = model
    self.lat_dis = lat_dis

    if self.cuda:
        torch.cuda.set_device(self.opt.gpus[0])
        if self.opt.seed >= 0:
            torch.manual_seed(self.opt.seed)
        self.loss_function_ae = self.loss_function_ae.cuda()
        self.model_ae = self.model_ae.cuda()
        self.lat_dis = self.lat_dis.cuda()
        self.loss_lat_dis = self.loss_lat_dis.cuda()

    if setup_optimizer:
        self.optim_ae = onmt.Optim(opt)
        self.optim_ae.set_parameters(self.model_ae.parameters())

        # The discriminator gets its own options copy with beta1=0.5
        # (a common choice for adversarial training).
        lat_opt = copy.deepcopy(opt)
        lat_opt.beta1 = 0.5
        # lat_opt.learning_rate = 0.0002
        # lat_opt.update_method = 'regular'
        self.optim_lat_dis = onmt.Optim(lat_opt)
        self.optim_lat_dis.set_parameters(self.lat_dis.parameters())

        # Map the fp16 flags onto apex-amp optimization levels.
        if not self.opt.fp16:
            opt_level = "O0"
            keep_batchnorm_fp32 = False
        elif self.opt.fp16_mixed:
            opt_level = "O1"
            keep_batchnorm_fp32 = None
        else:
            opt_level = "O2"
            keep_batchnorm_fp32 = False

        if self.cuda:
            # print(234)
            self.model_ae, self.optim_ae.optimizer = amp.initialize(
                self.model_ae, self.optim_ae.optimizer,
                opt_level=opt_level,
                keep_batchnorm_fp32=keep_batchnorm_fp32,
                loss_scale="dynamic",
                verbosity=1 if self.opt.verbose else 0)
            self.lat_dis, self.optim_lat_dis.optimizer = amp.initialize(
                self.lat_dis, self.optim_lat_dis.optimizer,
                opt_level=opt_level,
                keep_batchnorm_fp32=keep_batchnorm_fp32,
                loss_scale="dynamic",
                verbosity=1 if self.opt.verbose else 0)
# NOTE(review): script-level fragment — argument parsing and the surrounding
# code lie outside this chunk; indentation is reconstructed.
args.cuda = not args.no_cuda and torch.cuda.is_available()

# Vocabulary size = number of lines in the vocab file.
with open(args.vocab_file) as f:
    args.vocab_size = len(f.readlines())

torch.manual_seed(args.seed)
if args.cuda:
    torch.cuda.manual_seed(args.seed)

model = MultiChoiceQAModel(args)
if args.cuda:
    model = model.cuda()

optimizer = onmt.Optim(args.optim, args.lr, args.max_grad_norm,
                       lr_decay=args.lr_decay,
                       start_decay_at=args.start_decay_at)
optimizer.set_parameters(model.parameters())

# Concatenate every subtitle file into a single training stream.
datasets = []
for f in glob.glob(
        "/data/users/iLikeNLP/AIContest/ChatBotCourse/subtitle/preprocess/chinese/*.srt"):
    datasets.append(GenLCData(f, args.vocab_file))
train_loader = ConcatData(datasets)
# train_loader = LocallyShuffleData(train_loader, args.batch_size*100)
train_loader = BatchData(train_loader, args.batch_size)
train_data = train_loader.get_data()

valid_loader = GenLCData(args.valid_file, args.vocab_file)
valid_loader = BatchData(valid_loader, args.batch_size)