def __init__(self, bert_model, dataset, mapping, discriminator, args, bert_model1=None):
    """Initialize the adversarial alignment trainer.

    Args:
        bert_model: primary BERT encoder.
        dataset: training dataset; only used when ``args.adversarial`` is set.
        mapping: mapping module whose parameters the map optimizer trains.
        discriminator: adversarial discriminator, or None when no
            ``dis_optimizer`` is configured on ``args``.
        args: run configuration (adversarial, batch_size, local_rank,
            no_cuda, and optionally map_optimizer / dis_optimizer specs).
        bert_model1: optional second BERT encoder.
    """
    self.bert_model = bert_model
    self.bert_model1 = bert_model1
    if args.adversarial:
        # Adversarial training draws random batches from the dataset.
        self.dataset = dataset
        sampler = RandomSampler(dataset)
        self.dataloader = DataLoader(dataset, sampler=sampler,
                                     batch_size=args.batch_size)
        # FIX: torch.utils.data.dataloader._DataLoaderIter is a private class
        # that was removed in PyTorch >= 1.2; iter() on the DataLoader returns
        # the equivalent iterator object.
        self.iter_loader = iter(self.dataloader)
    self.mapping = mapping
    self.discriminator = discriminator
    self.args = args
    # Device selection: single-process runs (local_rank == -1) pick CUDA when
    # available unless disabled; distributed runs bind to their local rank.
    if self.args.local_rank == -1 or self.args.no_cuda:
        self.device = torch.device(
            "cuda" if torch.cuda.is_available() and not self.args.no_cuda else "cpu")
    else:
        self.device = torch.device("cuda", self.args.local_rank)
    # optimizers
    if hasattr(args, 'map_optimizer'):
        optim_fn, optim_args = get_optimizer(args.map_optimizer)
        self.map_optimizer = optim_fn(mapping.parameters(), **optim_args)
    if hasattr(args, 'dis_optimizer'):
        optim_fn, optim_args = get_optimizer(args.dis_optimizer)
        self.dis_optimizer = optim_fn(discriminator.parameters(), **optim_args)
    else:
        # No discriminator optimizer configured -> no discriminator expected.
        assert discriminator is None
    # best validation score
    self.best_valid_metric = -1e12
    self.decrease_lr = False
    self.decrease_dis_lr = False
def __init__(self, params, inputdim, nclasses, l2reg=0., batch_size=64,
             seed=1111, cudaEfficient=False):
    """MLP / logistic-regression classifier head (CUDA required).

    PARAMETERS:
    -nhid:       number of hidden units (0: Logistic Regression)
    -optim:      optimizer ("sgd,lr=0.1", "adam", "rmsprop" ..)
    -tenacity:   how many times dev acc does not increase before stopping
    -epoch_size: each epoch corresponds to epoch_size pass on the train set
    -max_epoch:  max number of epoches
    -dropout:    dropout for MLP
    """
    # FIX: zero-argument super() instead of super(self.__class__, self),
    # which recurses infinitely if this class is ever subclassed.
    super().__init__(inputdim, nclasses, l2reg, batch_size, seed, cudaEfficient)
    # Hyper-parameters, with defaults for any key missing from `params`.
    self.nhid = 0 if "nhid" not in params else params["nhid"]
    self.optim = "adam" if "optim" not in params else params["optim"]
    self.tenacity = 5 if "tenacity" not in params else params["tenacity"]
    self.epoch_size = 4 if "epoch_size" not in params else params["epoch_size"]
    self.max_epoch = 200 if "max_epoch" not in params else params["max_epoch"]
    self.dropout = 0. if "dropout" not in params else params["dropout"]
    # NOTE: overrides the `batch_size` constructor argument when present.
    self.batch_size = 64 if "batch_size" not in params else params["batch_size"]
    # FIX: use self.nhid (already defaulted above) instead of params["nhid"],
    # which raised KeyError whenever "nhid" was absent from params.
    if self.nhid == 0:
        # Plain logistic regression.
        self.model = nn.Sequential(
            nn.Linear(self.inputdim, self.nclasses),
        ).cuda()
    else:
        # Single-hidden-layer MLP with sigmoid activation and dropout.
        self.model = nn.Sequential(
            nn.Linear(self.inputdim, self.nhid),
            nn.Dropout(p=self.dropout),
            nn.Sigmoid(),
            nn.Linear(self.nhid, self.nclasses),
        ).cuda()
    self.loss_fn = nn.CrossEntropyLoss().cuda()
    # NOTE(review): `size_average` is a deprecated loss attribute in modern
    # PyTorch; kept for backward compatibility with the original behavior
    # (sum rather than mean reduction on older versions).
    self.loss_fn.size_average = False
    optim_fn, optim_params = utils.get_optimizer(self.optim)
    self.optimizer = optim_fn(self.model.parameters(), **optim_params)
    self.optimizer.param_groups[0]['weight_decay'] = self.l2reg
def __init__(self, encoder, decoder, data, test_data, params, num_updates):
    """Hold the encoder/decoder pair, its optimizer, and training counters."""
    self.encoder = encoder
    self.decoder = decoder
    self.data = data
    self.test_data = test_data
    self.params = params
    # Every trainable weight from both halves of the seq2seq model.
    self.enc_dec_params = [*self.encoder.parameters(), *self.decoder.parameters()]
    # optimizer, resumed at the given update count
    self.optimizer = get_optimizer(self.enc_dec_params, self.params.optim)
    self.optimizer._num_updates = num_updates
    # training statistics
    self.epoch = getattr(params, 'now_epoch', 0)
    self.n_iter = 0
    self.oom = 0
    self.n_sentences = 0
    self.stats = {'processed_s': 0, 'processed_w': 0, 'loss': []}
    self.sample_sizes = []
def __init__(self, train_dataset, val_dataset, test_dataset, model, hyper_dict,
             experiment_name, device, cross_validation=False):
    """Bundle datasets, model, loaders, and metric meters for one experiment."""
    self.train_dataset = train_dataset
    self.val_dataset = val_dataset
    self.test_dataset = test_dataset
    self.handler = LockableModelSaveHandler(self)
    self.model = model
    self.best_model = copy.deepcopy(model)
    self.best_val_loss = None
    self.epochs = hyper_dict['epochs']
    self.batch_size = hyper_dict['batch_size']
    self.num_workers = hyper_dict['num_workers']
    self.hyper_dict = hyper_dict
    self.experiment_name = experiment_name
    self.device = device
    self.cross_validation = cross_validation
    # One running average and one tag string per tracked quantity.
    keys = ["time"] + [f"{split}_{metric}"
                       for split in ('train', 'val', 'test')
                       for metric in ('loss', 'acc')]
    self.avg_meter = {k: AverageMeter() for k in keys}
    self.tag_str = {k: "" for k in keys}

    def _loader(dataset, shuffle):
        # Shared DataLoader settings; only shuffling differs per split.
        return DataLoader(dataset, batch_size=self.batch_size,
                          num_workers=self.num_workers, shuffle=shuffle)

    self.train_ldr = _loader(train_dataset, True)
    self.val_ldr = _loader(val_dataset, False)
    self.test_ldr = _loader(test_dataset, False)
    self.optimizer = get_optimizer(model.parameters(), hyper_dict)
    # state variables
    self.current_iter = 0
def __init__(self, bert_model, mapping, args, bert_model1=None, trans_types=None):
    """Initialize trainer script.

    Args:
        bert_model: BERT encoder; directly optimized when
            ``args.map_type == 'fine_tune'``.
        mapping: mapping module optimized otherwise.
        args: run configuration (local_rank, no_cuda, map_type, and
            optionally a map_optimizer spec).
        bert_model1: optional second BERT encoder.
        trans_types: transformer mapping variants to consider; defaults to
            the four built-in types when None.
    """
    # FIX: the original used a mutable list as the default for `trans_types`,
    # which is shared across all calls; use a None sentinel instead.
    if trans_types is None:
        trans_types = [
            'self_attention', 'attention', 'linear_self_attention',
            'nonlinear_self_attention'
        ]
    self.transformer_types = trans_types
    self.args = args
    self.bert_model = bert_model
    self.bert_model1 = bert_model1
    self.mapping = mapping
    # Device selection: single-process runs pick CUDA when available unless
    # disabled; distributed runs bind to their local rank.
    if self.args.local_rank == -1 or self.args.no_cuda:
        self.device = torch.device(
            "cuda" if torch.cuda.is_available() and not self.args.no_cuda else "cpu")
    else:
        self.device = torch.device("cuda", self.args.local_rank)
    # optimizers
    if hasattr(args, 'map_optimizer'):
        optim_fn, optim_args = get_optimizer(args.map_optimizer)
        if self.args.map_type == 'fine_tune':
            # Fine-tuning trains the BERT encoder itself rather than a
            # separate mapping module.
            self.map_optimizer = optim_fn(bert_model.parameters(), **optim_args)
        else:
            self.map_optimizer = optim_fn(mapping.parameters(), **optim_args)
    # best validation score
    self.best_valid_metric = -1e12
    self.decrease_lr = False
def pretrain(cfg):
    """Pretrain the NAO controller with reconstruction + metric losses."""
    print(cfg.pretty())
    pretrain_config_validator(cfg)
    fix_seed(cfg.seed)

    # Controller network, initialized from a pretrained checkpoint on GPU 0.
    controller = load_pretrained_weights(
        NAO(**cfg.controller).to(0), cfg.pretrained_model_path)
    models = {'trunk': controller}
    dataset = get_dataset(seed=cfg.seed, **cfg.dataset)
    trunk_optimizer = get_optimizer(parameters=models['trunk'].parameters(),
                                    **cfg.optimizer)
    optimizers = {'trunk_optimizer': trunk_optimizer}
    lr_schedulers = {
        'trunk_scheduler_by_iteration':
            get_scheduler(optimizer=trunk_optimizer, **cfg.scheduler)
    }
    loss_funcs = {
        'reconstruction_loss': torch.nn.NLLLoss(),
        'metric_loss': get_loss(**cfg.loss),
    }
    mining_funcs = {"tuple_miner": get_miner(**cfg.miner)}
    # One UMAP projector per configured visualizer.
    visualizers = [umap.UMAP(**params) for params in cfg.visualizers]

    trainer = get_trainer(
        models=models,
        optimizers=optimizers,
        lr_schedulers=lr_schedulers,
        loss_funcs=loss_funcs,
        mining_funcs=mining_funcs,
        dataset=dataset,
        end_of_iteration_hook=TensorboardHook(visualizers).end_of_iteration_hook,
        end_of_epoch_hook=ModelSaverHook().end_of_epoch_hook,
        **cfg.trainer,
    )
    trainer.train()
def train(cfg):
    """Train the NAO controller, logging progress to TensorBoard."""
    print(cfg.pretty())
    train_config_validator(cfg)
    fix_seed(cfg.seed)

    writer = SummaryWriter(log_dir='logs')
    controller = load_pretrained_weights(
        NAO(**cfg.controller).to(0), cfg.pretrained_model_path)
    dataset = get_dataset(writer=writer, seed=cfg.seed, **cfg.dataset)
    # Optionally freeze the encoder/decoder and optimize the rest only.
    target_params = _get_target_parameters(controller, cfg.freeze_encoder_decoder)
    optimizer = get_optimizer(parameters=target_params, **cfg.optimizer)
    lr_scheduler = get_scheduler(optimizer=optimizer, **cfg.scheduler)

    trainer = get_trainer(
        controller=controller,
        dataset=dataset,
        optimizer=optimizer,
        lr_scheduler=lr_scheduler,
        writer=writer,
        end_of_epoch_hook=ModelSaverHook().end_of_epoch_hook,
        **cfg.trainer,
    )
    trainer.train()
def setup_data_and_model(params, model):
    """Prepare logging, data, optional checkpoint reload, optimizer and LR
    scheduler for training; returns a large tuple consumed positionally by
    the training loop.
    """
    # Variables that may not otherwise be assigned
    writer = perturbation_loader = generator = training_smiles = None
    # setup random seeds
    if params.val_seed is None:
        params.val_seed = params.seed
    set_seed_if(params.seed)
    exp_path = os.path.join(params.dump_path, params.exp_name)
    # create exp path if it doesn't exist
    if not os.path.exists(exp_path):
        os.makedirs(exp_path)
    # create logger
    logger = create_logger(os.path.join(exp_path, 'train.log'), 0)
    pp = pprint.PrettyPrinter()
    logger.info("============ Initialized logger ============")
    logger.info("Random seed is {}".format(params.seed))
    if params.suppress_params is False:
        # Dump the full parameter dict unless suppressed.
        logger.info("\n".join("%s: %s" % (k, str(v))
                              for k, v in sorted(dict(vars(params)).items())))
    logger.info("Running command: %s" % 'python ' + ' '.join(sys.argv))
    logger.info("The experiment will be stored in %s\n" % exp_path)
    logger.info("")
    # load data
    train_data, val_dataset, train_loader, val_loader = load_graph_data(params)
    logger.info('train_loader len is {}'.format(len(train_loader)))
    logger.info('val_loader len is {}'.format(len(val_loader)))
    # Optionally initialize property embeddings from pretrained vectors
    # (note the transpose: the file stores them property-major).
    if params.num_binary_graph_properties > 0 and params.pretrained_property_embeddings_path:
        model.binary_graph_property_embedding_layer.weight.data = \
            torch.Tensor(np.load(params.pretrained_property_embeddings_path).T)
    # Pick which checkpoint (if any) to resume from.
    if params.load_latest is True:
        load_prefix = 'latest'
    elif params.load_best is True:
        load_prefix = 'best'
    else:
        load_prefix = None
    if load_prefix is not None:
        if params.local_cpu is True:
            model.load_state_dict(torch.load(
                os.path.join(exp_path, '{}_model'.format(load_prefix)),
                map_location='cpu'))
        else:
            model.load_state_dict(torch.load(
                os.path.join(exp_path, '{}_model'.format(load_prefix))))
    if params.local_cpu is False:
        model = model.cuda()
    # Optional sample generator plus the training split of SMILES strings
    # used to evaluate the generated samples.
    if params.gen_num_samples > 0:
        generator = GraphGenerator(train_data, model, params.gen_random_init,
                                   params.gen_num_iters,
                                   params.gen_predict_deterministically,
                                   params.local_cpu)
        with open(params.smiles_path) as f:
            smiles = f.read().split('\n')
        training_smiles = smiles[:int(params.smiles_train_split * len(smiles))]
        del smiles
    opt = get_optimizer(model.parameters(), params.optimizer)
    # Resume the optimizer state alongside the model weights.
    if load_prefix is not None:
        opt.load_state_dict(torch.load(
            os.path.join(exp_path, '{}_opt_sd'.format(load_prefix))))
    lr = opt.param_groups[0]['lr']
    # Per-iteration LR multiplier combining warm-up and stepwise decay.
    lr_lambda = lambda iteration: lr_decay_multiplier(
        iteration, params.warm_up_iters, params.decay_start_iter,
        params.lr_decay_amount, params.lr_decay_frac,
        params.lr_decay_interval, params.min_lr, lr)
    scheduler = LambdaLR(opt, lr_lambda)
    index_method = get_index_method()
    best_loss = 9999
    if params.tensorboard:
        from tensorboardX import SummaryWriter
        writer = SummaryWriter(exp_path)
    total_iter, grad_accum_iters = params.first_iter, 0
    return params, model, opt, scheduler, train_data, train_loader, val_dataset, val_loader, perturbation_loader,\
        generator, index_method, exp_path, training_smiles, pp, logger, writer, best_loss, total_iter,\
        grad_accum_iters
def main(params):
    """Train an autoregressive Transformer language model on SMILES strings,
    evaluating perplexity/accuracy each epoch and scoring generated samples
    with guacamol-style validity/uniqueness/KL-divergence benchmarks.
    """
    # setup random seeds
    set_seed(params.seed)
    params.ar = True
    exp_path = os.path.join(params.dump_path, params.exp_name)
    # create exp path if it doesn't exist
    if not os.path.exists(exp_path):
        os.makedirs(exp_path)
    # create logger
    logger = create_logger(os.path.join(exp_path, 'train.log'), 0)
    logger.info("============ Initialized logger ============")
    logger.info("Random seed is {}".format(params.seed))
    logger.info("\n".join("%s: %s" % (k, str(v))
                          for k, v in sorted(dict(vars(params)).items())))
    logger.info("The experiment will be stored in %s\n" % exp_path)
    logger.info("Running command: %s" % 'python ' + ' '.join(sys.argv))
    logger.info("")
    # load data
    data, loader = load_smiles_data(params)
    # Reference SMILES set for the KL-divergence benchmark.
    if params.data_type == 'ChEMBL':
        all_smiles_mols = open(os.path.join(params.data_path, 'guacamol_v1_all.smiles'), 'r').readlines()
    else:
        all_smiles_mols = open(os.path.join(params.data_path, 'QM9_all.smiles'), 'r').readlines()
    train_data, val_data = data['train'], data['valid']
    dico = data['dico']
    logger.info('train_data len is {}'.format(len(train_data)))
    logger.info('val_data len is {}'.format(len(val_data)))

    # keep cycling through train_loader forever
    # stop when max iters is reached
    def rcycle(iterable):
        saved = []  # In-memory cache
        for element in iterable:
            yield element
            saved.append(element)
        while saved:
            random.shuffle(saved)  # Shuffle every batch
            for element in saved:
                yield element

    train_loader = rcycle(train_data.get_iterator(shuffle=True,
                                                  group_by_size=True,
                                                  n_sentences=-1))
    # extra param names for transformermodel
    params.n_langs = 1
    # build Transformer model (decoder-only, with output projection)
    model = TransformerModel(params, is_encoder=False, with_output=True)
    if params.local_cpu is False:
        model = model.cuda()
    opt = get_optimizer(model.parameters(), params.optimizer)
    # FIX: np.float was deprecated in NumPy 1.20 and removed in 1.24;
    # the builtin float is equivalent here.
    scores = {'ppl': float('inf'), 'acc': 0}
    # FIX: initialize reloaded_iter so the offset logic below can never hit a
    # NameError, and guard both sites with the same truthiness test (the
    # original mixed `if params.load_path:` with `is not None`).
    reloaded_iter = 0
    if params.load_path:
        reloaded_iter, scores = load_model(params, model, opt, logger)

    for total_iter, train_batch in enumerate(train_loader):
        if params.load_path:
            # Continue counting from the reloaded checkpoint's iteration.
            total_iter += reloaded_iter + 1
        epoch = total_iter // params.epoch_size
        if total_iter == params.max_steps:
            logger.info("============ Done training ... ============")
            break
        elif total_iter % params.epoch_size == 0:
            logger.info("============ Starting epoch %i ... ============" % epoch)

        # --- one training step ---
        model.train()
        opt.zero_grad()
        train_loss = calculate_loss(model, train_batch, params)
        train_loss.backward()
        if params.clip_grad_norm > 0:
            clip_grad_norm_(model.parameters(), params.clip_grad_norm)
        opt.step()
        if total_iter % params.print_after == 0:
            logger.info("Step {} ; Loss = {}".format(total_iter, train_loss))

        # --- end-of-epoch evaluation + sample generation ---
        if total_iter > 0 and total_iter % params.epoch_size == (params.epoch_size - 1):
            # run eval step (calculate validation loss)
            model.eval()
            n_chars = 0
            xe_loss = 0
            n_valid = 0
            logger.info("============ Evaluating ... ============")
            val_loader = val_data.get_iterator(shuffle=True)
            for val_iter, val_batch in enumerate(val_loader):
                with torch.no_grad():
                    val_scores, val_loss, val_y = calculate_loss(
                        model, val_batch, params, get_scores=True)
                # update stats
                n_chars += val_y.size(0)
                xe_loss += val_loss.item() * len(val_y)
                n_valid += (val_scores.max(1)[1] == val_y).sum().item()
            ppl = np.exp(xe_loss / n_chars)
            acc = 100. * n_valid / n_chars
            logger.info("Acc={}, PPL={}".format(acc, ppl))
            if acc > scores['acc']:
                scores['acc'] = acc
                scores['ppl'] = ppl
                save_model(params, data, model, opt, dico, logger,
                           'best_model', epoch, total_iter, scores)
                logger.info('Saving new best_model {}'.format(epoch))
            logger.info("Best Acc={}, PPL={}".format(scores['acc'], scores['ppl']))

            logger.info("============ Generating ... ============")
            number_samples = 100
            gen_smiles = generate_smiles(params, model, dico, number_samples)
            generator = ARMockGenerator(gen_smiles)
            # FIX: narrowed the original bare `except:` clauses — a failing
            # benchmark is still reported as -1, but KeyboardInterrupt and
            # SystemExit are no longer swallowed.
            try:
                benchmark = ValidityBenchmark(number_samples=number_samples)
                validity_score = benchmark.assess_model(generator).score
            except Exception:
                validity_score = -1
            try:
                benchmark = UniquenessBenchmark(number_samples=number_samples)
                uniqueness_score = benchmark.assess_model(generator).score
            except Exception:
                uniqueness_score = -1
            try:
                benchmark = KLDivBenchmark(number_samples=number_samples,
                                           training_set=all_smiles_mols)
                kldiv_score = benchmark.assess_model(generator).score
            except Exception:
                kldiv_score = -1
            logger.info('Validity Score={}, Uniqueness Score={}, KlDiv Score={}'.format(
                validity_score, uniqueness_score, kldiv_score))
            save_model(params, data, model, opt, dico, logger, 'model',
                       epoch, total_iter, {'ppl': ppl, 'acc': acc})
assert len(params.name.strip()) > 0 assert not params.reload or os.path.isfile(params.reload) # initialize experiment / load dataset logger = initialize_exp(params) data, attributes = load_images(params) train_data = DataSampler(data[0], attributes[0], params) valid_data = DataSampler(data[1], attributes[1], params) test_data = DataSampler(data[2], attributes[2], params) # build the model / reload / optimizer classifier = Classifier(params).cuda() if params.reload: reload_model(classifier, params.reload, ['img_sz', 'img_fm', 'init_fm', 'hid_dim', 'attr', 'n_attr']) optimizer = get_optimizer(classifier, params.optimizer) def save_model(name): """ Save the model. """ path = os.path.join(params.dump_path, '%s.pth' % name) logger.info('Saving the classifier to %s ...' % path) torch.save(classifier, path) # best accuracy best_accu = -1e12 for n_epoch in range(params.n_epochs):
def train(opt):
    """Train a Deeplab segmentation model on a VOC-style dataset with
    multi-scale losses, periodic evaluation, TensorBoard logging, checkpoint
    saving, and loss-based early stopping.
    """
    # Fixed seed for reproducibility (GPU seed when CUDA is available).
    if torch.cuda.is_available():
        torch.cuda.manual_seed(123)
    else:
        torch.manual_seed(123)
    training_params = {
        "batch_size": opt.batch_size,
        "shuffle": True,
        "drop_last": True,
        "collate_fn": custom_collate_fn
    }
    test_params = {
        "batch_size": opt.batch_size,
        "shuffle": False,
        "drop_last": False,
        "collate_fn": custom_collate_fn
    }
    training_set = VOCDataset(opt.data_path, opt.dataset, opt.image_size)
    training_generator = DataLoader(training_set, **training_params)
    test_set = VOCDataset(opt.data_path, opt.dataset, opt.image_size, is_training=False)
    test_generator = DataLoader(test_set, **test_params)
    # +1 output channel for the background class.
    model = Deeplab(num_classes=training_set.num_classes + 1)
    #model.load_state_dict(torch.load(opt.pre_trained_model))
    log_path = os.path.join(opt.log_path, "{}".format(opt.dataset))
    # Start each run with a clean TensorBoard log directory.
    if os.path.isdir(log_path):
        shutil.rmtree(log_path)
    #os.makedirs(log_path)
    writer = SummaryWriter(log_path)
    writer.add_graph(
        model, torch.rand(opt.batch_size, 3, opt.image_size, opt.image_size))
    if torch.cuda.is_available():
        model.cuda()
    best_loss = 1e10
    best_epoch = 0
    model.train()
    num_iter_per_epoch = len(training_generator)
    for epoch in range(opt.num_epoches):
        for iter, batch in enumerate(training_generator):
            current_step = epoch * num_iter_per_epoch + iter
            # Polynomial-style LR schedule: the optimizer is rebuilt every
            # iteration with the freshly computed learning rate.
            current_lr = update_lr(opt.lr, current_step,
                                   num_iter_per_epoch * opt.num_epoches)
            optimizer = get_optimizer(model, current_lr, opt.momentum, opt.decay)
            if torch.cuda.is_available():
                batch = [torch.Tensor(record).cuda() for record in batch]
            else:
                batch = [torch.Tensor(record) for record in batch]
            image, gt1, gt2 = batch
            gt1 = gt1.long()
            gt2 = gt2.long()
            optimizer.zero_grad()
            results = model(image)
            # Four loss terms (one per output scale / merged map); index 4 is
            # the total loss that is actually backpropagated.
            mul_losses = multiple_losses(results, [gt1, gt1, gt2, gt1])
            mul_losses[4].backward()
            optimizer.step()
            print(
                "Epoch: {}/{}, Iteration: {}/{}, Lr: {}, Loss: {:.2f} (1xloss: {:.2f} 0.75xloss: {:.2f} 0.5xloss: {:.2f} Max_merged_loss: {:.2f})"
                .format(epoch + 1, opt.num_epoches, iter + 1,
                        num_iter_per_epoch, optimizer.param_groups[0]['lr'],
                        mul_losses[4], mul_losses[0], mul_losses[1],
                        mul_losses[2], mul_losses[3]))
            writer.add_scalar('Train/Total_loss', mul_losses[4], current_step)
            writer.add_scalar('Train/1x_scale_loss', mul_losses[0], current_step)
            writer.add_scalar('Train/0.75x_scale_loss', mul_losses[1], current_step)
            writer.add_scalar('Train/0.5x_scale_loss', mul_losses[2], current_step)
            writer.add_scalar('Train/Max_merged_loss', mul_losses[3], current_step)
        # Periodic evaluation on the test split.
        if epoch % opt.test_interval == 0:
            model.eval()
            loss_ls = []
            loss_scale_1_ls = []
            loss_scale_2_ls = []
            loss_scale_3_ls = []
            loss_max_merged_ls = []
            for te_batch in test_generator:
                if torch.cuda.is_available():
                    te_batch = [
                        torch.Tensor(record).cuda() for record in te_batch
                    ]
                else:
                    te_batch = [torch.Tensor(record) for record in te_batch]
                te_image, te_gt1, te_gt2 = te_batch
                te_gt1 = te_gt1.long()
                te_gt2 = te_gt2.long()
                num_sample = len(te_gt1)
                with torch.no_grad():
                    te_results = model(te_image)
                    te_mul_losses = multiple_losses(
                        te_results, [te_gt1, te_gt1, te_gt2, te_gt1])
                # Weight each batch loss by its sample count so the final
                # division yields a per-sample average.
                loss_ls.append(te_mul_losses[4] * num_sample)
                loss_scale_1_ls.append(te_mul_losses[0] * num_sample)
                loss_scale_2_ls.append(te_mul_losses[1] * num_sample)
                loss_scale_3_ls.append(te_mul_losses[2] * num_sample)
                loss_max_merged_ls.append(te_mul_losses[3] * num_sample)
            te_loss = sum(loss_ls) / test_set.__len__()
            te_scale_1_loss = sum(loss_scale_1_ls) / test_set.__len__()
            te_scale_2_loss = sum(loss_scale_2_ls) / test_set.__len__()
            te_scale_3_loss = sum(loss_scale_3_ls) / test_set.__len__()
            te_max_merged_loss = sum(loss_max_merged_ls) / test_set.__len__()
            print(
                "Epoch: {}/{}, Lr: {}, Loss: {:.2f} (1xloss: {:.2f} 0.75xloss: {:.2f} 0.5xloss: {:.2f} Max_merged_loss: {:.2f})"
                .format(epoch + 1, opt.num_epoches,
                        optimizer.param_groups[0]['lr'], te_loss,
                        te_scale_1_loss, te_scale_2_loss, te_scale_3_loss,
                        te_max_merged_loss))
            writer.add_scalar('Test/Total_loss', te_loss, epoch)
            writer.add_scalar('Test/1x_scale_loss', te_scale_1_loss, epoch)
            writer.add_scalar('Test/0.75x_scale_loss', te_scale_2_loss, epoch)
            writer.add_scalar('Test/0.5x_scale_loss', te_scale_3_loss, epoch)
            writer.add_scalar('Test/Max_merged_loss', te_max_merged_loss, epoch)
            model.train()
            # Checkpoint when the test loss improves by at least es_min_delta.
            if te_loss + opt.es_min_delta < best_loss:
                best_loss = te_loss
                best_epoch = epoch
                torch.save(
                    model.state_dict(),
                    opt.saved_path + os.sep + "only_params_trained_deeplab_voc")
                torch.save(
                    model,
                    opt.saved_path + os.sep + "whole_model_trained_deeplab_voc")
            # Early stopping
            if epoch - best_epoch > opt.es_patience > 0:
                print(
                    "Stop training at epoch {}. The lowest loss achieved is {}"
                    .format(epoch, te_loss))
                break
    writer.close()
def main(params):
    """Optimize pixel-space marks on a set of images so their features (under
    a frozen marking network) move along a chosen carrier direction, then save
    the marked images as .npy files.
    """
    logger = initialize_exp(params)
    # Image list: either a comma-separated path string, or an index range
    # "start:end" into a saved list of paths.
    if params.img_list is None:
        params.img_paths = [s.strip() for s in params.img_paths.split(",")]
    else:
        assert ":" in params.img_paths
        chunks = params.img_paths.split(":")
        assert len(chunks) == 2
        n_start, n_end = int(chunks[0]), int(chunks[1])
        img_list = torch.load(params.img_list)
        params.img_paths = [img_list[i] for i in range(n_start, n_end)]
    print("Image paths", params.img_paths)
    # Build model / cuda
    ckpt = torch.load(params.marking_network)
    params.num_classes = ckpt["params"]["num_classes"]
    params.architecture = ckpt['params']['architecture']
    print("Building %s model ..." % params.architecture)
    model = build_model(params)
    model.cuda()
    # Strip any DataParallel "module." prefixes from the checkpoint keys.
    model.load_state_dict(
        {k.replace("module.", ""): v for k, v in ckpt['model'].items()},
        strict=False)
    model = model.eval()
    # Drop the classification head: the model now outputs features.
    model.fc = nn.Sequential()
    loader = default_loader
    transform = getImagenetTransform("none", img_size=params.img_size,
                                     crop_size=params.crop_size)
    img_orig = [transform(loader(p)).unsqueeze(0) for p in params.img_paths]
    # Loading carriers
    direction = torch.load(params.carrier_path).cuda()
    assert direction.dim() == 2
    # Keep only the carrier row for this id (still 2-D: shape (1, dim)).
    direction = direction[params.carrier_id:params.carrier_id + 1]
    rho = -1
    if params.angle is not None:
        # Cone-loss coefficient derived from the target angle.
        rho = 1 + np.tan(params.angle)**2
    img = [x.clone() for x in img_orig]
    # Load differentiable data augmentations
    center_da = CenterCrop(params.img_size, params.crop_size)
    random_da = RandomResizedCropFlip(params.crop_size)
    if params.data_augmentation == "center":
        data_augmentation = center_da
    elif params.data_augmentation == "random":
        data_augmentation = random_da
    # The images themselves are the optimized parameters.
    for i in range(len(img)):
        img[i].requires_grad = True
    optimizer, schedule = get_optimizer(img, params.optimizer)
    if schedule is not None:
        schedule = repeat_to(schedule, params.epochs)
    img_center = torch.cat(
        [center_da(x, 0).cuda(non_blocking=True) for x in img_orig], dim=0)
    # ft_orig = model(center_da(img_orig, 0).cuda(non_blocking=True)).detach()
    # Reference features of the unmodified images.
    ft_orig = model(img_center).detach()
    if params.angle is not None:
        # NOTE(review): hard-coded checkpoint path overriding ft_orig in
        # angle mode — presumably precomputed features for this setup.
        ft_orig = torch.load(
            "/checkpoint/asablayrolles/radioactive_data/imagenet_ckpt_2/features/valid_resnet18_center.pth"
        ).cuda()
    for iteration in range(params.epochs):
        if schedule is not None:
            lr = schedule[iteration]
            logger.info("New learning rate for %f" % lr)
            for param_group in optimizer.param_groups:
                param_group['lr'] = lr
        # Differentially augment images
        batch = []
        for x in img:
            aug_params = data_augmentation.sample_params(x)
            aug_img = data_augmentation(x, aug_params)
            batch.append(aug_img.cuda(non_blocking=True))
        batch = torch.cat(batch, dim=0)
        # Forward augmented images
        ft = model(batch)
        if params.angle is None:
            # Push features along the carrier direction while keeping the
            # feature displacement small (L2 penalty).
            loss_ft = -torch.sum((ft - ft_orig) * direction)
            loss_ft_l2 = params.lambda_ft_l2 * torch.norm(ft - ft_orig, dim=1).sum()
        else:
            dot_product = torch.sum((ft - ft_orig) * direction)
            print("Dot product: ", dot_product.item())
            if params.half_cone:
                # Signed quadratic keeps the optimization on one side of the cone.
                loss_ft = -rho * dot_product * torch.abs(dot_product)
            else:
                loss_ft = -rho * (dot_product**2)
            loss_ft_l2 = torch.norm(ft - ft_orig)**2
        # Pixel-space regularizer keeping marked images close to originals.
        loss_norm = 0
        for i in range(len(img)):
            loss_norm += params.lambda_l2_img * torch.norm(
                img[i].cuda(non_blocking=True) -
                img_orig[i].cuda(non_blocking=True))**2
        loss = loss_ft + loss_norm + loss_ft_l2
        optimizer.zero_grad()
        loss.backward()
        optimizer.step()
        logs = {
            "keyword": "iteration",
            "loss": loss.item(),
            "loss_ft": loss_ft.item(),
            "loss_norm": loss_norm.item(),
            "loss_ft_l2": loss_ft_l2.item(),
        }
        if params.angle is not None:
            logs["R"] = -(loss_ft + loss_ft_l2).item()
        if schedule is not None:
            logs["lr"] = schedule[iteration]
        logger.info("__log__:%s" % json.dumps(logs))
        # Project back into the L-inf ball around the original image, and
        # periodically round to valid pixel values.
        for i in range(len(img)):
            img[i].data[0] = project_linf(img[i].data[0], img_orig[i][0],
                                          params.radius)
            if iteration % 10 == 0:
                img[i].data[0] = roundPixel(img[i].data[0])
    img_new = [numpyPixel(x.data[0]).astype(np.float32) for x in img]
    img_old = [numpyPixel(x[0]).astype(np.float32) for x in img_orig]
    img_totest = torch.cat(
        [center_da(x, 0).cuda(non_blocking=True) for x in img])
    with torch.no_grad():
        ft_new = model(img_totest)
    # Final summary: PSNR of the marks plus feature-space statistics.
    logger.info("__log__:%s" % json.dumps({
        "keyword": "final",
        "psnr": np.mean(
            [psnr(x_new - x_old) for x_new, x_old in zip(img_new, img_old)]),
        "ft_direction": torch.mv(ft_new - ft_orig, direction[0]).mean().item(),
        "ft_norm": torch.norm(ft_new - ft_orig, dim=1).mean().item(),
        "rho": rho,
        "R": (rho * torch.dot(ft_new[0] - ft_orig[0], direction[0])**2 -
              torch.norm(ft_new - ft_orig)**2).item(),
    }))
    # Save the marked images next to the dump path as .npy arrays.
    for i in range(len(img)):
        img_name = basename(params.img_paths[i])
        extension = ".%s" % (img_name.split(".")[-1])
        np.save(
            join(params.dump_path, img_name).replace(extension, ".npy"),
            img_new[i].astype(np.uint8))
imv = mean0(train_dataset.y) if args.ensemble_models is None: model = Model(args, vocab, imv) else: model_name = args.ensemble_models model = EnsembleModel(model_name, args.ensemble_method) if args.cuda: model.cuda() model = torch.nn.DataParallel(model) print('Model is on GPU') torch.save(model, model_save) optimizable_parameters = model.parameters() loss_fn = F.mse_loss if args.loss == 'mse' else F.l1_loss optimizer = U.get_optimizer(args, optimizable_parameters) lcount = 0 model.train() for epoch in range(args.epochs): losses = [] batch_idx = -1 # pdb.set_trace() loader = ASAPDataLoader(train_dataset, train_dataset.maxlen, args.batch_size) for xs, ys, ps, padding_mask, lens, (lhs, rhs) in loader: batch_idx += 1 print('Starting batch %d' % batch_idx) if args.pos: indexes = train_dataset.tags_x[lhs:rhs] else: indexes = None
def configure_optimizers(self):
    """Build the optimizer and its LR scheduler for the Lightning trainer."""
    optimizer = get_optimizer(self.hparams, self.models)
    self.optimizer = optimizer
    lr_scheduler = get_scheduler(self.hparams, optimizer)
    return [optimizer], [lr_scheduler]
def run(args):
    """Train (or, with --test_only, evaluate) a BART-based summarization
    model. Supports four model variants selected by args.input_type /
    args.use_multi_head, optional distributed training, apex mixed precision,
    and auxiliary informativeness losses.
    """
    save_args(args, with_tensorboard=True)
    tokenizer = BartTokenizer.from_pretrained("facebook/bart-large")
    tokenizer.bos_token = BOS_OUTPUT_TOKEN  # For decoding specifically
    # One streaming JSONL dataset per split; only train shuffles, and
    # train/valid repeat forever.
    train_dataset, eval_dataset, test_dataset = [
        SequentialJSONIterableDataset(
            os.path.join(args.datadir, f"{split}_*.clf.jsonl"),
            args=args,
            process_lines=False,
            reservoir_shuffle=shuffle,
            repeat=repeat,
            reservoir_size=args.reservoir_shuffle_size,
        )
        for (split, shuffle, repeat) in [
            ("train", True, True),
            ("valid", False, True),
            ("test", False, False),
        ]
    ]
    # Multiple inputs. Use Informativeness
    if args.input_type == "all":
        # ControlCode or generic Bart
        model = MultiInputBartForConditionalGeneration.from_pretrained(
            args.model_name_or_path
        )
        # MultiHead
        if args.use_multi_head:
            model = MultiInputMultiHeadBartForConditionalGeneration.from_pretrained_multi(
                args.model_name_or_path
            )
    elif args.use_multi_head:
        # MultiHead
        model = MultiHeadBartForConditionalGeneration.from_pretrained_multi(
            args.model_name_or_path
        )
    else:
        # ControlCode or generic Bart
        model = BartForConditionalGeneration.from_pretrained(args.model_name_or_path)
    # Set special token IDs for eval function
    model.config.decoder_start_token_id = tokenizer.bos_token_id
    model.config.pad_token_id = (
        tokenizer.pad_token_id
    )  # Might not be necessary, but idk
    if args.cuda:
        model = model.to("cuda")
    if args.distributed:
        model = utils_dist.wrap(model, args)
    optimizer = get_optimizer(args, model)
    if args.use_apex and HAS_APEX:
        model, optimizer = amp.initialize(model, optimizer, opt_level="O2")
    # Collate function specialized to this run's input/target configuration.
    collate_fn_filled = functools.partial(
        collate_fn,
        input_type=args.input_type,
        modify_prefix=(not args.no_modify_prefix),
        target_type=args.target_type,
    )
    if args.test_only:
        # run on test set
        print("=== TEST/EVAL ONLY, no training")
        named_splits = {
            "train": train_dataset,
            "valid": eval_dataset,
            "test": test_dataset,
        }
        selected_split = named_splits[args.test_split]
        eval_iter = DataLoader(
            selected_split,
            batch_size=args.eval_batch_size,
            collate_fn=collate_fn_filled,
            num_workers=1,
            worker_init_fn=worker_init_fn,
        )
        results = evaluation(args, model, 0, tokenizer, eval_iter, write_summary=False)
        print(results["rouge_scores"])
        # Save results in JSON file
        results_filename = Path(args.logdir) / f"{args.test_split}_results.json"
        with results_filename.open("w") as f:
            json.dump(results, f, indent=2, sort_keys=True)
        return
    # --- training ---
    model.train()
    global_step = 0
    grad_acc_step = 0
    loss_tensor_log = []
    train_iter = DataLoader(
        train_dataset,
        batch_size=args.per_unit_batch_size,
        collate_fn=collate_fn_filled,
        num_workers=args.num_data_workers,
        worker_init_fn=worker_init_fn,
    )
    eval_iter = DataLoader(
        eval_dataset,
        batch_size=args.eval_batch_size,
        collate_fn=collate_fn_filled,
        num_workers=args.num_data_workers,
        worker_init_fn=worker_init_fn,
    )
    for _, (_, input_texts, output_texts) in enumerate(train_iter):
        if len(input_texts) == 0:
            continue
        # Prohibit batches with no contribution summaries at all
        if sum(len(out) for out in output_texts) == 0:
            continue
        # MultiHead + Auxiliary loss (Informativeness)
        if args.target_type == "both" and args.use_multi_head:
            # Flatten the per-example (contrib, context) pairs for tokenization.
            if args.input_type == "paper":
                ips = [i for ip in input_texts for i in ip]
            elif args.input_type == "all":
                ips = [list(ip[0]) for ip in input_texts]
            ops = [o for op in output_texts for o in op]
            tok_input, tok_output, labels = tokenize_batch(
                ips, ops, model, tokenizer, args
            )
            # Prepare inputs
            if args.input_type == "paper":
                # Keep only the first of the two duplicated encoder inputs.
                tok_input["input_ids"] = tok_input["input_ids"].view(
                    args.per_unit_batch_size, 2, -1
                )[:, 0, :]
                tok_input["attention_mask"] = tok_input["attention_mask"].view(
                    args.per_unit_batch_size, 2, -1
                )[:, 0, :]
                additional_kwargs = {}
            elif args.input_type == "all":
                # Multi-input model: lists of tensors, one per input channel.
                new_tok_input = {}
                new_tok_input["input_ids"] = [t["input_ids"] for t in tok_input]
                new_tok_input["attention_mask"] = [
                    t["attention_mask"] for t in tok_input
                ]
                tok_input = new_tok_input
                additional_kwargs = {
                    "final_layer": [None, None, None],
                    "input_modes": ["LogL", "MI_inbound", "MI_outbound"],
                    "informativeness": args.use_informativeness,
                }
            # b x [cont, ctx] x seq_len
            tok_output["input_ids"] = tok_output["input_ids"].view(
                args.per_unit_batch_size, 2, -1
            )
            tok_output["attention_mask"] = tok_output["attention_mask"].view(
                args.per_unit_batch_size, 2, -1
            )
            labels = labels.view(args.per_unit_batch_size, 2, -1)
            # Fixing the strange behavior of torch.distributed where some values
            # are overwritten when the sequence length is just one.
            for b in range(args.per_unit_batch_size):
                tok_output["input_ids"][b][tok_output["input_ids"][b][:, 0] == 1, 0] = 2
                tok_output["attention_mask"][b][
                    tok_output["attention_mask"][b][:, 0] == 0, 0
                ] = 1
                labels[b][labels[b][:, 0] == -100, 0] = 2
            # Split back into the two decoder targets (contrib, context).
            all_labels = []
            all_dec_inputs = []
            for idx in range(2):
                all_dec_inputs.append(
                    dict(
                        input_ids=tok_output["input_ids"][:, idx, :],
                        attention_mask=tok_output["attention_mask"][:, idx, :],
                    )
                )
                all_labels.append(labels[:, idx, :])
            # Disable sync except at the beginning and the end of gradient accumulation
            if args.distributed:
                if (grad_acc_step == 0) or (
                    (grad_acc_step + 1) % args.gradient_accumulation_steps == 0
                ):
                    model.require_forward_param_sync = True
                    model.require_backward_grad_sync = True
                else:
                    model.require_forward_param_sync = False
                    model.require_backward_grad_sync = False
            # NOTE(review): the comprehension variable below shadows the outer
            # `tok_output` dict; it iterates over all_dec_inputs entries.
            outs = model(
                input_ids=tok_input["input_ids"],
                attention_mask=tok_input["attention_mask"],
                decoder_input_ids=[
                    shift_left(tok_output["input_ids"], tokenizer.bos_token_id)
                    for tok_output in all_dec_inputs
                ],
                decoder_attention_mask=[
                    tok_output["attention_mask"] for tok_output in all_dec_inputs
                ],
                lm_labels=all_labels,
                **additional_kwargs,
            )
            # MultiHead + Informativeness
            if args.input_type == "all":
                # losses for generating both contrib & context
                if args.use_informativeness:
                    # MI_outbound: informativeness
                    contrib_loss = (
                        outs["LogL"][0][0] + args.aux_scale * outs["MI_outbound"][0][0]
                    )
                    context_loss = (
                        outs["LogL"][1][0] + args.aux_scale * outs["MI_inbound"][1][0]
                    )
                else:
                    contrib_loss = (
                        outs["LogL"][0][0]
                        - args.aux_scale * outs["MI_inbound"][0][0]
                        + (
                            args.aux_scale * outs["MI_outbound"][0][0]
                            if not args.use_adaptive_scale
                            else 0
                        )
                    )
                    context_loss = (
                        outs["LogL"][1][0]
                        + (
                            args.aux_scale * outs["MI_inbound"][1][0]
                            if not args.use_adaptive_scale
                            else 0
                        )
                        - args.aux_scale * outs["MI_outbound"][1][0]
                    )
                loss = (contrib_loss + context_loss) / 2
                losses = [
                    outs["LogL"][0][0],
                    outs["MI_inbound"][0][0],
                    outs["MI_outbound"][0][0],
                    outs["LogL"][1][0],
                    outs["MI_inbound"][1][0],
                    outs["MI_outbound"][1][0],
                ]
            # multihead
            else:
                # contrib, context
                losses = [o[0] for o in outs]
                loss = sum(losses) / len(losses)
            check_nan(loss)
            # reporting logL only
            loss_tensor_log.append(
                (losses[0] if args.input_type == "all" else loss).detach()
            )
            global_step, grad_acc_step, loss_tensor_log = update_step(
                args,
                model,
                tokenizer,
                optimizer,
                loss,
                losses,
                eval_iter,
                global_step,
                grad_acc_step,
                loss_tensor_log,
            )
        else:
            # For compatibility of training loop
            if args.target_type != "both":
                input_texts, output_texts = ([input_texts], [output_texts])
            input_texts, output_texts = zip(*input_texts), zip(*output_texts)
            heads = ["contrib", "context"]
            losses = []
            # loop over the two targets
            for input_text, output_text, head in zip(input_texts, output_texts, heads):
                tok_input, tok_output, labels = tokenize_batch(
                    input_text, output_text, model, tokenizer, args
                )
                # Same gradient-accumulation sync gating as above.
                if args.distributed:
                    if (grad_acc_step == 0) or (
                        (grad_acc_step + 1) % args.gradient_accumulation_steps == 0
                    ):
                        model.require_forward_param_sync = True
                        model.require_backward_grad_sync = True
                    else:
                        model.require_forward_param_sync = False
                        model.require_backward_grad_sync = False
                # Auxiliary loss: informativeness
                if args.input_type == "all":
                    outs = model(
                        input_ids=[t["input_ids"] for t in tok_input],
                        attention_mask=[t["attention_mask"] for t in tok_input],
                        decoder_input_ids=shift_left(
                            tok_output["input_ids"], tokenizer.bos_token_id
                        ),
                        decoder_attention_mask=tok_output["attention_mask"],
                        lm_labels=labels,
                    )
                else:
                    outs = model(
                        input_ids=tok_input["input_ids"],
                        attention_mask=tok_input["attention_mask"],
                        decoder_input_ids=shift_left(
                            tok_output["input_ids"], tokenizer.bos_token_id
                        ),
                        decoder_attention_mask=tok_output["attention_mask"],
                        lm_labels=labels,
                    )
                if args.input_type == "all":
                    losses += outs[0]
                    # Per-target coefficients over [LogL, MI_inbound, MI_outbound].
                    if args.target_type == "contrib":
                        if args.use_informativeness:
                            coeff = [
                                1,
                                0,
                                args.aux_scale,
                            ]
                        else:
                            coeff = [
                                1,
                                -args.aux_scale,
                                args.aux_scale if not args.use_adaptive_scale else 0,
                            ]
                    elif args.target_type == "context":
                        if args.use_informativeness:
                            coeff = [
                                1,
                                args.aux_scale,
                                0,
                            ]
                        else:
                            coeff = [
                                1,
                                args.aux_scale if not args.use_adaptive_scale else 0,
                                -args.aux_scale,
                            ]
                    loss = sum(l * c for l, c in zip(outs[0], coeff))
                elif args.use_multi_head:
                    loss = outs[0 if head == "contrib" else 1][0]
                else:
                    loss = outs[0]
                check_nan(loss)
                loss_tensor_log.append(
                    (losses[0] if args.input_type == "all" else loss).detach()
                )
                global_step, grad_acc_step, loss_tensor_log = update_step(
                    args,
                    model,
                    tokenizer,
                    optimizer,
                    loss,
                    losses,
                    eval_iter,
                    global_step,
                    grad_acc_step,
                    loss_tensor_log,
                )