def time_profiling(results_dir, model, task_sampler, loader_train, device, config):
    """
    Measure the time necessary to perform forward / backward prop on current device.

    Timing lists are pickled to <results_dir>/forward_times.pkl and
    <results_dir>/backward_times.pkl (one float per task, in seconds).

    :param results_dir: directory where the timing lists are dumped
    :param model: super-net whose forward takes (x, task)
    :param task_sampler: provides get_all(return_metrics=False) -> list of tasks
    :param task_sampler: sampler over architecture tasks
    :param loader_train: training data loader (cycled so it never runs out)
    :param device: device the model lives on
    :param config: experiment config dict (uses TRAIN.OPTIMIZER / TRAIN.CRITERION)
    :return: None
    """
    # creating input
    all_tasks = task_sampler.get_all(return_metrics=False)
    cycle_train = cycle(loader_train)

    # training stuff
    optimizer = get_optimizer(model.parameters(), config["TRAIN"]["OPTIMIZER"])
    criterion = get_criterion(config["TRAIN"]["CRITERION"])

    # output
    times_forward = []
    times_backward = []

    # CUDA kernels are asynchronous: without a synchronize, time.time() measures
    # kernel *launch* time, not execution time.
    use_cuda = torch.cuda.is_available() and "cuda" in str(device)

    def _sync():
        # no-op on CPU; waits for pending kernels on GPU
        if use_cuda:
            torch.cuda.synchronize(device)

    # warm-up (cudnn autotuning, lazy allocations, ...)
    for i in tqdm(range(10), ncols=100):
        task = all_tasks[i]
        x, y = next(cycle_train)
        x, y = x.to(device), y.to(device)
        preds = model(x, task)
        loss = criterion(preds, y)
        loss.backward()
        # FIX: clear gradients accumulated during warm-up so the first measured
        # backward does not add onto 10 stale .grad buffers.
        optimizer.zero_grad()

    for i in tqdm(range(len(all_tasks)), ncols=100):
        task = all_tasks[i]
        x, y = next(cycle_train)
        x, y = x.to(device), y.to(device)

        # forward time
        _sync()
        t_f = time.time()
        pred = model(x, task)
        _sync()
        dt_f = time.time() - t_f
        times_forward.append(dt_f)

        # backward time (includes loss computation and the optimizer update)
        t_b = time.time()
        loss = criterion(pred, y)
        loss.backward()
        optimizer.step()
        optimizer.zero_grad()
        _sync()
        dt_b = time.time() - t_b
        times_backward.append(dt_b)

    with open(os.path.join(results_dir, "forward_times.pkl"), "wb") as f:
        pickle.dump(times_forward, f)
    with open(os.path.join(results_dir, "backward_times.pkl"), "wb") as f:
        pickle.dump(times_backward, f)
def __init__(self, hparams):
    """Build the network, criterion and optional CutMix / MixUp augmenters.

    :param hparams: argparse.Namespace-like hyper-parameter container
        (expects .cutmix, .mixup, .size, .api_key at minimum).
    """
    super(Net, self).__init__()
    # stored via update() so the framework can serialize hparams
    self.hparams.update(vars(hparams))
    self.model = get_model(hparams)
    # FIX: was get_criterion(args) — `args` is undefined in this scope; the
    # intended argument is the hparams namespace passed to the constructor.
    self.criterion = get_criterion(hparams)
    if hparams.cutmix:
        self.cutmix = CutMix(hparams.size, beta=1.)
    if hparams.mixup:
        self.mixup = MixUp(alpha=1.)
    # log images locally only when no API key is configured for the remote logger
    self.log_image_flag = hparams.api_key is None
def main():
    """Load a checkpoint, run the test set through it and dump predictions to .npz."""
    parser = argparse.ArgumentParser()
    parser.add_argument('--ckpt', type=str, required=True)
    parser.add_argument('--outdir', type=str, default='test_out')
    parser.add_argument('--batch_size', type=int, default=128)
    parser.add_argument('--num_workers', type=int, default=4)
    parser.add_argument('--device', type=str, default='cuda')
    args = parser.parse_args()

    # FIX: map_location='cpu' lets a GPU-saved checkpoint load on a CPU-only
    # machine; tensors are moved to the target device via model.to(device) below.
    ckpt = torch.load(args.ckpt, map_location='cpu')
    config = ckpt['config']
    state_dict = ckpt['state_dict']
    epoch = ckpt['epoch']

    # NOTE(review): --outdir has a non-None default ('test_out'), so this branch
    # is currently dead; kept for compatibility should the default change.
    if args.outdir is None:
        outdir = pathlib.Path(args.ckpt).parent
    else:
        outdir = pathlib.Path(args.outdir)
    outdir.mkdir(exist_ok=True, parents=True)

    use_gpu = args.device != 'cpu' and torch.cuda.is_available()
    device = torch.device('cuda' if use_gpu else 'cpu')

    data_config = config['data_config']
    data_config['batch_size'] = args.batch_size
    data_config['num_workers'] = args.num_workers
    data_config['use_gpu'] = use_gpu
    _, test_loader = get_loader(data_config)

    model = utils.load_model(config['model_config'])
    try:
        model.load_state_dict(state_dict)
    except Exception:
        # checkpoint was saved from a DataParallel wrapper ('module.' prefixed
        # keys): wrap, load, then unwrap
        model = torch.nn.DataParallel(model)
        model.load_state_dict(state_dict)
        model = model.module
    model.to(device)

    _, test_criterion = utils.get_criterion(config['data_config'])

    preds, probs, labels, loss, acc = predict(model, test_criterion,
                                              test_loader, device)

    outpath = outdir / 'preds_{:04}.npz'.format(epoch)
    np.savez(outpath, preds=preds, probs=probs, labels=labels, loss=loss,
             acc=acc)
def ft_weights(model, task, train_iter, device, config):
    """
    Fine-tune the weights of the super-net to the task at hand.

    Runs exactly config["EVAL"]["n_ft_weights"] optimizer steps, looping over
    train_iter as many times as needed. The scheduler is stepped once per
    optimizer step (its T_max equals the step budget).

    :param model: super-net whose forward takes (x, task)
    :param task: the fixed task (architecture) to fine-tune for
    :param train_iter: iterable of (x, y) training batches
    :param device: device to run on
    :param config: config dict (TRAIN.OPTIMIZER / TRAIN.SCHEDULER / TRAIN.CRITERION, EVAL.n_ft_weights)
    :return: None
    """
    model.train()

    # optimization components
    optimizer = get_optimizer(model.parameters(), config["TRAIN"]["OPTIMIZER"])
    scheduler = get_scheduler(optimizer, {
        "name": config["TRAIN"]["SCHEDULER"]["name"],
        "T_max": config["EVAL"]["n_ft_weights"],
    })
    criterion = get_criterion(config["TRAIN"]["CRITERION"])

    max_steps = config["EVAL"]["n_ft_weights"]
    step = 0
    while step < max_steps:
        for inputs, targets in train_iter:
            inputs = inputs.to(device)
            targets = targets.to(device)

            predictions = model.forward(inputs, task)
            loss = criterion(predictions, targets)

            optimizer.zero_grad()
            loss.backward()
            optimizer.step()
            scheduler.step()

            step += 1
            if step >= max_steps:
                break
def main(args):
    """Train a model (resuming from a checkpoint if present) and save final weights.

    :param args: argparse.Namespace with n_epochs, checkpoint_dir,
        checkpoint_interval, final_save_fpath, and model/data settings.
    """
    # NOTE(review): `device` is not defined in this function — presumably a
    # module-level global; confirm.
    model = get_model(args).to(device)
    optimizer = get_optimizer(args, model)
    lr_scheduler = get_scheduler(args, optimizer)
    criterion = get_criterion(args)
    start_epoch = 0
    trainloader, valloader, testloader = load_data(args)

    CHECKPOINT_PATH = f'{args.checkpoint_dir}/checkpoint.tar'
    if os.path.exists(CHECKPOINT_PATH):
        model, optimizer, lr_scheduler, start_epoch = checkpoint_load(
            model, optimizer, lr_scheduler, CHECKPOINT_PATH)

    print('Started training!')
    mean_losses = []
    for epoch in range(start_epoch, args.n_epochs):
        mean_train_loss = train_epoch(args, model, optimizer, criterion,
                                      trainloader)
        # FIX: step the scheduler AFTER the epoch's optimizer updates
        # (PyTorch >= 1.1 ordering). Stepping before training skipped the
        # initial learning rate and shifted the whole schedule by one epoch.
        if lr_scheduler is not None:
            lr_scheduler.step()
        mean_losses.append(mean_train_loss)
        train_log(args, epoch, model, criterion, valloader, mean_train_loss)
        if epoch % args.checkpoint_interval == 0:
            checkpoint_save(model, optimizer, lr_scheduler, epoch,
                            CHECKPOINT_PATH)

    if args.final_save_fpath is not None:
        torch.save(
            {
                'mean_train_loss': np.mean(mean_losses),
                'args': vars(args),
                'model_state_dict': model.state_dict()
            }, args.final_save_fpath)
def __init__(self, params, ispretrain):
    """Set up the encoder model, its Adam optimizer, scheduler and loss.

    :param params: config object with both dict-style ('input_option',
        'pretrain_loss_fn') and attribute-style (lr, beta1, beta2,
        weight_decay, init) access.
    :param ispretrain: when True, also builds the parameter-reconstruction
        criterion used for pre-training.
    """
    super(EncoderTrainer, self).__init__()
    self.ispretrain = ispretrain
    self.input_option = params['input_option']
    self.weight = params
    # initiate the network module
    # FIX: removed the `torch.nn.DataParallel(...).module` wrap-then-unwrap —
    # it returned the original module unchanged and had no effect.
    self.model = resnet34_Mano(input_option=params['input_option'])
    self.mean_3d = torch.zeros(3)

    # setup the optimizer (only parameters that require gradients)
    lr = params.lr
    beta1 = params.beta1
    beta2 = params.beta2
    self.encoder_opt = torch.optim.Adam(
        [p for p in self.model.parameters() if p.requires_grad],
        lr=lr,
        betas=(beta1, beta2),
        weight_decay=params.weight_decay)
    # FIX: removed `nn.DataParallel(self.encoder_opt).module` — DataParallel
    # wraps nn.Module instances, not optimizers; the wrap/unwrap was a no-op
    # at best and misleading at worst.
    self.encoder_scheduler = get_scheduler(self.encoder_opt, params)

    # set loss fn (pre-training only)
    if self.ispretrain:
        self.param_recon_criterion = get_criterion(
            params['pretrain_loss_fn'])

    # network weight initialization
    self.model.apply(weights_init(params.init))
    self.transformer = mm2px.JointTransfomer('BB')
def main():
    """Full training entry point: config parsing, seeding, train/test loop, checkpointing."""
    # parse command line argument and generate config dictionary
    config = parse_args()
    logger.info(json.dumps(config, indent=2))

    run_config = config['run_config']
    optim_config = config['optim_config']

    # TensorBoard SummaryWriter
    if run_config['tensorboard']:
        writer = SummaryWriter(run_config['outdir'])
    else:
        writer = None

    # set random seed
    seed = run_config['seed']
    torch.manual_seed(seed)
    np.random.seed(seed)
    random.seed(seed)
    # one pre-drawn seed per epoch so each epoch's randomness is reproducible
    epoch_seeds = np.random.randint(
        np.iinfo(np.int32).max // 2, size=optim_config['epochs'])

    # create output directory
    outdir = pathlib.Path(run_config['outdir'])
    outdir.mkdir(exist_ok=True, parents=True)

    # save config as json file in output directory
    outpath = outdir / 'config.json'
    with open(outpath, 'w') as fout:
        json.dump(config, fout, indent=2)

    # load data loaders
    train_loader, test_loader = get_loader(config['data_config'])

    # load model
    logger.info('Loading model...')
    model = utils.load_model(config['model_config'])
    n_params = sum([param.view(-1).size()[0] for param in model.parameters()])
    logger.info('n_params: {}'.format(n_params))

    if run_config['fp16']:
        model.half()
        # keep BatchNorm in fp32 for numerical stability under fp16 training
        for layer in model.modules():
            if isinstance(layer, nn.BatchNorm2d):
                layer.float()

    device = run_config['device']
    # FIX: was `device is not 'cpu'` — string *identity* comparison, which is
    # implementation-dependent (and a SyntaxWarning on modern Python); a value
    # comparison is intended.
    if device != 'cpu' and torch.cuda.device_count() > 1:
        model = nn.DataParallel(model)
    model.to(device)
    logger.info('Done')

    train_criterion, test_criterion = utils.get_criterion(
        config['data_config'])

    # create optimizer
    optim_config['steps_per_epoch'] = len(train_loader)
    optimizer, scheduler = utils.create_optimizer(model.parameters(),
                                                  optim_config)

    # run test before start training
    if run_config['test_first']:
        test(0, model, test_criterion, test_loader, run_config, writer)

    state = {
        'config': config,
        'state_dict': None,
        'optimizer': None,
        'epoch': 0,
        'accuracy': 0,
        'best_accuracy': 0,
        'best_epoch': 0,
    }
    epoch_logs = []
    for epoch, seed in zip(range(1, optim_config['epochs'] + 1), epoch_seeds):
        np.random.seed(seed)
        # train
        train_log = train(epoch, model, optimizer, scheduler, train_criterion,
                          train_loader, config, writer)
        # test
        test_log = test(epoch, model, test_criterion, test_loader, run_config,
                        writer)

        epoch_log = train_log.copy()
        epoch_log.update(test_log)
        epoch_logs.append(epoch_log)
        utils.save_epoch_logs(epoch_logs, outdir)

        # update state dictionary
        state = update_state(state, epoch, epoch_log['test']['accuracy'],
                             model, optimizer)

        # save model
        utils.save_checkpoint(state, outdir)
def train_scale_equiv(model, n_epochs, train_loader_sup, train_dataset_unsup,
                      val_loader, criterion_supervised, optimizer, scheduler,
                      Loss, gamma, batch_size, save_folder, model_name,
                      benchmark=False, angle_max=30, size_img=520,
                      scale_factor=(0.5, 1.2), save_all_ep=True,
                      dataroot_voc='~/data/voc2012', save_best=False,
                      device='cpu', num_classes=21):
    """
    A complete training of a scale-equivariance semi-supervised model.

    Each epoch runs a combined supervised + scale-equivariance training step
    (loss = gamma * loss_sup + (1 - gamma) * loss_unsup), evaluates on
    val_loader and rewrites all metric curves to save_folder.

    save_folder  : path to save the loss/metric curves
    benchmark    : enable or disable torch.backends.cudnn.benchmark
    Loss         : loss for unsupervised training: 'KL', 'CE', 'L1' or 'MSE'
    gamma        : float in [0,1] balancing loss_sup*gamma + (1-gamma)*loss_unsup
    save_all_ep  : if True, the model is saved at each epoch in save_folder
    scheduler    : if truthy, a polynomial-decay LambdaLR is applied during training
    size_img     : size of image during evaluation
    scale_factor : scale range between min*size_img and max*size_img

    NOTE(review): model_name / save_all_ep / save_best are accepted but no
    model checkpoint is written in this function — confirm whether a
    U.save_model call is missing by design.
    """
    torch.backends.cudnn.benchmark = benchmark
    if scheduler:
        # polynomial LR decay over the total number of supervised steps
        lr_scheduler = torch.optim.lr_scheduler.LambdaLR(
            optimizer,
            lambda x: (1 - x / (len(train_loader_sup) * n_epochs))**0.9)
    criterion_unsupervised = U.get_criterion(Loss)
    print('Criterion Unsupervised', criterion_unsupervised)

    # per-epoch metric histories (dumped by U.save_curves each epoch)
    iou_train = []
    iou_test = []
    combine_loss_train = []
    combine_loss_test = []
    loss_train_unsup = []
    loss_train_sup = []
    loss_test_unsup = []
    loss_test_sup = []
    equiv_accuracy_train = []
    equiv_accuracy_test = []
    accuracy_test = []
    accuracy_train = []

    # FIX: removed `torch.autograd.set_detect_anomaly(True)` — a debugging
    # leftover that makes every backward pass dramatically slower without
    # changing training results.

    for ep in range(n_epochs):
        # rebuilt each epoch; shuffle=True reshuffles the unsupervised set
        train_loader_equiv = torch.utils.data.DataLoader(
            train_dataset_unsup, batch_size=batch_size,
            shuffle=True, drop_last=True)
        print("EPOCH", ep)

        # TRAINING
        d = train_step_scale_equiv(
            model, train_loader_sup, train_loader_equiv,
            criterion_supervised, criterion_unsupervised,
            optimizer, gamma, Loss, device,
            size_img=size_img, scale_factor=scale_factor)
        if scheduler:
            lr_scheduler.step()
        combine_loss_train.append(d['loss'])
        loss_train_unsup.append(d['loss_equiv'])
        loss_train_sup.append(d['loss_sup'])
        equiv_accuracy_train.append(d['equiv_acc'])
        iou_train.append(d['iou_train'])
        accuracy_train.append(d['accuracy_train'])
        print('TRAIN - EP:', ep, 'iou:', d['iou_train'],
              'Accuracy:', d['accuracy_train'], 'Loss sup:', d['loss_sup'],
              'Loss equiv:', d['loss_equiv'], 'Combine Loss:', d['loss'],
              'Equivariance Accuracy:', d['equiv_acc'],)

        # EVALUATION
        model.eval()
        with torch.no_grad():
            state = eval_model(model, val_loader, device=device,
                               num_classes=num_classes)
            iou = state.metrics['mean IoU']
            acc = state.metrics['accuracy']
            loss = state.metrics['CE Loss']
        loss_test_sup.append(loss)
        iou_test.append(iou)
        accuracy_test.append(acc)
        print('TEST - EP:', ep, 'iou:', iou, 'Accuracy:', acc, 'Loss CE', loss)

        # curves are rewritten every epoch so a crash loses at most one epoch
        U.save_curves(path=save_folder,
                      combine_loss_train=combine_loss_train,
                      loss_train_sup=loss_train_sup,
                      loss_train_unsup=loss_train_unsup,
                      iou_train=iou_train,
                      accuracy_train=accuracy_train,
                      equiv_accuracy_train=equiv_accuracy_train,
                      combine_loss_test=combine_loss_test,
                      loss_test_unsup=loss_test_unsup,
                      equiv_accuracy_test=equiv_accuracy_test,
                      loss_test_sup=loss_test_sup,
                      iou_test=iou_test,
                      accuracy_test=accuracy_test)
def main():
    """Train and/or test a GAT on the configured node-classification dataset."""
    config = utils.parse_args()
    if config['cuda'] and torch.cuda.is_available():
        device = 'cuda:0'
    else:
        device = 'cpu'

    dataset_args = (config['task'], config['dataset'], config['dataset_path'],
                    'train', config['num_layers'], config['self_loop'],
                    config['normalize_adj'], config['transductive'])
    dataset = utils.get_dataset(dataset_args)
    loader = DataLoader(dataset=dataset,
                        batch_size=config['batch_size'],
                        shuffle=True,
                        collate_fn=dataset.collate_wrapper)
    input_dim, output_dim = dataset.get_dims()
    model = models.GAT(input_dim, config['hidden_dims'], output_dim,
                       config['num_heads'], config['dropout'], device)
    model.to(device)

    if not config['load']:
        criterion = utils.get_criterion(config['task'])
        optimizer = optim.Adam(model.parameters(),
                               lr=config['lr'],
                               weight_decay=config['weight_decay'])
        epochs = config['epochs']
        stats_per_batch = config['stats_per_batch']
        num_batches = int(ceil(len(dataset) / config['batch_size']))
        model.train()
        print('--------------------------------')
        print('Training.')
        for epoch in range(epochs):
            print('Epoch {} / {}'.format(epoch + 1, epochs))
            running_loss = 0.0
            num_correct, num_examples = 0, 0
            for (idx, batch) in enumerate(loader):
                features, node_layers, mappings, rows, labels = batch
                features, labels = features.to(device), labels.to(device)
                optimizer.zero_grad()
                out = model(features, node_layers, mappings, rows)
                loss = criterion(out, labels)
                loss.backward()
                optimizer.step()
                # stats are accumulated outside the autograd graph
                with torch.no_grad():
                    running_loss += loss.item()
                    predictions = torch.max(out, dim=1)[1]
                    num_correct += torch.sum(predictions == labels).item()
                    num_examples += len(labels)
                    if (idx + 1) % stats_per_batch == 0:
                        running_loss /= stats_per_batch
                        accuracy = num_correct / num_examples
                        print(' Batch {} / {}: loss {}, accuracy {}'.format(
                            idx + 1, num_batches, running_loss, accuracy))
                        running_loss = 0.0
                        num_correct, num_examples = 0, 0
        print('Finished training.')
        print('--------------------------------')
        if config['save']:
            print('--------------------------------')
            directory = os.path.join(os.path.dirname(os.getcwd()),
                                     'trained_models')
            if not os.path.exists(directory):
                os.makedirs(directory)
            fname = utils.get_fname(config)
            path = os.path.join(directory, fname)
            print('Saving model at {}'.format(path))
            torch.save(model.state_dict(), path)
            print('Finished saving model.')
            print('--------------------------------')

    if config['load']:
        directory = os.path.join(os.path.dirname(os.getcwd()),
                                 'trained_models')
        fname = utils.get_fname(config)
        path = os.path.join(directory, fname)
        model.load_state_dict(torch.load(path))
        dataset_args = (config['task'], config['dataset'],
                        config['dataset_path'], 'test', config['num_layers'],
                        config['self_loop'], config['normalize_adj'],
                        config['transductive'])
        dataset = utils.get_dataset(dataset_args)
        loader = DataLoader(dataset=dataset,
                            batch_size=config['batch_size'],
                            shuffle=False,
                            collate_fn=dataset.collate_wrapper)
        criterion = utils.get_criterion(config['task'])
        stats_per_batch = config['stats_per_batch']
        num_batches = int(ceil(len(dataset) / config['batch_size']))
        model.eval()
        print('--------------------------------')
        print('Testing.')
        running_loss, total_loss = 0.0, 0.0
        num_correct, num_examples = 0, 0
        total_correct, total_examples = 0, 0
        # FIX: run the whole test loop under torch.no_grad() — evaluation was
        # building autograd graphs for every batch, wasting time and memory.
        with torch.no_grad():
            for (idx, batch) in enumerate(loader):
                features, node_layers, mappings, rows, labels = batch
                features, labels = features.to(device), labels.to(device)
                out = model(features, node_layers, mappings, rows)
                loss = criterion(out, labels)
                running_loss += loss.item()
                total_loss += loss.item()
                predictions = torch.max(out, dim=1)[1]
                num_correct += torch.sum(predictions == labels).item()
                total_correct += torch.sum(predictions == labels).item()
                num_examples += len(labels)
                total_examples += len(labels)
                if (idx + 1) % stats_per_batch == 0:
                    running_loss /= stats_per_batch
                    accuracy = num_correct / num_examples
                    print(' Batch {} / {}: loss {}, accuracy {}'.format(
                        idx + 1, num_batches, running_loss, accuracy))
                    running_loss = 0.0
                    num_correct, num_examples = 0, 0
        total_loss /= num_batches
        total_accuracy = total_correct / total_examples
        print('Loss {}, accuracy {}'.format(total_loss, total_accuracy))
        print('Finished testing.')
        print('--------------------------------')
#model_name = 'rot_equiv_lc.pt' # saved model name model_name = 'rot_equiv_lc.pt' folder_model = join(load_dir, exp) #fcn= True #pretrained=True # GPU gpu = 1 # EVAL PARAMETERS bs = 2 # LOSS criterion_supervised = nn.CrossEntropyLoss( ignore_index=21) # On ignore la classe border. Loss = 'KL' # Loss = 'KL' or 'CE' or None for L1,MSE… criterion_unsupervised = U.get_criterion(Loss) # SEARCH FOR A PARTICULAR MODEL rotate = False # random rotation during training scale = False split = True # split the supervised dataset split_ratio = 0.3 batch_size = 4 pi_rotate = False #scale_factor = (0.2,0.8) #size_img = (420,420) #size_crop = (380,380) # DEVICE # Decide which device we want to run on
def train_rot_equiv(model,n_epochs,train_loader_sup,train_dataset_unsup,val_loader,criterion_supervised,optimizer,scheduler,\
    Loss,gamma,batch_size,iter_every,save_folder,model_name,benchmark=False,angle_max=30,size_img=520,\
    eval_every=5,save_all_ep=True,dataroot_voc='~/data/voc2012',save_best=False,rot_cpu=False, device='cpu',num_classes=21):
    """
    A complete training of a rotation-equivariance supervised model.

    Each epoch combines a supervised step with a rotation-equivariance step
    (loss = gamma * loss_sup + (1 - gamma) * loss_unsup), evaluates on
    val_loader, saves the model and rewrites all metric curves.

    save_folder : path to save the model and the loss/metric curves
    benchmark   : enable or disable torch.backends.cudnn.benchmark
    Loss        : loss for unsupervised training: 'KL', 'CE', 'L1' or 'MSE'
    gamma       : float in [0,1] balancing loss_sup*gamma + (1-gamma)*loss_unsup
    save_all_ep : if True, the model is saved at each epoch in save_folder
    scheduler   : if truthy, a polynomial-decay LambdaLR is applied during training
    eval_every  : evaluate equivariance accuracy on val_loader every n epochs
    size_img    : size of image during evaluation
    angle_max   : max rotation angle (degrees) for input images
    """
    torch.backends.cudnn.benchmark = benchmark
    if scheduler:
        # polynomial LR decay over the total number of supervised steps
        lr_scheduler = torch.optim.lr_scheduler.LambdaLR(
            optimizer, lambda x: (1 - x / (len(train_loader_sup) * n_epochs))**0.9)
    criterion_unsupervised = U.get_criterion(Loss)
    # per-epoch metric histories (dumped by U.save_curves each epoch)
    iou_train = []
    iou_test = []
    combine_loss_train = []
    combine_loss_test = []
    loss_train_unsup = []
    loss_train_sup = []
    loss_test_unsup = []
    loss_test_sup = []
    equiv_accuracy_train = []
    equiv_accuracy_test = []
    accuracy_test = []
    accuracy_train = []
    for ep in range(n_epochs):
        # rebuilt each epoch; shuffle=True reshuffles the unsupervised set
        train_loader_equiv = torch.utils.data.DataLoader(train_dataset_unsup,batch_size=batch_size,\
            shuffle=True,drop_last=True)
        print("EPOCH", ep)
        # TRAINING
        d = train_step_rot_equiv(model,train_loader_sup,train_loader_equiv,criterion_supervised,criterion_unsupervised,\
            optimizer,gamma,Loss,rot_cpu=rot_cpu,device=device,angle_max=angle_max,num_classes=num_classes,iter_every=iter_every)
        if scheduler:
            lr_scheduler.step()
        combine_loss_train.append(d['loss'])
        loss_train_unsup.append(d['loss_equiv'])
        loss_train_sup.append(d['loss_sup'])
        equiv_accuracy_train.append(d['equiv_acc'])
        iou_train.append(d['iou_train'])
        accuracy_train.append(d['accuracy_train'])
        print('TRAIN - EP:',ep,'iou:',d['iou_train'],'Accuracy:',d['accuracy_train'],'Loss sup:',d['loss_sup'],\
            'Loss equiv:',d['loss_equiv'],'Combine Loss:',d['loss'],'Equivariance Accuracy:',d['equiv_acc'],)
        # EVALUATION
        model.eval()
        with torch.no_grad():
            state = eval_model(model, val_loader, device=device, num_classes=num_classes)
            iou = state.metrics['mean IoU']
            acc = state.metrics['accuracy']
            loss = state.metrics['CE Loss']
        loss_test_sup.append(loss)
        iou_test.append(iou)
        accuracy_test.append(acc)
        print('TEST - EP:', ep, 'iou:', iou, 'Accuracy:', acc, 'Loss CE', loss)
        # SAVING MODEL
        U.save_model(model, save_all_ep, save_best, save_folder, model_name, ep=ep, iou=iou, iou_test=iou_test)
        if ep % eval_every == 0:
            # Eval loss equiv and equivariance accuracy for the validation dataset
            equiv_acc, m_loss_equiv = U.eval_accuracy_equiv(model,val_loader,criterion=criterion_unsupervised,\
                nclass=21,device=device,Loss=Loss,plot=False,angle_max=angle_max,random_angle=False)
            loss_test_unsup.append(m_loss_equiv)
            equiv_accuracy_test.append(equiv_acc)
        """
        print('VOC Dataset Train')
        _ = eval_model_all_angle(model,size_img,dataroot_voc,train=True,device=device,num_classes=num_classes)
        print('VOC Dataset Val')
        _ = eval_model_all_angle(model,size_img,dataroot_voc,train=False,device=device,num_classes=num_classes)
        ## Save model"""
        # curves are rewritten every epoch so a crash loses at most one epoch
        U.save_curves(path=save_folder,combine_loss_train=combine_loss_train,loss_train_sup=loss_train_sup,\
            loss_train_unsup=loss_train_unsup,iou_train=iou_train,accuracy_train=accuracy_train,equiv_accuracy_train=equiv_accuracy_train,\
            combine_loss_test=combine_loss_test,loss_test_unsup=loss_test_unsup,equiv_accuracy_test=equiv_accuracy_test,\
            loss_test_sup= loss_test_sup,iou_test=iou_test,accuracy_test=accuracy_test)
def __init__(self,
             train_data,
             model,
             dev_data=None,
             eval_every=-1,
             patience=200,
             loss_fn="bce",
             train_batch_size=32,
             verbose=True,
             eval_on="loss",
             device="cpu",
             save_path=None,
             train_epochs=5,
             keep_ck_num=3,
             lr=1e-2,
             eval_batch_size=64,
             seed=211,
             use_wandb=False):
    """Set up the trainer: output directory, data loader, model, optimizer and logging.

    :param train_data: dataset providing save_vocab(path) and __len__
    :param model: the nn.Module to train
    :param dev_data: optional evaluation dataset
    :param eval_every: evaluate every N steps (-1 disables)
    :param eval_on: early-stopping metric, one of log_loss/auc/loss/accuracy
    :param save_path: checkpoint directory (prompts if non-empty)
    """
    set_seed(seed)
    if not os.path.isdir(save_path):
        os.makedirs(save_path, exist_ok=True)
    if len(os.listdir(save_path)) > 1:
        # interactive guard against clobbering a previous run
        out = input(
            "Output directory ({}) already exists and is not empty, you wanna remove it before start? (y/n)"
            .format(save_path))
        if out.lower() == "y":
            shutil.rmtree(save_path)
            os.makedirs(save_path, exist_ok=True)
            # we need keep the vocab file
            train_data.save_vocab(save_path)
        else:
            raise ValueError(
                "Output directory ({}) already exists and is not empty".
                format(save_path))
    self.tb_writer = SummaryWriter()
    # FIX: removed dead `self.eval_every = -1` here — it was unconditionally
    # overwritten by `self.eval_every = eval_every` further down.
    self.keep_ck_num = keep_ck_num
    self.train_data = train_data
    self.train_batch_size = train_batch_size
    self.train_dataloader = DataLoader(train_data,
                                       batch_size=train_batch_size,
                                       shuffle=True)
    self.set_logger(save_path)
    self.total_train_steps = len(self.train_dataloader) * train_epochs
    if verbose:
        logger.info(model)
        total_count, trainable_count, non_trainable_count = count_dm_params(
            model)
        logger.info(f' Total params: {total_count}')
        logger.info(f' Trainable params: {trainable_count}')
        logger.info(f' Non-trainable params: {non_trainable_count}')
        logger.info(f" There are {len(train_data)} training examples")
        # FIX: `!= None` replaced with the idiomatic identity check
        if dev_data is not None:
            logger.info(
                f" There are {len(dev_data)} examples for development")
    self.model = model.to(device)
    self.train_epochs = train_epochs
    self.device = device
    self.eval_batch_size = eval_batch_size
    self.dev_data = dev_data
    self.criterion = get_criterion(loss_fn)
    self.loss_fn = loss_fn
    # self.optimizer = optim.SGD(self.model.parameters(), lr=0.001, momentum=0.9)
    self.optimizer = optim.Adam(self.model.parameters(),
                                lr=lr,
                                betas=(0.9, 0.999),
                                eps=1e-8)  # use default
    self.lr = lr
    self.eval_every = eval_every
    self.save_path = save_path
    assert eval_on in ["log_loss", "auc", "loss", "accuracy"]
    self.eval_on = eval_on
    self.best_score = -float("inf")
    self.patience = patience
    self.no_improve_count = 0
    self.use_wandb = use_wandb
    # attribute names mirrored into the wandb/hyper-parameter dict below
    self.hyperparams_logging = [
        "train_epochs", "eval_batch_size", "train_batch_size",
        "no_improve_count", "device", "patience", "save_path", "eval_on",
        "eval_every", "use_wandb", "loss_fn", "keep_ck_num", "lr"
    ]
    self.hyperparam_dict = {
        key: self.__dict__[key]
        for key in self.hyperparams_logging
    }
    if is_wandb_available() and use_wandb:
        # keep track of model topology and gradients
        wandb.init(project="deep_ctr",
                   config=self.hyperparam_dict,
                   name="_".join(save_path.split(os.path.sep)))
        wandb.watch((self.model), log_freq=max(100, eval_every))
# attach the file and console handlers assembled above, then echo the command line
logger.addHandler(fh)
logger.addHandler(ch)
logger.info(' '.join(sys.argv))
for m in msg:
    # FIX: Logger.warn is a deprecated alias of Logger.warning
    logger.warning(m)
config, data_config = parse_config(args.config)
logger.info("Get model")
model = get_model(config['model']['name'], **config['model']['params'])
logger.info("Get optimizer, scheduler and criterion")
optimizer = get_optimizer(config['optimizer'], model.parameters())
scheduler = get_scheduler(config['scheduler'], optimizer)
criterion = get_criterion(config['criterion'])
logger.info("Get trainer")
trainer = Trainer(save_path=args.dir,
                  model=model,
                  optimizer=optimizer,
                  criterion=criterion,
                  # max_epoch=config['max_epoch'],
                  max_steps=config['max_steps'],
                  logger=logger,
                  scheduler=scheduler,
                  auto_resume=args.resume,
                  log_frequence=args.log_frequence,
                  save_frequence=args.save_frequence,
                  eval_frequence=args.eval_frequence)
def train_supernet(results_dir, model, task_sampler, train_iter, valid_iter, device, config):
    """
    Train the super-net by Monte-Carlo sampling tasks (architectures) each batch.

    Each training batch averages the loss over n_monte sampled tasks before a
    single optimizer step; validation (if enabled) evaluates one sampled task
    per batch. Metrics are averaged per epoch, optionally logged to
    TensorBoard, and model weights + metrics are periodically written to
    results_dir.

    :param results_dir: output directory (config.yaml, weights, total_metrics.pkl)
    :param model: super-net; forward takes (x, task); exposes none_grad()
    :param task_sampler: provides sample(n_monte=...) -> list of tasks
    :param train_iter: training batch iterator
    :param valid_iter: validation batch iterator
    :param device: device to run on
    :param config: experiment config dict (TRAIN.* keys)
    :return: dict with per-epoch averaged "train" and "valid" metric dicts
    """
    writer = None
    since = time.time()
    # the (possibly newly drawn) seed is written back so the run is reproducible
    seed = set_seed(config["TRAIN"]["train_seed"])
    config["TRAIN"]["train_seed"] = seed
    with open(os.path.join(results_dir, "config.yaml"), "w") as f:
        yaml.dump(config, f)
    # metrics
    total_metrics = {
        "train": [],
        "valid": [],
    }
    # data iterators
    iters = {"train": train_iter, "valid": valid_iter}
    # training stuff
    optimizer = get_optimizer(model.parameters(), config["TRAIN"]["OPTIMIZER"])
    scheduler = get_scheduler(optimizer, config["TRAIN"]["SCHEDULER"])
    criterion = get_criterion(config["TRAIN"]["CRITERION"])
    # training
    for epoch in range(config["TRAIN"]["num_epochs"]):
        print("-" * 100)
        print("Iter Epoch {}/{}".format(epoch + 1, config["TRAIN"]["num_epochs"]))
        print("-" * 100)
        # per-batch values, averaged at the end of the epoch
        epoch_metrics = {
            "train": {
                "learning_rate": [],
                "losses_train": [],
                "accs_train": [],
            },
            "valid": {
                "losses_valid": [],
                "accs_valid": [],
            }
        }
        for phase in ["train", "valid"]:
            for iter_cpt, (x, y) in tqdm(enumerate(iters[phase]), ncols=100, total=len(iters[phase])):
                # perform an update
                if phase == "train":
                    model.train()
                    # Monte-Carlo sample of architectures for this batch
                    tasks = task_sampler.sample(n_monte=config["TRAIN"]["GRAPH_SAMPLER"]["n_monte"])
                    loss_t = None
                    accs_t = []
                    for task in tasks:
                        # forward
                        x_t, y_t = x.to(device), y.to(device)
                        preds_t = model.forward(x_t, task)
                        # computing gradient: losses are averaged over the n_monte tasks
                        if loss_t is None:
                            loss_t = criterion(preds_t, y_t) / config["TRAIN"]["GRAPH_SAMPLER"]["n_monte"]
                        else:
                            loss_t += criterion(preds_t, y_t) / config["TRAIN"]["GRAPH_SAMPLER"]["n_monte"]
                        # saving accuracies
                        accs_t.append(np.mean((torch.max(preds_t, dim=1)[1] == y_t).cpu().numpy()))
                    # update: one backward/step for the averaged multi-task loss
                    loss_t.backward()
                    optimizer.step()
                    # NOTE(review): scheduler.step(epoch) is called once per
                    # *batch* with the epoch index — confirm the scheduler is
                    # meant to be epoch-indexed here.
                    scheduler.step(epoch)
                    # none_grad() is a project-specific method on the super-net;
                    # it stands in for optimizer.zero_grad() here.
                    model.none_grad()
                    # adding metrics
                    epoch_metrics[phase]["learning_rate"].append(scheduler.get_lr())
                    epoch_metrics[phase]["losses_train"].append(loss_t.item())
                    epoch_metrics[phase]["accs_train"].append(np.mean(accs_t))
                elif config["TRAIN"]["perform_valid"]:
                    model.eval()
                    # single sampled task per validation batch
                    task = task_sampler.sample()[0]
                    # forward
                    x_v, y_v = x.to(device), y.to(device)
                    with torch.no_grad():
                        preds_v = model.forward(x_v, task)
                        loss_v = criterion(preds_v, y_v)
                    # adding metrics
                    epoch_metrics[phase]["losses_valid"].append(loss_v.item())
                    epoch_metrics[phase]["accs_valid"].append(
                        np.mean((torch.max(preds_v, dim=1)[1] == y_v).cpu().numpy()))
                else:
                    # validation disabled: skip the whole valid phase
                    break
        # average metrics over epoch (lists are replaced by their mean, or None if empty)
        to_print = "\n"
        for phase in ["train", "valid"]:
            to_print += phase.upper() + ":\n"
            for key in epoch_metrics[phase].keys():
                if len(epoch_metrics[phase][key]) > 0:
                    epoch_metrics[phase][key] = np.mean(epoch_metrics[phase][key])
                    to_print += "{}: {:.4f}".format(key, epoch_metrics[phase][key]) + "\n"
                else:
                    epoch_metrics[phase][key] = None
            total_metrics[phase].append(epoch_metrics[phase])
        to_print += "\n"
        # tensorboard integration to plot nice curves (writer created lazily)
        if config["TRAIN"]["use_tensorboard"]:
            if config["TRAIN"]["use_tensorboard"] and writer is None:
                writer = SummaryWriter(results_dir)
            for phase in ["train", "valid"]:
                for key, value in epoch_metrics[phase].items():
                    if value is not None:
                        writer.add_scalar(phase + "/" + key, value, epoch)
        time_elapsed = time.time() - since
        print(to_print + "Time Elapsed: {:.0f}m {:.0f}s".format(time_elapsed // 60, time_elapsed % 60))
        # save everything
        if config["TRAIN"]["save"] and ((epoch + 1) % config["TRAIN"]["save_period"] == 0):
            # saving model
            weights_path = os.path.join(results_dir,
                                        "model_weights_epoch_{0}_of_{1}.pth".
                                        format(epoch + 1, config["TRAIN"]["num_epochs"]))
            torch.save(model.state_dict(), weights_path)
            # saving stuff to retrieve
            with open(os.path.join(results_dir, "total_metrics.pkl"), "wb") as handle:
                pickle.dump(total_metrics, handle, protocol=pickle.HIGHEST_PROTOCOL)
    time_elapsed = time.time() - since
    print("Training complete in {:.0f}m {:.0f}s".format(time_elapsed // 60, time_elapsed % 60))
    return total_metrics
def main():
    """Train and/or test a GCN on the configured transductive dataset."""
    config = utils.parse_args()
    if config['cuda'] and torch.cuda.is_available():
        device = 'cuda:0'
    else:
        device = 'cpu'

    dataset_args = (config['task'], config['dataset'], config['dataset_path'],
                    config['num_layers'], config['self_loop'],
                    config['normalize_adj'])
    dataset = utils.get_dataset(dataset_args)
    input_dim, output_dim = dataset.get_dims()
    adj, features, labels, idx_train, idx_val, idx_test = dataset.get_data()
    x = features
    y_train = labels[idx_train]
    y_val = labels[idx_val]
    y_test = labels[idx_test]

    model = models.GCN(input_dim, config['hidden_dims'], output_dim,
                       config['dropout'])
    model.to(device)

    if not config['load']:
        criterion = utils.get_criterion(config['task'])
        optimizer = optim.Adam(model.parameters(),
                               lr=config['lr'],
                               weight_decay=config['weight_decay'])
        epochs = config['epochs']
        model.train()
        print('--------------------------------')
        print('Training.')
        for epoch in range(epochs):
            optimizer.zero_grad()
            # full-batch transductive training: forward the whole graph,
            # compute the loss only on the training indices
            scores = model(x, adj)[idx_train]
            loss = criterion(scores, y_train)
            loss.backward()
            optimizer.step()
            predictions = torch.max(scores, dim=1)[1]
            num_correct = torch.sum(predictions == y_train).item()
            accuracy = num_correct / len(y_train)
            print(' Training epoch: {}, loss: {:.3f}, accuracy: {:.2f}'.
                  format(epoch + 1, loss.item(), accuracy))
        print('Finished training.')
        print('--------------------------------')
        if config['save']:
            print('--------------------------------')
            directory = os.path.join(os.path.dirname(os.getcwd()),
                                     'trained_models')
            if not os.path.exists(directory):
                os.makedirs(directory)
            fname = utils.get_fname(config)
            path = os.path.join(directory, fname)
            print('Saving model at {}'.format(path))
            torch.save(model.state_dict(), path)
            print('Finished saving model.')
            print('--------------------------------')

    if config['load']:
        directory = os.path.join(os.path.dirname(os.getcwd()),
                                 'trained_models')
        fname = utils.get_fname(config)
        path = os.path.join(directory, fname)
        model.load_state_dict(torch.load(path))
        model.eval()
        print('--------------------------------')
        print('Testing.')
        # FIX: evaluation forward pass wrapped in torch.no_grad() — it was
        # building an autograd graph for the whole-graph forward for no reason.
        with torch.no_grad():
            scores = model(x, adj)[idx_test]
            predictions = torch.max(scores, dim=1)[1]
            num_correct = torch.sum(predictions == y_test).item()
        accuracy = num_correct / len(y_test)
        print(' Test accuracy: {}'.format(accuracy))
        print('Finished testing.')
        print('--------------------------------')
# Validate the requested task and resolve its data processor.
if args.task_name not in processors:
    raise ValueError('Task not found: %s' % (args.task_name))
processor = processors[args.task_name]()

args.model_type = args.model_type.lower()
config_class, model_class, tokenizer_class = MODEL_CLASSES[args.model_type]

print('Training/evaluation parameters %s' % str(args))

if args.do_train:
    config = config_class.from_pretrained(
        args.config_name or args.model_name_or_path,
        finetuning_task=args.task_name,
    )
    tokenizer = get_tokenizer(args.model_type,
                              args.tokenizer_name or args.model_name_or_path)
    criterion = get_criterion(args.model_type, tokenizer)
    print(f'*** Criterion ignore_index = {criterion.ignore_index} ***')
    criterion.to(args.device)

    # model is built from config only (randomly initialized, not from_pretrained)
    model = model_class(config=config)
    if args.model_type == 'gpt2':
        # The tokenizer may define more tokens than the GPT-2 config's vocab;
        # grow the input embedding (and LM head) to match, copying the existing
        # rows and leaving the new rows at their default initialization.
        origin_vocab_size = model.config.vocab_size
        new_vocab_size = tokenizer.vocab_size
        embed_size = model.config.n_embd
        if origin_vocab_size < new_vocab_size:
            print(f'***** Adjusting gpt2 embedding *****')
            wte = torch.nn.Embedding(new_vocab_size, embed_size)
            wte.weight.data[:origin_vocab_size].copy_(model.transformer.wte.weight.data)
            print(f'replace wte: ({model.transformer.wte.weight.data.size()}) -> ({wte.weight.data.size()})')
            model.transformer.wte = wte
            # new LM head sized for the enlarged vocab; weights are assigned
            # further down (continuation is outside this excerpt)
            lm_head = torch.nn.Linear(embed_size, new_vocab_size, bias=False)
def main():
    """Train and evaluate a GraphSAGE / GAT model for link prediction.

    Behavior is driven by the parsed config:
      * ``load == False``: train on the 'train' split (optionally plotting the
        loss to visdom), optionally save the weights, report train ROC-AUC
        before and after training, pick a score threshold from the ROC curve,
        and print a classification report on the train set.
      * ``load == True``: restore saved weights and run only the evaluation
        passes on the 'val' and 'test' splits.
    """
    # Set up arguments for datasets, models and training.
    config = utils.parse_args()
    # One aggregation layer per hidden dim, plus the output layer.
    config['num_layers'] = len(config['hidden_dims']) + 1

    if config['cuda'] and torch.cuda.is_available():
        device = 'cuda:0'
    else:
        device = 'cpu'
    config['device'] = device

    # Get the dataset, dataloader and model.
    dataset_args = (config['task'], config['dataset'], config['dataset_path'],
                    config['generate_neg_examples'], 'train',
                    config['duplicate_examples'], config['repeat_examples'],
                    config['num_layers'], config['self_loop'],
                    config['normalize_adj'])
    dataset = utils.get_dataset(dataset_args)
    loader = DataLoader(dataset=dataset, batch_size=config['batch_size'],
                        shuffle=True, collate_fn=dataset.collate_wrapper)
    input_dim, output_dim = dataset.get_dims()
    if config['model'] == 'GraphSAGE':
        agg_class = utils.get_agg_class(config['agg_class'])
        model = models.GraphSAGE(input_dim, config['hidden_dims'],
                                 output_dim, config['dropout'],
                                 agg_class, config['num_samples'],
                                 config['device'])
    else:
        model = models.GAT(input_dim, config['hidden_dims'],
                           output_dim, config['num_heads'],
                           config['dropout'], config['device'])
    model.apply(models.init_weights)
    model.to(config['device'])
    print(model)

    # Compute ROC-AUC score for the untrained model.
    if not config['load']:
        print('--------------------------------')
        print(
            'Computing ROC-AUC score for the training dataset before training.'
        )
        y_true, y_scores = [], []
        num_batches = int(ceil(len(dataset) / config['batch_size']))
        with torch.no_grad():
            for (idx, batch) in enumerate(loader):
                edges, features, node_layers, mappings, rows, labels = batch
                features, labels = features.to(device), labels.to(device)
                out = model(features, node_layers, mappings, rows)
                # Score every node pair via embedding dot products, then pick
                # out the scores of this batch's candidate edges.
                all_pairs = torch.mm(out, out.t())
                scores = all_pairs[edges.T]
                y_true.extend(labels.detach().cpu().numpy())
                y_scores.extend(scores.detach().cpu().numpy())
                print('    Batch {} / {}'.format(idx + 1, num_batches))
        y_true = np.array(y_true).flatten()
        y_scores = np.array(y_scores).flatten()
        area = roc_auc_score(y_true, y_scores)
        print('ROC-AUC score: {:.4f}'.format(area))
        print('--------------------------------')

    # Train.
    if not config['load']:
        use_visdom = config['visdom']
        if use_visdom:
            vis = visdom.Visdom()
            loss_window = None
        criterion = utils.get_criterion(config['task'])
        optimizer = optim.Adam(model.parameters(), lr=config['lr'],
                               weight_decay=config['weight_decay'])
        epochs = config['epochs']
        stats_per_batch = config['stats_per_batch']
        num_batches = int(ceil(len(dataset) / config['batch_size']))
        # scheduler = optim.lr_scheduler.StepLR(optimizer, step_size=150, gamma=0.8)
        scheduler = optim.lr_scheduler.MultiStepLR(optimizer,
                                                   milestones=[300, 600],
                                                   gamma=0.5)
        model.train()
        print('--------------------------------')
        print('Training.')
        for epoch in range(epochs):
            print('Epoch {} / {}'.format(epoch + 1, epochs))
            running_loss = 0.0
            for (idx, batch) in enumerate(loader):
                edges, features, node_layers, mappings, rows, labels = batch
                features, labels = features.to(device), labels.to(device)
                optimizer.zero_grad()
                out = model(features, node_layers, mappings, rows)
                all_pairs = torch.mm(out, out.t())
                scores = all_pairs[edges.T]
                # Labels are cast to float because the criterion scores raw
                # edge logits against {0, 1} targets.
                loss = criterion(scores, labels.float())
                loss.backward()
                optimizer.step()
                with torch.no_grad():
                    running_loss += loss.item()
                    # Report a windowed average loss every stats_per_batch batches.
                    if (idx + 1) % stats_per_batch == 0:
                        running_loss /= stats_per_batch
                        print('    Batch {} / {}: loss {:.4f}'.format(
                            idx + 1, num_batches, running_loss))
                        # Per-batch ROC-AUC is only defined when both classes
                        # are present in the batch.
                        if (torch.sum(labels.long() == 0).item() > 0) and (torch.sum(labels.long() == 1).item() > 0):
                            area = roc_auc_score(labels.detach().cpu().numpy(),
                                                 scores.detach().cpu().numpy())
                            print('    ROC-AUC score: {:.4f}'.format(area))
                        running_loss = 0.0
                        # NOTE(review): num_correct / num_examples are reset
                        # here but never accumulated in the training loop —
                        # this looks like dead code copied from the eval
                        # loops below; verify before removing.
                        num_correct, num_examples = 0, 0
                if use_visdom:
                    # Lazily create the loss window on the first batch, then
                    # append one point per batch.
                    if loss_window is None:
                        loss_window = vis.line(Y=[loss.item()],
                                               X=[epoch * num_batches + idx],
                                               opts=dict(xlabel='batch',
                                                         ylabel='Loss',
                                                         title='Training Loss',
                                                         legend=['Loss']))
                    else:
                        vis.line([loss.item()], [epoch * num_batches + idx],
                                 win=loss_window, update='append')
            # LR decays by 0.5 at epochs 300 and 600 (stepped per epoch).
            scheduler.step()
        if use_visdom:
            vis.close(win=loss_window)
        print('Finished training.')
        print('--------------------------------')

    if not config['load']:
        if config['save']:
            print('--------------------------------')
            # Checkpoints live in <parent-of-cwd>/trained_models, named by
            # utils.get_fname(config) — the same path the load branch reads.
            directory = os.path.join(os.path.dirname(os.getcwd()),
                                     'trained_models')
            if not os.path.exists(directory):
                os.makedirs(directory)
            fname = utils.get_fname(config)
            path = os.path.join(directory, fname)
            print('Saving model at {}'.format(path))
            torch.save(model.state_dict(), path)
            print('Finished saving model.')
            print('--------------------------------')

    # Compute ROC-AUC score after training.
    if not config['load']:
        print('--------------------------------')
        print(
            'Computing ROC-AUC score for the training dataset after training.'
        )
        y_true, y_scores = [], []
        num_batches = int(ceil(len(dataset) / config['batch_size']))
        with torch.no_grad():
            for (idx, batch) in enumerate(loader):
                edges, features, node_layers, mappings, rows, labels = batch
                features, labels = features.to(device), labels.to(device)
                out = model(features, node_layers, mappings, rows)
                all_pairs = torch.mm(out, out.t())
                scores = all_pairs[edges.T]
                y_true.extend(labels.detach().cpu().numpy())
                y_scores.extend(scores.detach().cpu().numpy())
                print('    Batch {} / {}'.format(idx + 1, num_batches))
        y_true = np.array(y_true).flatten()
        y_scores = np.array(y_scores).flatten()
        area = roc_auc_score(y_true, y_scores)
        print('ROC-AUC score: {:.4f}'.format(area))
        print('--------------------------------')

    # Plot the true positive rate and true negative rate vs threshold.
    if not config['load']:
        # NOTE(review): sklearn's roc_curve returns (fpr, tpr, thresholds);
        # this unpacking assigns them in the opposite order, so the variable
        # named `tpr` actually holds the FPR curve (and vice versa). The
        # threshold selection below inherits the swap — verify intent.
        tpr, fpr, thresholds = roc_curve(y_true, y_scores)
        tnr = 1 - fpr
        plt.plot(thresholds, tpr, label='tpr')
        plt.plot(thresholds, tnr, label='tnr')
        plt.xlabel('Threshold')
        plt.title('TPR / TNR vs Threshold')
        plt.legend()
        # Blocking call: the script pauses until the plot window is closed.
        plt.show()

        # Choose an appropriate threshold and generate classification report
        # on the train set.
        idx1 = np.where(tpr <= tnr)[0]
        idx2 = np.where(tpr >= tnr)[0]  # NOTE(review): computed but unused.
        # Last threshold (thresholds are sorted decreasing) where tpr <= tnr,
        # i.e. roughly the crossover point of the two curves.
        t = thresholds[idx1[-1]]
        total_correct, total_examples = 0, 0
        y_true, y_pred = [], []
        num_batches = int(ceil(len(dataset) / config['batch_size']))
        with torch.no_grad():
            for (idx, batch) in enumerate(loader):
                edges, features, node_layers, mappings, rows, labels = batch
                features, labels = features.to(device), labels.to(device)
                out = model(features, node_layers, mappings, rows)
                all_pairs = torch.mm(out, out.t())
                scores = all_pairs[edges.T]
                # Binarize raw edge scores at the chosen threshold.
                predictions = (scores >= t).long()
                y_true.extend(labels.detach().cpu().numpy())
                y_pred.extend(predictions.detach().cpu().numpy())
                total_correct += torch.sum(predictions == labels.long()).item()
                total_examples += len(labels)
                print('    Batch {} / {}'.format(idx + 1, num_batches))
        print('Threshold: {:.4f}, accuracy: {:.4f}'.format(
            t, total_correct / total_examples))
        y_true = np.array(y_true).flatten()
        y_pred = np.array(y_pred).flatten()
        report = classification_report(y_true, y_pred)
        print('Classification report\n', report)

    # Evaluate on the validation set.
    if config['load']:
        directory = os.path.join(os.path.dirname(os.getcwd()),
                                 'trained_models')
        fname = utils.get_fname(config)
        path = os.path.join(directory, fname)
        model.load_state_dict(torch.load(path))
    # Rebuild dataset/loader for the 'val' split (no shuffling for eval).
    dataset_args = (config['task'], config['dataset'], config['dataset_path'],
                    config['generate_neg_examples'], 'val',
                    config['duplicate_examples'], config['repeat_examples'],
                    config['num_layers'], config['self_loop'],
                    config['normalize_adj'])
    dataset = utils.get_dataset(dataset_args)
    loader = DataLoader(dataset=dataset, batch_size=config['batch_size'],
                        shuffle=False, collate_fn=dataset.collate_wrapper)
    criterion = utils.get_criterion(config['task'])
    stats_per_batch = config['stats_per_batch']
    num_batches = int(ceil(len(dataset) / config['batch_size']))
    model.eval()
    print('--------------------------------')
    print(
        'Computing ROC-AUC score for the validation dataset after training.'
    )
    running_loss, total_loss = 0.0, 0.0
    num_correct, num_examples = 0, 0
    total_correct, total_examples = 0, 0
    y_true, y_scores, y_pred = [], [], []
    for (idx, batch) in enumerate(loader):
        edges, features, node_layers, mappings, rows, labels = batch
        features, labels = features.to(device), labels.to(device)
        out = model(features, node_layers, mappings, rows)
        all_pairs = torch.mm(out, out.t())
        scores = all_pairs[edges.T]
        loss = criterion(scores, labels.float())
        running_loss += loss.item()
        total_loss += loss.item()
        # NOTE(review): `t` is only assigned in the `not config['load']`
        # branch above, so running with load=True reaches this line with `t`
        # undefined (NameError). The threshold would need to be recomputed or
        # persisted with the checkpoint — confirm and fix upstream.
        predictions = (scores >= t).long()
        num_correct += torch.sum(predictions == labels.long()).item()
        total_correct += torch.sum(predictions == labels.long()).item()
        num_examples += len(labels)
        total_examples += len(labels)
        y_true.extend(labels.detach().cpu().numpy())
        y_scores.extend(scores.detach().cpu().numpy())
        y_pred.extend(predictions.detach().cpu().numpy())
        # Windowed stats: running counters reset every stats_per_batch
        # batches; total_* counters keep accumulating for the final summary.
        if (idx + 1) % stats_per_batch == 0:
            running_loss /= stats_per_batch
            accuracy = num_correct / num_examples
            print('    Batch {} / {}: loss {:.4f}, accuracy {:.4f}'.format(
                idx + 1, num_batches, running_loss, accuracy))
            if (torch.sum(labels.long() == 0).item() > 0) and (torch.sum(labels.long() == 1).item() > 0):
                area = roc_auc_score(labels.detach().cpu().numpy(),
                                     scores.detach().cpu().numpy())
                print('    ROC-AUC score: {:.4f}'.format(area))
            running_loss = 0.0
            num_correct, num_examples = 0, 0
    total_loss /= num_batches
    total_accuracy = total_correct / total_examples
    print('Loss {:.4f}, accuracy {:.4f}'.format(total_loss, total_accuracy))
    y_true = np.array(y_true).flatten()
    y_scores = np.array(y_scores).flatten()
    y_pred = np.array(y_pred).flatten()
    report = classification_report(y_true, y_pred)
    area = roc_auc_score(y_true, y_scores)
    print('ROC-AUC score: {:.4f}'.format(area))
    print('Classification report\n', report)
    print('Finished validating.')
    print('--------------------------------')

    # Evaluate on test set.
    if config['load']:
        directory = os.path.join(os.path.dirname(os.getcwd()),
                                 'trained_models')
        fname = utils.get_fname(config)
        path = os.path.join(directory, fname)
        model.load_state_dict(torch.load(path))
    # Rebuild dataset/loader for the 'test' split; the loop below mirrors the
    # validation pass exactly.
    dataset_args = (config['task'], config['dataset'], config['dataset_path'],
                    config['generate_neg_examples'], 'test',
                    config['duplicate_examples'], config['repeat_examples'],
                    config['num_layers'], config['self_loop'],
                    config['normalize_adj'])
    dataset = utils.get_dataset(dataset_args)
    loader = DataLoader(dataset=dataset, batch_size=config['batch_size'],
                        shuffle=False, collate_fn=dataset.collate_wrapper)
    criterion = utils.get_criterion(config['task'])
    stats_per_batch = config['stats_per_batch']
    num_batches = int(ceil(len(dataset) / config['batch_size']))
    model.eval()
    print('--------------------------------')
    print('Computing ROC-AUC score for the test dataset after training.')
    running_loss, total_loss = 0.0, 0.0
    num_correct, num_examples = 0, 0
    total_correct, total_examples = 0, 0
    y_true, y_scores, y_pred = [], [], []
    for (idx, batch) in enumerate(loader):
        edges, features, node_layers, mappings, rows, labels = batch
        features, labels = features.to(device), labels.to(device)
        out = model(features, node_layers, mappings, rows)
        all_pairs = torch.mm(out, out.t())
        scores = all_pairs[edges.T]
        loss = criterion(scores, labels.float())
        running_loss += loss.item()
        total_loss += loss.item()
        # NOTE(review): same undefined-`t` hazard as the validation pass when
        # load=True — see note above.
        predictions = (scores >= t).long()
        num_correct += torch.sum(predictions == labels.long()).item()
        total_correct += torch.sum(predictions == labels.long()).item()
        num_examples += len(labels)
        total_examples += len(labels)
        y_true.extend(labels.detach().cpu().numpy())
        y_scores.extend(scores.detach().cpu().numpy())
        y_pred.extend(predictions.detach().cpu().numpy())
        if (idx + 1) % stats_per_batch == 0:
            running_loss /= stats_per_batch
            accuracy = num_correct / num_examples
            print('    Batch {} / {}: loss {:.4f}, accuracy {:.4f}'.format(
                idx + 1, num_batches, running_loss, accuracy))
            if (torch.sum(labels.long() == 0).item() > 0) and (torch.sum(labels.long() == 1).item() > 0):
                area = roc_auc_score(labels.detach().cpu().numpy(),
                                     scores.detach().cpu().numpy())
                print('    ROC-AUC score: {:.4f}'.format(area))
            running_loss = 0.0
            num_correct, num_examples = 0, 0
    total_loss /= num_batches
    total_accuracy = total_correct / total_examples
    print('Loss {:.4f}, accuracy {:.4f}'.format(total_loss, total_accuracy))
    y_true = np.array(y_true).flatten()
    y_scores = np.array(y_scores).flatten()
    y_pred = np.array(y_pred).flatten()
    report = classification_report(y_true, y_pred)
    area = roc_auc_score(y_true, y_scores)
    print('ROC-AUC score: {:.4f}'.format(area))
    print('Classification report\n', report)
    print('Finished testing.')
    print('--------------------------------')
logger = comet_ml.Experiment( api_key=api_key, project_name="sim_real", auto_metric_logging=True, auto_param_logging=True, ) if args.mixed_precision: print("Applied: Mixed Precision") tf.keras.mixed_precision.set_global_policy("mixed_float16") train_ds, test_ds = get_dataset(args) grid = image_grid(next(iter(train_ds))[0])[0] logger.log_image(grid.numpy()) model = get_model(args) criterion = get_criterion(args) optimizer = get_optimizer(args) lr_scheduler = get_lr_scheduler(args) early_stop = tf.keras.callbacks.EarlyStopping(monitor='val_accuracy', mode='max', patience=args.patience, restore_best_weights=True) experiment_name = get_experiment_name(args) logger.set_name(experiment_name) logger.log_parameters(vars(args)) with logger.train(): filename =f'{args.model_name}.hdf5' checkpoint = tf.keras.callbacks.ModelCheckpoint(filename, monitor='val_accuracy', mode='max', save_best_only=True, verbose=True) model.compile(loss=criterion, optimizer=optimizer, metrics=['accuracy']) if args.dry_run: print("[INFO] Turn off all callbacks") model.fit(train_ds, validation_data=test_ds, epochs=args.epochs, steps_per_epoch=2) else:
logger.info('n_params: {}'.format(n_params)) logger.info('first layer weight norm: {}'.format(norm_check)) if run_config['fp16'] and not run_config['use_amp']: model.half() for layer in model.modules(): if isinstance(layer, nn.BatchNorm2d): layer.float() device = run_config['device'] if device is not 'cpu' and torch.cuda.device_count() > 1: model = nn.DataParallel(model) model.to(device) logger.info('Done') train_criterion, test_criterion = utils.get_criterion( config['data_config']) # create optimizer if optim_config['no_weight_decay_on_bn']: params = [ { 'params': [ param for name, param in model.named_parameters() if 'bn' not in name ] }, { 'params': [ param for name, param in model.named_parameters() if 'bn' in name ],
config = utils.load_config(args.config) global_params = config["globals"] utils.set_seed(global_params["seed"]) device = utils.get_device(global_params) output_dir = global_params["output_dir"] data_conf = config["data"] if args.generate: for c in data_conf.values(): utils.generate_data(c) model = models.get_model(config).to() criterion = utils.get_criterion(config) optimizer = utils.get_optimizer(model, config) scheduler = utils.get_scheduler(optimizer, config) loaders = { phase: utils.get_loader(config, phase) for phase in ["train", "valid3", "valid", "valid12"] } runner = SupervisedRunner(device=device, input_key=["objects", "externals", "triplet"], input_target_key="targets") runner.train(model=model, criterion=criterion, optimizer=optimizer, loaders=loaders, scheduler=scheduler,