def predict(self):
    # Data
    valid_loader = self.dataset.get_data_loaders()

    # Model
    model = ResNetSimCLR(**self.config["model"])
    if self.device == 'cuda':
        model = nn.DataParallel(
            model,
            device_ids=[i for i in range(self.config['gpu']['gpunum'])])
        model = model.cuda()
    model = self._load_pre_trained_weights(model)

    # validate the model if requested
    self._validate(model, valid_loader)
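# Sketch of the _load_pre_trained_weights helper used above; its body is not
# shown in the original snippet. It assumes checkpoints follow the same
# './runs/<run>/checkpoints/model.pth' layout used elsewhere in this code, and
# the 'fine_tune_from' config key is an assumption, not the original name.
def _load_pre_trained_weights(self, model):
    try:
        checkpoints_folder = os.path.join('./runs',
                                          self.config['fine_tune_from'],
                                          'checkpoints')
        state_dict = torch.load(os.path.join(checkpoints_folder, 'model.pth'),
                                map_location=self.device)
        model.load_state_dict(state_dict)
        print("Loaded pre-trained model with success.")
    except FileNotFoundError:
        print("Pre-trained weights not found. Training from scratch.")
    return model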
def main_worker(gpu, ngpus_per_node, args):
    args.gpu = gpu

    # suppress printing if not master
    if args.multiprocessing_distributed and args.gpu != 0:
        def print_pass(*args):
            pass
        builtins.print = print_pass

    if args.gpu is not None:
        print("Use GPU: {} for training".format(args.gpu))

    if args.distributed:
        if args.dist_url == "env://" and args.rank == -1:
            args.rank = int(os.environ["RANK"])
        if args.multiprocessing_distributed:
            # For multiprocessing distributed training, rank needs to be the
            # global rank among all the processes
            args.rank = args.rank * ngpus_per_node + gpu
        dist.init_process_group(backend=args.dist_backend,
                                init_method=args.dist_url,
                                world_size=args.world_size,
                                rank=args.rank)

    # create model
    print("=> creating model '{}'".format(args.arch))
    model = ResNetSimCLR(base_model=args.arch, out_dim=args.out_dim).to(args.gpu)

    if args.distributed:
        # For multiprocessing distributed, the DistributedDataParallel
        # constructor should always set the single device scope; otherwise
        # DistributedDataParallel will use all available devices.
        if args.gpu is not None:
            torch.cuda.set_device(args.gpu)
            model.cuda(args.gpu)
            # When using a single GPU per process and per
            # DistributedDataParallel, we need to divide the batch size
            # ourselves based on the total number of GPUs we have
            args.batch_size = int(args.batch_size / ngpus_per_node)
            args.num_workers = int(
                (args.num_workers + ngpus_per_node - 1) / ngpus_per_node)
            model = nn.SyncBatchNorm.convert_sync_batchnorm(model)
            model = torch.nn.parallel.DistributedDataParallel(
                model, device_ids=[args.gpu])
        else:
            model.cuda()
            # DistributedDataParallel will divide and allocate batch_size to
            # all available GPUs if device_ids are not set
            model = torch.nn.parallel.DistributedDataParallel(model)
    elif args.gpu is not None:
        torch.cuda.set_device(args.gpu)
        model = model.cuda(args.gpu)

    # Data loader
    train_loader, train_sampler = data_loader(args.dataset,
                                              args.data_path,
                                              args.batch_size,
                                              args.num_workers,
                                              download=args.download,
                                              distributed=args.distributed,
                                              supervised=False)

    # optimizer = torch.optim.Adam(model.parameters(), 3e-4, weight_decay=args.weight_decay)
    optimizer = torch.optim.SGD(model.parameters(), lr=args.lr,
                                momentum=0.9, weight_decay=args.weight_decay)
    scheduler = torch.optim.lr_scheduler.CosineAnnealingLR(
        optimizer, T_max=args.epochs, eta_min=0, last_epoch=-1)
    criterion = NTXentLoss(args.gpu, args.batch_size,
                           args.temperature, True).cuda(args.gpu)

    # optionally resume from a checkpoint
    if args.resume:
        if os.path.isfile(args.resume):
            print("=> loading checkpoint '{}'".format(args.resume))
            if args.gpu is None:
                checkpoint = torch.load(args.resume)
            else:
                # map the checkpoint to the GPU owned by this process
                loc = 'cuda:{}'.format(args.gpu)
                checkpoint = torch.load(args.resume, map_location=loc)
            args.start_epoch = checkpoint['epoch']
            model.load_state_dict(checkpoint['state_dict'])
            optimizer.load_state_dict(checkpoint['optimizer'])
            print("=> loaded checkpoint '{}' (epoch {})".format(
                args.resume, checkpoint['epoch']))
        else:
            print("=> no checkpoint found at '{}'".format(args.resume))

    if apex_support and args.fp16_precision:
        model, optimizer = amp.initialize(model, optimizer,
                                          opt_level='O2',
                                          keep_batchnorm_fp32=True)

    cudnn.benchmark = True

    train(model, train_loader, train_sampler, criterion, optimizer,
          scheduler, args, ngpus_per_node)
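# A minimal launcher sketch (not part of the original code): main_worker's
# (gpu, ngpus_per_node, args) signature matches torch.multiprocessing.spawn's
# calling convention, so a typical entry point could look like the following.
# Only args fields already referenced by main_worker are used here; the exact
# argument handling in the real script may differ.
def main(args):
    ngpus_per_node = torch.cuda.device_count()
    # main_worker branches on args.distributed, so derive it from the flags
    args.distributed = args.world_size > 1 or args.multiprocessing_distributed
    if args.multiprocessing_distributed:
        # one process per GPU: the effective world size is
        # (#GPUs per node) * (#nodes passed in as world_size)
        args.world_size = ngpus_per_node * args.world_size
        torch.multiprocessing.spawn(main_worker,
                                    args=(ngpus_per_node, args),
                                    nprocs=ngpus_per_node)
    else:
        # single-process (optionally single-GPU) path
        main_worker(args.gpu, ngpus_per_node, args)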
# model = Encoder(out_dim=out_dim)
model = ResNetSimCLR(base_model=config["base_convnet"], out_dim=out_dim)

if config['continue_training']:
    checkpoints_folder = os.path.join('./runs', config['continue_training'],
                                      'checkpoints')
    state_dict = torch.load(os.path.join(checkpoints_folder, 'model.pth'))
    model.load_state_dict(state_dict)
    print("Loaded pre-trained model with success.")

train_gpu = torch.cuda.is_available()
print("Is gpu available:", train_gpu)

# move the model parameters to the GPU
if train_gpu:
    model = model.cuda()

criterion = torch.nn.CrossEntropyLoss(reduction='sum')
optimizer = optim.Adam(model.parameters(), 3e-4)

train_writer = SummaryWriter()

similarity_func = get_similarity_function(use_cosine_similarity)

# boolean mask that removes each sample's similarity with itself
negative_mask = (1 - torch.eye(2 * batch_size)).type(torch.bool)

# one-hot labels marking, for each of the 2N augmented views, which of the
# remaining 2N-1 columns holds its positive (the other view of the same image)
labels = (np.eye((2 * batch_size), 2 * batch_size - 1, k=-batch_size) +
          np.eye((2 * batch_size), 2 * batch_size - 1,
                 k=batch_size - 1)).astype(np.int64)
labels = torch.from_numpy(labels)

softmax = torch.nn.Softmax(dim=-1)

if train_gpu:
    # (the original snippet is cut off here; presumably the constant tensors
    #  are moved to the GPU as well)
    negative_mask = negative_mask.cuda()
    labels = labels.cuda()
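# Illustration (not part of the original code): with batch_size = 2 the mask
# and label tensors built above have the structure shown below. The _demo_
# names are used so nothing here shadows the real variables.
_demo_bs = 2
_demo_mask = (1 - torch.eye(2 * _demo_bs)).type(torch.bool)
# -> 4x4 boolean matrix that is False only on the diagonal, i.e. each of the
#    2N views is compared against every view except itself.
_demo_labels = (np.eye(2 * _demo_bs, 2 * _demo_bs - 1, k=-_demo_bs) +
                np.eye(2 * _demo_bs, 2 * _demo_bs - 1, k=_demo_bs - 1))
# -> rows: [[0, 1, 0],   # view 0 of image 0: positive is view 1 of image 0
#           [0, 0, 1],   # view 0 of image 1: positive is view 1 of image 1
#           [1, 0, 0],   # view 1 of image 0: positive is view 0 of image 0
#           [0, 1, 0]]   # view 1 of image 1: positive is view 0 of image 1
#    i.e. row i marks which of the 2N-1 non-self columns is the positive.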
train_loader = DataLoader(train_dataset, batch_size=batch_size,
                          num_workers=config['num_workers'],
                          drop_last=True, shuffle=True)

# model = Encoder(out_dim=out_dim)
model = ResNetSimCLR(base_model=config["base_convnet"], out_dim=out_dim)

train_gpu = torch.cuda.is_available()
print("Is gpu available:", train_gpu)

# move the model parameters to the GPU
if train_gpu:
    model.cuda()

criterion = torch.nn.CrossEntropyLoss(reduction='sum')
optimizer = optim.Adam(model.parameters(), 3e-4)

train_writer = SummaryWriter()

sim_func_dim1, sim_func_dim2 = get_similarity_function(use_cosine_similarity)

# Mask to remove positive examples from the batch of negative samples
negative_mask = get_negative_mask(batch_size)

n_iter = 0
for e in range(config['epochs']):
    for step, ((xis, xjs), _) in enumerate(train_loader):
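        # --- Sketch of an NT-Xent training step; the original snippet is cut
        # off at the loop header, so this body is an assumption, not the
        # author's code. It assumes: the model returns a (representation,
        # projection) pair; sim_func_dim1 scores each (zis[i], zjs[i]) pair;
        # sim_func_dim2 returns the full similarity matrix between its two
        # arguments; get_negative_mask(batch_size) returns a boolean
        # (batch_size, 2*batch_size) mask that is False at each view's own and
        # its positive's column; and 'temperature' is a config key.
        temperature = config['temperature']  # assumed config key

        optimizer.zero_grad()
        if train_gpu:
            xis, xjs = xis.cuda(), xjs.cuda()

        _, zis = model(xis)   # projections of the first augmented views
        _, zjs = model(xjs)   # projections of the second augmented views
        zis = torch.nn.functional.normalize(zis, dim=1)
        zjs = torch.nn.functional.normalize(zjs, dim=1)

        # positive logits: one similarity per image pair
        l_pos = sim_func_dim1(zis, zjs).view(batch_size, 1) / temperature

        negatives = torch.cat([zjs, zis], dim=0)
        loss = 0
        for positives in [zis, zjs]:
            l_neg = sim_func_dim2(positives, negatives)
            # drop the entries comparing a view with itself or its positive
            l_neg = l_neg[negative_mask].view(l_neg.shape[0], -1) / temperature
            logits = torch.cat([l_pos, l_neg], dim=1)
            # the positive sits in column 0, so the target class is always 0
            targets = torch.zeros(batch_size, dtype=torch.long,
                                  device=logits.device)
            loss += criterion(logits, targets)
        loss = loss / (2 * batch_size)

        loss.backward()
        optimizer.step()

        train_writer.add_scalar('train_loss', loss.item(), global_step=n_iter)
        n_iter += 1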
def train(self):
    # Data
    train_loader, valid_loader = self.dataset.get_data_loaders()

    # Model
    model = ResNetSimCLR(**self.config["model"])
    if self.device == 'cuda':
        model = nn.DataParallel(
            model,
            device_ids=[i for i in range(self.config['gpu']['gpunum'])])
        model = model.cuda()
    print(model)
    model = self._load_pre_trained_weights(model)

    each_epoch_steps = len(train_loader)
    total_steps = each_epoch_steps * self.config['train']['epochs']
    warmup_steps = each_epoch_steps * self.config['train']['warmup_epochs']
    scaled_lr = eval(self.config['train']['lr']) * self.batch_size / 256.

    optimizer = torch.optim.Adam(
        model.parameters(),
        scaled_lr,
        weight_decay=eval(self.config['train']['weight_decay']))
    '''
    optimizer = LARS(params=model.parameters(),
                     lr=eval(self.config['train']['lr']),
                     momentum=self.config['train']['momentum'],
                     weight_decay=eval(self.config['train']['weight_decay']),
                     eta=0.001,
                     max_epoch=self.config['train']['epochs'])
    '''

    # scheduler during the warmup stage: linearly ramp the LR up to scaled_lr
    lambda1 = lambda step: step * 1.0 / int(warmup_steps)
    scheduler = torch.optim.lr_scheduler.LambdaLR(optimizer, lr_lambda=lambda1)

    if apex_support and self.config['train']['fp16_precision']:
        model, optimizer = amp.initialize(model, optimizer,
                                          opt_level='O2',
                                          keep_batchnorm_fp32=True)

    model_checkpoints_folder = os.path.join(self.writer.log_dir, 'checkpoints')

    # save config file
    _save_config_file(model_checkpoints_folder)

    n_iter = 0
    valid_n_iter = 0
    best_valid_loss = np.inf
    lr = eval(self.config['train']['lr'])

    end = time.time()
    batch_time = AverageMeter()
    data_time = AverageMeter()
    losses = AverageMeter()

    for epoch_counter in range(self.config['train']['epochs']):
        model.train()
        for i, ((xis, xjs), _) in enumerate(train_loader):
            data_time.update(time.time() - end)
            optimizer.zero_grad()

            xis = xis.cuda()
            xjs = xjs.cuda()

            loss = self._step(model, xis, xjs, n_iter)
            losses.update(loss.item(), 2 * xis.size(0))

            # measure elapsed time
            batch_time.update(time.time() - end)
            end = time.time()

            print('Epoch: [{epoch}][{step}/{each_epoch_steps}] '
                  'Loss {loss.val:.4f} Avg Loss {loss.avg:.4f} '
                  'DataTime {datatime.val:.4f} BatchTime {batchtime.val:.4f} '
                  'LR {lr}'.format(epoch=epoch_counter, step=i,
                                   each_epoch_steps=each_epoch_steps,
                                   loss=losses, datatime=data_time,
                                   batchtime=batch_time, lr=lr))

            if n_iter % self.config['train']['log_every_n_steps'] == 0:
                self.writer.add_scalar('train_loss', loss, global_step=n_iter)

            if apex_support and self.config['train']['fp16_precision']:
                with amp.scale_loss(loss, optimizer) as scaled_loss:
                    scaled_loss.backward()
            else:
                loss.backward()

            optimizer.step()
            n_iter += 1

            # adjust lr: switch to cosine decay once warmup is finished
            if n_iter == warmup_steps:
                scheduler = torch.optim.lr_scheduler.CosineAnnealingLR(
                    optimizer,
                    T_max=total_steps - warmup_steps,
                    eta_min=0,
                    last_epoch=-1)
            scheduler.step()
            lr = scheduler.get_last_lr()[0]
            self.writer.add_scalar('cosine_lr_decay', lr, global_step=n_iter)

            sys.stdout.flush()

        # validate the model if requested
        if epoch_counter % self.config['train']['eval_every_n_epochs'] == 0:
            valid_loss = self._validate(model, valid_loader)
            if valid_loss < best_valid_loss:
                # save the model weights
                best_valid_loss = valid_loss
                torch.save(model.state_dict(),
                           os.path.join(model_checkpoints_folder, 'model.pth'))

            self.writer.add_scalar('validation_loss', valid_loss,
                                   global_step=valid_n_iter)
            valid_n_iter += 1
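# Sketch of the _step helper referenced in train() above; its body is not
# shown in the original snippet. It assumes the model returns a
# (representation, projection) pair and that the object holds an NT-Xent
# criterion, e.g. self.nt_xent_criterion built from NTXentLoss analogous to
# the criterion in main_worker above; both are assumptions, not the original.
def _step(self, model, xis, xjs, n_iter):
    # forward both augmented views through the encoder and projection head
    ris, zis = model(xis)   # [N, C]
    rjs, zjs = model(xjs)   # [N, C]

    # normalize the projections before computing cosine similarities
    zis = torch.nn.functional.normalize(zis, dim=1)
    zjs = torch.nn.functional.normalize(zjs, dim=1)

    # contrastive NT-Xent loss between the two sets of projections
    loss = self.nt_xent_criterion(zis, zjs)
    return loss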