Example No. 1
    def predict(self):
        # Data
        valid_loader = self.dataset.get_data_loaders()

        # Model
        model = ResNetSimCLR(**self.config["model"])
        if self.device == 'cuda':
            model = nn.DataParallel(
                model,
                device_ids=list(range(self.config['gpu']['gpunum'])))
        # .to(self.device) keeps the CPU path working as well
        model = model.to(self.device)
        model = self._load_pre_trained_weights(model)

        # validate the model if requested
        self._validate(model, valid_loader)
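
The `_load_pre_trained_weights` helper is called but not shown here. A minimal sketch of what it might look like, assuming the `./runs/<run>/checkpoints/model.pth` layout used in Example No. 3 and a hypothetical `fine_tune_from` config key:

    def _load_pre_trained_weights(self, model):
        # Sketch only: the 'fine_tune_from' key and the checkpoint layout
        # are assumptions modeled on the loading code in Example No. 3.
        try:
            checkpoints_folder = os.path.join(
                './runs', self.config['fine_tune_from'], 'checkpoints')
            state_dict = torch.load(
                os.path.join(checkpoints_folder, 'model.pth'),
                map_location=self.device)
            model.load_state_dict(state_dict)
            print("Loaded pre-trained model with success.")
        except FileNotFoundError:
            print("Pre-trained weights not found. Training from scratch.")
        return model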
Example No. 2
def main_worker(gpu, ngpus_per_node, args):
    args.gpu = gpu

    # suppress printing if not master
    if args.multiprocessing_distributed and args.gpu != 0:

        def print_pass(*args):
            pass

        builtins.print = print_pass

    if args.gpu is not None:
        print("Use GPU: {} for training".format(args.gpu))

    if args.distributed:
        if args.dist_url == "env://" and args.rank == -1:
            args.rank = int(os.environ["RANK"])
        if args.multiprocessing_distributed:
            # For multiprocessing distributed training, rank needs to be the
            # global rank among all the processes
            args.rank = args.rank * ngpus_per_node + gpu
        dist.init_process_group(backend=args.dist_backend,
                                init_method=args.dist_url,
                                world_size=args.world_size,
                                rank=args.rank)
    # create model
    print("=> creating model '{}'".format(args.arch))

    model = ResNetSimCLR(base_model=args.arch,
                         out_dim=args.out_dim).to(args.gpu)

    if args.distributed:
        # For multiprocessing distributed, DistributedDataParallel constructor
        # should always set the single device scope, otherwise,
        # DistributedDataParallel will use all available devices.
        if args.gpu is not None:
            torch.cuda.set_device(args.gpu)
            model.cuda(args.gpu)
            # When using a single GPU per process and per
            # DistributedDataParallel, we need to divide the batch size
            # ourselves based on the total number of GPUs we have
            args.batch_size = int(args.batch_size / ngpus_per_node)
            args.num_workers = int(
                (args.num_workers + ngpus_per_node - 1) / ngpus_per_node)
            model = nn.SyncBatchNorm.convert_sync_batchnorm(model)
            model = torch.nn.parallel.DistributedDataParallel(
                model, device_ids=[args.gpu])
        else:
            model.cuda()
            # DistributedDataParallel will divide and allocate batch_size to all
            # available GPUs if device_ids are not set
            model = torch.nn.parallel.DistributedDataParallel(model)
    elif args.gpu is not None:
        torch.cuda.set_device(args.gpu)
        model = model.cuda(args.gpu)
        # The AllGather implementation (batch shuffle, queue update, etc.) in
        # this code only supports DistributedDataParallel; the raise below is
        # left commented out so the single-GPU path can be used for debugging.
        # raise NotImplementedError("Only DistributedDataParallel is supported.")

    # Data loader
    train_loader, train_sampler = data_loader(args.dataset,
                                              args.data_path,
                                              args.batch_size,
                                              args.num_workers,
                                              download=args.download,
                                              distributed=args.distributed,
                                              supervised=False)

    #optimizer = torch.optim.Adam(model.parameters(), 3e-4, weight_decay=args.weight_decay)
    optimizer = torch.optim.SGD(model.parameters(),
                                lr=args.lr,
                                momentum=0.9,
                                weight_decay=args.weight_decay)

    scheduler = torch.optim.lr_scheduler.CosineAnnealingLR(optimizer,
                                                           T_max=args.epochs,
                                                           eta_min=0,
                                                           last_epoch=-1)

    # NT-Xent contrastive loss; the trailing True flag presumably selects
    # cosine similarity (cf. use_cosine_similarity in Examples No. 3 and 4)
    criterion = NTXentLoss(args.gpu, args.batch_size, args.temperature,
                           True).cuda(args.gpu)

    if args.resume:
        if os.path.isfile(args.resume):
            print("=> loading checkpoint '{}'".format(args.resume))
            if args.gpu is None:
                checkpoint = torch.load(args.resume)
            else:
                loc = 'cuda:{}'.format(args.gpu)
                checkpoint = torch.load(args.resume, map_location=loc)
            args.start_epoch = checkpoint['epoch']
            model.load_state_dict(checkpoint['state_dict'])
            optimizer.load_state_dict(checkpoint['optimizer'])
            print("=> loaded checkpoint '{}' (epoch {})".format(
                args.resume, checkpoint['epoch']))
        else:
            print("=> no checkpoint found at '{}'".format(args.resume))

    if apex_support and args.fp16_precision:
        model, optimizer = amp.initialize(model,
                                          optimizer,
                                          opt_level='O2',
                                          keep_batchnorm_fp32=True)

    cudnn.benchmark = True

    train(model, train_loader, train_sampler, criterion, optimizer, scheduler,
          args, ngpus_per_node)
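
A `main_worker` like this is launched once per GPU. A minimal sketch of the launching side, assuming the standard `torch.multiprocessing.spawn` pattern and the same `args` namespace as above:

import torch.multiprocessing as mp


def main(args):
    ngpus_per_node = torch.cuda.device_count()
    if args.multiprocessing_distributed:
        # world_size becomes the total number of processes across all
        # nodes, one process per GPU
        args.world_size = ngpus_per_node * args.world_size
        # each spawned worker receives its local GPU index as the first
        # argument, followed by the tuple passed in args=
        mp.spawn(main_worker, nprocs=ngpus_per_node,
                 args=(ngpus_per_node, args))
    else:
        main_worker(args.gpu, ngpus_per_node, args)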
Example No. 3
# model = Encoder(out_dim=out_dim)
model = ResNetSimCLR(base_model=config["base_convnet"], out_dim=out_dim)

if config['continue_training']:
    checkpoints_folder = os.path.join('./runs', config['continue_training'],
                                      'checkpoints')
    state_dict = torch.load(os.path.join(checkpoints_folder, 'model.pth'))
    model.load_state_dict(state_dict)
    print("Loaded pre-trained model with success.")

train_gpu = torch.cuda.is_available()
print("Is gpu available:", train_gpu)

# moves the model parameters to gpu
if train_gpu:
    model = model.cuda()

criterion = torch.nn.CrossEntropyLoss(reduction='sum')
optimizer = optim.Adam(model.parameters(), 3e-4)

train_writer = SummaryWriter()

similarity_func = get_similarity_function(use_cosine_similarity)

negative_mask = (1 - torch.eye(2 * batch_size)).type(torch.bool)
labels = (np.eye((2 * batch_size), 2 * batch_size - 1, k=-batch_size) + np.eye(
    (2 * batch_size), 2 * batch_size - 1, k=batch_size - 1)).astype(np.int64)
labels = torch.from_numpy(labels)
softmax = torch.nn.Softmax(dim=-1)

# moves the loss tensors to gpu as well
if train_gpu:
    negative_mask = negative_mask.cuda()
    labels = labels.cuda()
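
The mask and labels above encode the NT-Xent objective: with 2N augmented views per batch, each row of `labels` marks the positive pair among the remaining 2N - 1 samples, and `negative_mask` removes self-similarities. A minimal sketch of how these pieces might be combined, assuming `similarity_func` returns a (2N, 2N) similarity matrix (the helper name `nt_xent_loss` is ours, not from the original):

def nt_xent_loss(zis, zjs, temperature):
    # Sketch only: combines similarity_func, negative_mask, labels and
    # criterion from above; not the exact original implementation.
    representations = torch.cat([zjs, zis], dim=0)           # (2N, d)
    sim = similarity_func(representations, representations)  # (2N, 2N)
    # drop self-similarities so each row keeps 2N - 1 candidates
    logits = sim[negative_mask].view(2 * batch_size, -1)
    logits = logits / temperature
    # for each row, the target is the index of its positive pair
    targets = labels.argmax(dim=1)
    # criterion uses reduction='sum', so average over the 2N rows
    return criterion(logits, targets) / (2 * batch_size)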
Example No. 4
train_loader = DataLoader(train_dataset,
                          batch_size=batch_size,
                          num_workers=config['num_workers'],
                          drop_last=True,
                          shuffle=True)

# model = Encoder(out_dim=out_dim)
model = ResNetSimCLR(base_model=config["base_convnet"], out_dim=out_dim)

train_gpu = torch.cuda.is_available()
print("Is gpu available:", train_gpu)

# moves the model parameters to gpu
if train_gpu:
    model.cuda()

criterion = torch.nn.CrossEntropyLoss(reduction='sum')
optimizer = optim.Adam(model.parameters(), 3e-4)

train_writer = SummaryWriter()

sim_func_dim1, sim_func_dim2 = get_similarity_function(use_cosine_similarity)

# Mask to remove positive examples from the batch of negative samples
negative_mask = get_negative_mask(batch_size)

n_iter = 0
for e in range(config['epochs']):
    for step, ((xis, xjs), _) in enumerate(train_loader):
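
The snippet cuts off at the top of the training loop. A sketch of a typical SimCLR step that would fill the loop body, assuming `import torch.nn.functional as F`, a `temperature` hyperparameter from the config, a model that returns (representation, projection) pairs, and the `nt_xent_loss` helper sketched after Example No. 3:

        # sketch of the loop body; the names noted above are assumptions
        optimizer.zero_grad()
        if train_gpu:
            xis, xjs = xis.cuda(), xjs.cuda()

        ris, zis = model(xis)  # (representation, projection) for view 1
        rjs, zjs = model(xjs)  # (representation, projection) for view 2

        # project onto the unit hypersphere before computing similarities
        zis = F.normalize(zis, dim=1)
        zjs = F.normalize(zjs, dim=1)

        loss = nt_xent_loss(zis, zjs, temperature)
        train_writer.add_scalar('train_loss', loss.item(), global_step=n_iter)

        loss.backward()
        optimizer.step()
        n_iter += 1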
Example No. 5
    def train(self):
        # Data
        train_loader, valid_loader = self.dataset.get_data_loaders()

        # Model
        model = ResNetSimCLR(**self.config["model"])
        if self.device == 'cuda':
            model = nn.DataParallel(
                model,
                device_ids=list(range(self.config['gpu']['gpunum'])))
        # .to(self.device) keeps the CPU path working as well
        model = model.to(self.device)
        print(model)
        model = self._load_pre_trained_weights(model)
        
        each_epoch_steps = len(train_loader)
        total_steps = each_epoch_steps * self.config['train']['epochs']
        warmup_steps = each_epoch_steps * self.config['train']['warmup_epochs']
        # lr is stored as a string in the config (hence eval); apply the
        # linear scaling rule relative to a base batch size of 256
        scaled_lr = eval(self.config['train']['lr']) * self.batch_size / 256.

        optimizer = torch.optim.Adam(
            model.parameters(),
            scaled_lr,
            weight_decay=eval(self.config['train']['weight_decay']))

        '''
        optimizer = LARS(params=model.parameters(),
                         lr=eval(self.config['train']['lr']),
                         momentum=self.config['train']['momentum'],
                         weight_decay=eval(self.config['train']['weight_decay']),
                         eta=0.001,
                         max_epoch=self.config['train']['epochs'])
        '''

        # scheduler during warmup stage
        lambda1 = lambda epoch: epoch * 1.0 / int(warmup_steps)
        scheduler = torch.optim.lr_scheduler.LambdaLR(optimizer, lr_lambda=lambda1)

        if apex_support and self.config['train']['fp16_precision']:
            model, optimizer = amp.initialize(model, optimizer,
                                              opt_level='O2',
                                              keep_batchnorm_fp32=True)

        model_checkpoints_folder = os.path.join(self.writer.log_dir, 'checkpoints')

        # save config file
        _save_config_file(model_checkpoints_folder)

        n_iter = 0
        valid_n_iter = 0
        best_valid_loss = np.inf
        lr = eval(self.config['train']['lr']) 

        end = time.time()
        batch_time = AverageMeter()
        data_time = AverageMeter()
        losses = AverageMeter()
        
        for epoch_counter in range(self.config['train']['epochs']):
            model.train()
            for i, ((xis, xjs), _) in enumerate(train_loader):
                data_time.update(time.time() - end)
                optimizer.zero_grad()

                xis = xis.cuda()
                xjs = xjs.cuda()

                loss = self._step(model, xis, xjs, n_iter)

                #print("Loss: ",loss.data.cpu())
                losses.update(loss.item(), 2 * xis.size(0))

                # measure elapsed time
                batch_time.update(time.time() - end)
                end = time.time()
                print('Epoch: [{epoch}][{step}/{each_epoch_steps}] '
                      'Loss {loss.val:.4f} Avg Loss {loss.avg:.4f} '
                      'DataTime {datatime.val:.4f} '
                      'BatchTime {batchtime.val:.4f} LR {lr}'.format(
                          epoch=epoch_counter, step=i,
                          each_epoch_steps=each_epoch_steps, loss=losses,
                          datatime=data_time, batchtime=batch_time, lr=lr))

                if n_iter % self.config['train']['log_every_n_steps'] == 0:
                    self.writer.add_scalar('train_loss', loss, global_step=n_iter)

                if apex_support and self.config['train']['fp16_precision']:
                    with amp.scale_loss(loss, optimizer) as scaled_loss:
                        scaled_loss.backward()
                else:
                    loss.backward()

                optimizer.step()
                n_iter += 1

                # adjust lr
                if n_iter == warmup_steps:
                    # switch to cosine decay once warmup is done
                    scheduler = torch.optim.lr_scheduler.CosineAnnealingLR(
                        optimizer, T_max=total_steps - warmup_steps,
                        eta_min=0, last_epoch=-1)
                scheduler.step()
                lr = scheduler.get_last_lr()[0]  # get_lr() is deprecated
                self.writer.add_scalar('cosine_lr_decay', lr, global_step=n_iter)
                sys.stdout.flush()

            # validate the model if requested
            if epoch_counter % self.config['train']['eval_every_n_epochs'] == 0:
                valid_loss = self._validate(model, valid_loader)
                if valid_loss < best_valid_loss:
                    # save the model weights
                    best_valid_loss = valid_loss
                    torch.save(model.state_dict(), os.path.join(model_checkpoints_folder, 'model.pth'))

                self.writer.add_scalar('validation_loss', valid_loss, global_step=valid_n_iter)
                valid_n_iter += 1
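
The `_step` helper used in the loop above is not shown. A minimal sketch of what it might compute, assuming the model returns (representation, projection) pairs, `torch.nn.functional` is imported as `F`, and an NT-Xent criterion was created elsewhere as `self.nt_xent_criterion` (all three are assumptions):

    def _step(self, model, xis, xjs, n_iter):
        # Sketch only: assumes (representation, projection) outputs and a
        # self.nt_xent_criterion built like the NTXentLoss in Example No. 2;
        # n_iter is kept only to match the call site.
        ris, zis = model(xis)  # projection of the first augmented view
        rjs, zjs = model(xjs)  # projection of the second augmented view

        # normalize projections before the contrastive loss
        zis = F.normalize(zis, dim=1)
        zjs = F.normalize(zjs, dim=1)

        return self.nt_xent_criterion(zis, zjs)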