Example #1
    def train(self, train_loader):

        scaler = GradScaler(enabled=self.args.fp16_precision)

        # save config file
        save_config_file(self.writer.log_dir, self.args)

        n_iter = 0
        logging.info(f"Start SimCLR training for {self.args.epochs} epochs.")
        logging.info(f"Training with gpu: {not self.args.disable_cuda}.")

        for epoch_counter in range(self.args.epochs):
            for images, _ in tqdm(train_loader):
                # each batch is a list of two augmented views of the same images
                images = torch.cat(images, dim=0)
                images = images.to(self.args.device)

                with autocast(enabled=self.args.fp16_precision):
                    features = self.model(images)
                    logits, labels = self.info_nce_loss(features)
                    loss = self.criterion(logits, labels)

                self.optimizer.zero_grad()

                scaler.scale(loss).backward()

                scaler.step(self.optimizer)
                scaler.update()

                if n_iter % self.args.log_every_n_steps == 0:
                    top1, top5 = accuracy(logits, labels, topk=(1, 5))
                    self.writer.add_scalar('loss', loss, global_step=n_iter)
                    self.writer.add_scalar('acc/top1', top1[0], global_step=n_iter)
                    self.writer.add_scalar('acc/top5', top5[0], global_step=n_iter)
                    self.writer.add_scalar('learning_rate', self.scheduler.get_lr()[0], global_step=n_iter)

                n_iter += 1

            # warmup for the first 10 epochs
            if epoch_counter >= 10:
                self.scheduler.step()
            logging.debug(f"Epoch: {epoch_counter}\tLoss: {loss}\tTop1 accuracy: {top1[0]}")

        logging.info("Training has finished.")
        # save model checkpoints
        checkpoint_name = 'checkpoint_{:04d}.pth.tar'.format(self.args.epochs)
        save_checkpoint({
            'epoch': self.args.epochs,
            'arch': self.args.arch,
            'state_dict': self.model.state_dict(),
            'optimizer': self.optimizer.state_dict(),
        }, is_best=False, filename=os.path.join(self.writer.log_dir, checkpoint_name))
        logging.info(f"Model checkpoint and metadata has been saved at {self.writer.log_dir}.")
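
None of the examples reproduce `info_nce_loss` itself, even though every SimCLR snippet calls it. For context, below is a minimal sketch of the NT-Xent logits/labels computation such a method typically performs, written as a standalone function and consistent with the shape comments in Example #9 further down (logits of shape (2N, 2N-1), labels of zeros). The exact implementation, signature, and `temperature` argument used by these repositories are not shown here, so treat this as an assumption rather than the authors' code.

import torch
import torch.nn.functional as F

def info_nce_loss(features, batch_size, temperature, device):
    # features: (2 * batch_size, dim); the first and second halves are the two
    # augmented views of the same batch, stacked with torch.cat as above.
    labels = torch.cat([torch.arange(batch_size) for _ in range(2)], dim=0)
    labels = (labels.unsqueeze(0) == labels.unsqueeze(1)).float().to(device)

    features = F.normalize(features, dim=1)
    similarity_matrix = features @ features.T  # (2N, 2N) cosine similarities

    # drop self-similarities on the diagonal
    mask = torch.eye(labels.shape[0], dtype=torch.bool, device=device)
    labels = labels[~mask].view(labels.shape[0], -1)
    similarity_matrix = similarity_matrix[~mask].view(similarity_matrix.shape[0], -1)

    positives = similarity_matrix[labels.bool()].view(labels.shape[0], -1)   # (2N, 1)
    negatives = similarity_matrix[~labels.bool()].view(labels.shape[0], -1)  # (2N, 2N - 2)

    logits = torch.cat([positives, negatives], dim=1) / temperature  # (2N, 2N - 1)
    labels = torch.zeros(logits.shape[0], dtype=torch.long, device=device)   # positive is column 0
    return logits, labels
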
Example #2
    def process_results(self,
                        true: list,
                        predicted: list,
                        name=None,
                        **plot_args):

        errs = dict()
        for out, out_name in enumerate(self.out_cols):

            t = true[out]
            p = predicted[out]

            if np.isnan(t).sum() > 0:
                mask = np.invert(np.isnan(t))
                t = t[mask]
                p = p[mask]

            errors = FindErrors(t, p)
            errs[out_name + '_errors'] = errors.calculate_all()
            errs[out_name + '_stats'] = errors.stats()

            plot_results(t,
                         p,
                         name=os.path.join(self.path, (name or '') + out_name),
                         **plot_args)

        save_config_file(self.path, errors=errs, name=name)

        return
Example #3
    def save_config(self, history: dict):

        test_indices = np.array(
            self.test_indices,
            dtype=int).tolist() if self.test_indices is not None else None
        train_indices = np.array(
            self.train_indices,
            dtype=int).tolist() if self.train_indices is not None else None

        save_config_file(indices={
            'test_indices': test_indices,
            'train_indices': train_indices
        },
                         path=self.path)

        config = dict()
        config['min_val_loss'] = int(np.min(
            history['val_loss'])) if 'val_loss' in history else None
        config['min_loss'] = int(np.min(
            history['loss'])) if 'loss' in history else None
        config['nn_config'] = self.nn_config
        config['data_config'] = self.data_config
        config['intervals'] = self.intervals
        config['method'] = self.method

        save_config_file(config=config, path=self.path)
        return config
Example #4
    def save_errors(self, errors, neg_predictions):

        config = OrderedDict()
        config['errors'] = errors
        # neg predictions are found after `predict` method so saving now and not in config file.
        config['neg_predictions'] = neg_predictions

        save_config_file(errors=config, path=self.path)

        return
Example #5
    def save_config(self):

        config = dict()
        config['min_val_loss'] = np.min(
            self.k_model.history.history['val_loss']) if 'val_loss' in self.k_model.history.history else None
        config['min_loss'] = np.min(
            self.k_model.history.history['loss']) if 'loss' in self.k_model.history.history else None
        config['nn_config'] = self.nn_config
        config['data_config'] = self.data_config
        config['test_indices'] = np.array(self.test_indices, dtype=int) if self.test_indices is not None else None
        config['train_indices'] = np.array(self.train_indices, dtype=int) if self.train_indices is not None else None
        config['intervals'] = self.intervals
        config['method'] = self.method

        save_config_file(config=config, path=self.path)
        return config
Example #6
    def save_config(self):

        config = OrderedDict()
        config[
            'comment'] = 'use point source pollutant data along with best model from grid search'
        config['nn_config'] = self.nn_config
        config['data_config'] = self.data_config
        config['test_sample_idx'] = 'test_idx'
        config['start_time'] = self.nn_config[
            'start_time'] if 'start_time' in self.nn_config else " "
        config['end_time'] = self.nn_config[
            'end_time'] if 'end_time' in self.nn_config else " "
        config["saved_epochs"] = self.saved_epochs
        config['intervals'] = self.intervals
        config['args'] = self.args
        config['train_time'] = self.nn_config[
            'train_duration'] if 'train_duration' in self.nn_config else " "
        config['final_comment'] = """ """

        save_config_file(config=config, path=self.path)
        return config
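
Examples #2 through #6 import a `save_config_file` helper with a keyword-based signature (`config=`, `errors=`, `indices=`, `path=`, `name=`) that is never shown. Below is a minimal sketch of what such a helper might look like; the signature, file naming, and JSON format are assumptions, and note that the SimCLR examples (#1, #7, #8, ...) use a different helper that takes `(log_dir, args)`.

import json
import os

def save_config_file(path, name=None, **objects):
    """Hypothetical sketch: write each keyword argument (config=..., errors=...,
    indices=...) to its own JSON file under `path`. The real helper imported by
    the examples above is not shown, so this behavior is an assumption."""
    os.makedirs(path, exist_ok=True)
    for key, obj in objects.items():
        fname = f"{name}_{key}.json" if name else f"{key}.json"
        with open(os.path.join(path, fname), "w") as fp:
            # default=str keeps numpy scalars/arrays from breaking serialization
            json.dump(obj, fp, indent=4, default=str)
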
Example #7
    def train(self, train_loader):

        scaler = GradScaler(enabled=self.args.fp16_precision)
        # save config file
        save_config_file(self.writer.log_dir, self.args)
        checkpoint = torch.load(
            '/scratch/gg2501/simclr/checkpoint_0036.pth.tar')
        state_dict = checkpoint['state_dict']
        #for k in list(state_dict.keys()):
        #    if k.startswith('module.backbone.'):
        #        state_dict['backbone.' + k[len("module.backbone."):]] = state_dict[k]
        #del state_dict[k]
        self.model.load_state_dict(state_dict)

        self.optimizer.load_state_dict(checkpoint['optimizer'])
        n_iter = 0
        min_loss = 100
        logging.info(f"Start SimCLR training for {self.args.epochs} epochs.")
        logging.info(f"Training with gpu: {not self.args.disable_cuda}.")
        for epoch_counter in range(self.args.epochs):
            for images, _ in tqdm(train_loader):
                images = torch.cat(images, dim=0)

                images = images.to(self.args.device)

                with autocast(enabled=self.args.fp16_precision):
                    features = self.model(images)
                    logits, labels = self.info_nce_loss(features)
                    loss = self.criterion(logits, labels)

                self.optimizer.zero_grad()

                scaler.scale(loss).backward()

                scaler.step(self.optimizer)
                scaler.update()

                if n_iter % self.args.log_every_n_steps == 0:
                    top1, top5 = accuracy(logits, labels, topk=(1, 5))
                    self.writer.add_scalar('loss', loss, global_step=n_iter)
                    self.writer.add_scalar('acc/top1',
                                           top1[0],
                                           global_step=n_iter)
                    self.writer.add_scalar('acc/top5',
                                           top5[0],
                                           global_step=n_iter)
                    self.writer.add_scalar('learning_rate',
                                           self.scheduler.get_lr()[0],
                                           global_step=n_iter)
                    print(n_iter, min_loss, loss.item(), top1[0].item(),
                          top5[0].item())
                    logging.debug(
                        f"Epoch: {epoch_counter}\tIter: {n_iter}\tLoss: {loss}\tTop1 accuracy: {top1[0]}\tTop5 accuracy: {top5[0]}"
                    )
                n_iter += 1

            if min_loss > loss.item(
            ) or epoch_counter % self.args.checkpoint_step == self.args.checkpoint_step - 1:
                # save model checkpoints
                checkpoint_name = 'checkpoint_{:04d}.pth.tar'.format(
                    epoch_counter)
                print(loss.item(), epoch_counter)
                save_checkpoint(
                    {
                        'epoch': epoch_counter,
                        'arch': self.args.arch,
                        'state_dict': self.model.state_dict(),
                        'optimizer': self.optimizer.state_dict(),
                        # }, is_best=False, filename=os.path.join(self.writer.log_dir, checkpoint_name))
                    },
                    is_best=False,
                    filename=os.path.join(self.args.checkpoint_dir,
                                          checkpoint_name))
                logging.info(
                    f"Model checkpoint and metadata has been saved at {self.writer.log_dir}."
                )
            # warmup for the first 10 epochs
            min_loss = min(min_loss, loss.item())
            if epoch_counter >= 10:
                self.scheduler.step()
            logging.debug(
                f"Epoch: {epoch_counter}\tLoss: {loss}\tTop1 accuracy: {top1[0]}"
            )

        logging.info("Training has finished.")
Example #8
    def train(self, train_loader):

        scaler = GradScaler(enabled=self.args.fp16_precision)

        # save config file
        save_config_file(self.writer.log_dir, self.args)

        n_iter = 0
        logging.info(f"Start SimCLR training for {self.args.epochs} epochs.")
        logging.info(f"Training with gpu: {not self.args.disable_cuda}.")

        top_loss = 1e8
        top_acc = 0

        best_model_state_dict = self.model.state_dict()
        for epoch_counter in range(self.args.epochs):
            for images, _ in tqdm(train_loader):
                images = torch.cat(images, dim=0)

                images = images.to(self.args.device)

                with autocast(enabled=self.args.fp16_precision):
                    features = self.model(images)
                    logits, labels = self.info_nce_loss(features)
                    loss = self.criterion(logits, labels)

                self.optimizer.zero_grad()

                scaler.scale(loss).backward()

                scaler.step(self.optimizer)
                scaler.update()

                if n_iter % self.args.log_every_n_steps == 0:
                    top1, top5 = accuracy(logits, labels, topk=(1, 5))
                    self.writer.add_scalar('loss', loss, global_step=n_iter)
                    self.writer.add_scalar('acc/top1',
                                           top1[0],
                                           global_step=n_iter)
                    self.writer.add_scalar('acc/top5',
                                           top5[0],
                                           global_step=n_iter)
                    self.writer.add_scalar('learning_rate',
                                           self.scheduler.get_lr()[0],
                                           global_step=n_iter)

                    grid_1 = torchvision.utils.make_grid(images[:4])
                    # self.writer.add_image('data_1', grid_1, global_step=n_iter)
                    # self.writer.add_text('data_1', str(labels[:4].cpu().numpy()), global_step=n_iter)

                    grid_2 = torchvision.utils.make_grid(
                        images[self.args.batch_size:self.args.batch_size + 4])
                    # self.writer.add_image('data 2', grid_2, global_step=n_iter)
                    # self.writer.add_text('data_2', str(labels[self.args.batch_size:self.args.batch_size+4].cpu().numpy()), global_step=n_iter)
                n_iter += 1

            # warmup for the first 10 epochs
            if epoch_counter >= 10:
                self.scheduler.step()
            logging.debug(
                f"Epoch: {epoch_counter}\tLoss: {loss}\tTop1 accuracy: {top1[0]}"
            )

            if top_acc < top1[0] and top_loss > loss:
                # save model checkpoints
                checkpoint_name = 'checkpoint_{:04d}.pth.tar'.format(
                    epoch_counter)
                save_checkpoint(
                    {
                        'epoch': self.args.epochs,
                        'arch': self.args.arch,
                        'state_dict': self.model.state_dict(),
                        'optimizer': self.optimizer.state_dict(),
                    },
                    is_best=False,
                    filename=os.path.join(self.writer.log_dir,
                                          checkpoint_name))
                logging.info(
                    f"Model checkpoint and metadata has been saved at {self.writer.log_dir}."
                )

                best_model_state_dict = deepcopy(self.model.state_dict())

                top_acc = top1[0]
                top_loss = loss

        logging.info("Training has finished.")
        # save model checkpoints
        checkpoint_name = 'checkpoint_{:04d}.pth.tar'.format(self.args.epochs)
        save_checkpoint(
            {
                'epoch': self.args.epochs,
                'arch': self.args.arch,
                'state_dict': self.model.state_dict(),
                'optimizer': self.optimizer.state_dict(),
            },
            is_best=False,
            filename=os.path.join(self.writer.log_dir, checkpoint_name))
        logging.info(
            f"Model checkpoint and metadata has been saved at {self.writer.log_dir}."
        )

        self.model.load_state_dict(best_model_state_dict)
        return self.model
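
Several of the training examples also call a `save_checkpoint` helper. A common implementation, in the style of the PyTorch ImageNet example, is sketched below; the actual helper is not reproduced in these snippets, so the `model_best.pth.tar` naming is an assumption.

import os
import shutil
import torch

def save_checkpoint(state, is_best=False, filename='checkpoint.pth.tar'):
    # serialize the state (epoch, arch, model/optimizer state dicts) to disk
    torch.save(state, filename)
    if is_best:
        # keep an extra copy of the best checkpoint next to the regular one
        shutil.copyfile(filename,
                        os.path.join(os.path.dirname(filename), 'model_best.pth.tar'))
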
Example #9
 def train(self, train_loader):
     start = time.time()
     scaler = GradScaler(enabled=self.args.fp16_precision)
     save_config_file(self.writer.log_dir, self.args)
     n_iter = 0
     logging.info(f"Start SimCLR training for {self.args.epochs} epochs.")
     logging.info(
         f"Training with gpu. args.disable_cuda flag: {self.args.disable_cuda}."
     )
     for epoch_counter in range(self.args.epochs):
         for images, _ in tqdm(train_loader):  # we don't need the true labels
             images = torch.cat(
                 images, dim=0
             )  # input is a list of two augmented versions of the same images: [images[0], images[1]]; each has shape torch.Size([256, 3, 96, 96])
             images = images.to(self.args.device)  # (2 * batch_size, 3, 96, 96)
             with autocast(enabled=self.args.fp16_precision):
                 features = self.model(
                     images)  #features.shape = [2*batch_size, out_dim]
                 logits, labels = self.info_nce_loss(
                     features
                 )  #logits.shape = (512, 511) ; labels.shape=torch.Size([512])
                 loss = self.criterion(
                     logits, labels
                 )  #loss is a single number. https://pytorch.org/docs/stable/generated/torch.nn.CrossEntropyLoss.html#torch.nn.CrossEntropyLoss
             self.optimizer.zero_grad()
             scaler.scale(loss).backward()
             scaler.step(self.optimizer)
             scaler.update()
             if n_iter % self.args.log_every_n_steps == 0:
                 top1, top5 = accuracy(
                     logits, labels, topk=(1, 5)
                 )  # note: top-1 accuracy is recorded per mini-batch here rather than averaged over the whole epoch
                 self.writer.add_scalar('loss', loss, global_step=n_iter)
                 self.writer.add_scalar('acc/top1',
                                        top1[0],
                                        global_step=n_iter)
                 self.writer.add_scalar('acc/top5',
                                        top5[0],
                                        global_step=n_iter)
                 self.writer.add_scalar('learning_rate',
                                        self.scheduler.get_lr()[0],
                                        global_step=n_iter)
             n_iter += 1
         if epoch_counter >= 10:  # warmup for the first 10 epochs
             self.scheduler.step()
         logging.debug(
             f"Epoch: {epoch_counter}\tLoss: {loss}\tTop1 accuracy: {top1[0]}"
         )
     logging.info("Training has finished.")
     checkpoint_name = 'checkpoint_{:04d}.pth.tar'.format(self.args.epochs)
     save_checkpoint(
         {
             'epoch': self.args.epochs,
             'arch': self.args.arch,
             'state_dict': self.model.state_dict(),
             'optimizer': self.optimizer.state_dict(),
         },
         is_best=False,
         filename=os.path.join(self.writer.log_dir, checkpoint_name))
     logging.info(
         f"Model checkpoint and metadata has been saved at {self.writer.log_dir}."
     )
     end = time.time()
     logging.info(f"Runtime is {end - start:.1f} seconds.")
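
The comments in Example #9 assume the loader yields a list of two augmented views per image. A minimal sketch of a transform wrapper that produces such pairs is shown below; the 96x96 crop matches the STL10-like shapes in the comments, but the specific augmentation parameters are illustrative assumptions, not the exact pipeline used above.

from torchvision import transforms

class TwoViewTransform:
    """Return two independently augmented views of the same image, so a
    DataLoader batch arrives as [view1_batch, view2_batch] as assumed above."""

    def __init__(self, base_transform):
        self.base_transform = base_transform

    def __call__(self, x):
        return [self.base_transform(x), self.base_transform(x)]

base_transform = transforms.Compose([
    transforms.RandomResizedCrop(96),
    transforms.RandomHorizontalFlip(),
    transforms.ColorJitter(0.8, 0.8, 0.8, 0.2),
    transforms.RandomGrayscale(p=0.2),
    transforms.ToTensor(),
])
# e.g. STL10('./data', split='unlabeled', transform=TwoViewTransform(base_transform))
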
Example #10
    def train(self, train_loader):

        scaler = GradScaler(enabled=self.args.fp16_precision)

        # save config file
        save_config_file(self.writer.log_dir, self.args)

        n_iter = 0
        logging.info(f"Start SimCLR training for {self.args.epochs} epochs.")
        logging.info(f"Training with gpu: {not self.args.disable_cuda}.")

        model_name = f"{type(self.model).__name__}"
        with open(f"./{model_name}_arch.txt", "w") as f:
            f.write(str(self.model))
        self.run[f"config/model/{model_name}_arch"].upload(
            f"./{model_name}_arch.txt")
        self.run["config/optimizer"] = type(self.optimizer).__name__

        for epoch_counter in range(self.args.epochs):
            for images, _ in tqdm(train_loader):
                images = torch.cat(images, dim=0)

                images = images.to(self.args.device)

                with autocast(enabled=self.args.fp16_precision):
                    features = self.model(images)
                    logits, labels = self.info_nce_loss(features)
                    loss = self.criterion(logits, labels)

                self.optimizer.zero_grad()

                scaler.scale(loss).backward()

                scaler.step(self.optimizer)
                scaler.update()

                if n_iter % self.args.log_every_n_steps == 0:
                    top1, top5 = accuracy(logits, labels, topk=(1, 5))
                    self.writer.add_scalar("loss", loss, global_step=n_iter)
                    self.writer.add_scalar("acc/top1",
                                           top1[0],
                                           global_step=n_iter)
                    self.writer.add_scalar("acc/top5",
                                           top5[0],
                                           global_step=n_iter)
                    self.writer.add_scalar("learning_rate",
                                           self.scheduler.get_lr()[0],
                                           global_step=n_iter)

                    self.run["metrics/acc"].log(top1[0])
                    self.run["metrics/loss"].log(loss)
                    self.run["metrics/lr"].log(self.scheduler.get_lr()[0])

                n_iter += 1

            # warmup for the first 10 epochs
            if epoch_counter >= 10:
                self.scheduler.step()
            logging.debug(
                f"Epoch: {epoch_counter}\tLoss: {loss}\tTop1 accuracy: {top1[0]}"
            )

        logging.info("Training has finished.")
        # save model checkpoints
        checkpoint_name = "checkpoint_{:04d}.pth.tar".format(self.args.epochs)
        save_checkpoint(
            {
                "epoch": self.args.epochs,
                "arch": self.args.arch,
                "state_dict": self.model.state_dict(),
                "optimizer": self.optimizer.state_dict(),
            },
            is_best=False,
            filename=os.path.join(self.writer.log_dir, checkpoint_name),
        )
        logging.info(
            f"Model checkpoint and metadata has been saved at {self.writer.log_dir}."
        )

        self.run["checkpoints"].upload(
            File(os.path.join(self.writer.log_dir, checkpoint_name)))
        self.run.stop()
Example #11
    def train(self, loaders):

        scaler = GradScaler(enabled=self.args.fp16_precision)

        # save config file
        save_config_file(self.writer.log_dir, self.args)

        n_iter = 0
        logging.info(f"Start SimCLR training for {self.args.epochs} epochs.")
        logging.info(f"Training with gpu: {not self.args.disable_cuda}.")

        for epoch_counter in range(self.args.epochs):
            for phase in ["train", "val"]:
                if phase == "train":
                    self.model.train(True)
                else:
                    self.model.train(False)
                
                running_loss = 0.0
                #running_1acc = 0.0
                #running_5acc = 0.0

                for images1, images2, caption_encoding in tqdm(loaders[phase]):
                    if phase == "train":
                        loss = self.forward(images1, images2, caption_encoding)

                        self.optimizer.zero_grad()

                        scaler.scale(loss).backward()

                        scaler.step(self.optimizer)
                        scaler.update()
                    else:
                        with torch.no_grad():
                            loss = self.forward(images1, images2, caption_encoding)

                    #top1, top5 = accuracy(logits, labels, topk=(1, 5))

                    running_loss += loss.item()  # accumulate a plain float so the graph is not retained
                    #running_1acc += top1[0]
                    #running_5acc += top5[0]

                loss = running_loss / len(loaders[phase])
                #acc1 = running_1acc / len(loaders[phase])
                #acc5 = running_5acc / len(loaders[phase])

                self.writer.add_scalar('loss/' + phase, loss, global_step=epoch_counter)
                #self.writer.add_scalar('top1/' + phase, acc1, global_step=epoch_counter)
                #self.writer.add_scalar('top5/' + phase, acc5, global_step=epoch_counter)

                logging.debug(f"Epoch: {epoch_counter}\t{phase} loss: {loss}")

            # warmup for the first 10 epochs; step the scheduler once per epoch, not once per phase
            if epoch_counter >= 10:
                self.scheduler.step()
            checkpoint_name = 'checkpoint_{:04d}.pth.tar'.format(epoch_counter)
            save_checkpoint({
                'epoch': epoch_counter,
                'arch': self.args.arch,
                'state_dict': self.model.state_dict(),
                'optimizer': self.optimizer.state_dict(),
            }, is_best=False, filename=os.path.join(self.writer.log_dir, checkpoint_name))

        logging.info("Training has finished.")
        # save model checkpoints
        logging.info(f"Model checkpoint and metadata has been saved at {self.writer.log_dir}.")
Example #12
    def train(self, train_loader):
        if apex_support and self.args.fp16_precision:
            logging.debug("Using apex for fp16 precision training.")
            self.model, self.optimizer = amp.initialize(
                self.model,
                self.optimizer,
                opt_level='O2',
                keep_batchnorm_fp32=True)
        # save config file
        save_config_file(self.writer.log_dir, self.args)

        n_iter = 0
        logging.info(
            f"Start LocalAggregation training for {self.args.epochs} epochs.")
        logging.info(f"Training with gpu: {not self.args.disable_cuda}.")

        min_loss = float('inf')
        for epoch_counter in range(self.args.epochs):
            epoch_loss = AverageMeter()
            for batch_i, (indices, images, _) in enumerate(tqdm(train_loader)):
                images = images.to(
                    self.args.device)  # torch.Size([batch_size, 3, 32, 32])

                features = self.model(
                    images)  # torch.Size([batch_size, out_dim])

                loss, new_data_memory = self.loss_fn(indices, features)
                epoch_loss.update(loss.data)

                self.optimizer.zero_grad()
                if apex_support and self.args.fp16_precision:
                    with amp.scale_loss(loss, self.optimizer) as scaled_loss:
                        scaled_loss.backward()
                else:
                    loss.backward()
                self.optimizer.step()

                if n_iter % self.args.log_every_n_steps == 0:
                    self.writer.add_scalar('loss', loss, global_step=n_iter)

                n_iter += 1

                with torch.no_grad():
                    self.memory_bank.update(indices, new_data_memory)

                    if self.first_iteration_kmeans or batch_i % self.args.kmeans_freq == 0:

                        if self.first_iteration_kmeans:
                            self.first_iteration_kmeans = False

                        # get kmeans clustering (update our saved clustering)
                        k = [
                            self.args.kmeans_k
                            for _ in range(self.args.n_kmeans)
                        ]
                        self.cluster_label = compute_clusters(
                            k, self.memory_bank.bank, self.args.gpu_index)

            if epoch_counter > 10 and min_loss > epoch_loss.avg:
                min_loss = epoch_loss.avg
                checkpoint_name = 'best_model.pth.tar'
                save_checkpoint(
                    {
                        'epoch': self.args.epochs,
                        'arch': self.args.arch,
                        'state_dict': self.model.state_dict(),
                        'optimizer': self.optimizer.state_dict(),
                    },
                    filename=os.path.join(self.writer.log_dir,
                                          checkpoint_name))
                logging.info(
                    f"Model checkpoint and metadata has been saved at {self.writer.log_dir}."
                )
            logging.debug(f"Epoch: {epoch_counter}\tLoss: {epoch_loss.avg}")
        logging.info("LocalAggregation Training has finished.")
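
Examples #12 and #14 track the epoch loss with an `AverageMeter`. The standard meter from the PyTorch examples is sketched below, on the assumption that the class imported in these snippets behaves the same way.

class AverageMeter:
    """Running average of a scalar (e.g. the per-batch loss)."""

    def __init__(self):
        self.reset()

    def reset(self):
        self.val = 0.0
        self.sum = 0.0
        self.count = 0
        self.avg = 0.0

    def update(self, val, n=1):
        self.val = val
        self.sum += val * n
        self.count += n
        self.avg = self.sum / self.count
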
Example #13
 def train(self, train_loader, test_loader):
     start = time.time()
     scaler = GradScaler(enabled=self.args.fp16_precision
                         )  #todo: add autocast below as needed
     save_config_file(self.writer.log_dir, self.args)
     save_config_file(self.writer.log_dir, self.config)
     logging.info(f"Start training for {self.args.epochs} epochs.")
     logging.info(
         f"Training with gpu. args.disable_cuda flag: {self.args.disable_cuda}."
     )
     for epoch in range(self.args.epochs):
         top1_train_accuracy = 0  #train score per epoch
         for counter, (x_batch, y_batch) in enumerate(tqdm(train_loader)):
             x_batch = x_batch.to(self.args.device)
             y_batch = y_batch.to(self.args.device)
             logits = self.model(x_batch)
             loss = self.criterion(logits, y_batch)
             top1 = accuracy(logits, y_batch, topk=(1, ))
             top1_train_accuracy += top1[0]
             self.optimizer.zero_grad()
             loss.backward()
             self.optimizer.step()
         top1_train_accuracy /= (counter + 1)
         top1_accuracy = 0  #test score per epoch
         top5_accuracy = 0
         for counter, (x_batch, y_batch) in enumerate(test_loader):
             x_batch = x_batch.to(self.args.device)
             y_batch = y_batch.to(self.args.device)
             logits = self.model(x_batch)
             top1, top5 = accuracy(logits, y_batch, topk=(1, 5))
             top1_accuracy += top1[0]
             top5_accuracy += top5[0]
         top1_accuracy /= (counter + 1)
         top5_accuracy /= (counter + 1)
         self.writer.add_scalar('train_loss', loss, global_step=epoch)
         self.writer.add_scalar('acc/top1 train',
                                top1_train_accuracy.item(),
                                global_step=epoch)
         self.writer.add_scalar('acc/top1 test',
                                top1_accuracy.item(),
                                global_step=epoch)
         self.writer.add_scalar('learning_rate',
                                self.scheduler.get_lr()[0],
                                global_step=epoch)
         logging.debug(
             f"Epoch: {epoch}\tTrain Loss: {loss}\tTop1 train accuracy: {top1_train_accuracy.item()}\tTop1 test accuracy: {top1_accuracy.item()}"
         )
     logging.info("Training has finished.")
     checkpoint_name = 'checkpoint_{:04d}.pth.tar'.format(self.args.epochs)
     save_checkpoint(
         {
             'epoch': self.args.epochs,
             'arch': self.config.arch,
             'state_dict': self.model.state_dict(),
             'optimizer': self.optimizer.state_dict(),
         },
         is_best=False,
         filename=os.path.join(self.writer.log_dir, checkpoint_name))
     logging.info(
         f"Model checkpoint and metadata has been saved at {self.writer.log_dir}."
     )
     end = time.time()
     logging.info(f"Runtime is {end - start:.1f} seconds.")
Example #14
    def train(self, train_loader):
        if apex_support and self.args.fp16_precision:
            logging.debug("Using apex for fp16 precision training.")
            self.model, self.optimizer = amp.initialize(
                self.model,
                self.optimizer,
                opt_level='O2',
                keep_batchnorm_fp32=True)
        # save config file
        save_config_file(self.writer.log_dir, self.args)

        n_iter = 0
        logging.info(f"Start SimCLR training for {self.args.epochs} epochs.")
        logging.info(f"Training with gpu: {not self.args.disable_cuda}.")

        min_loss = float('inf')
        for epoch_counter in range(self.args.epochs):
            epoch_loss = AverageMeter()
            for images, _ in tqdm(train_loader):
                # images: list[tensor, tensor]
                images = torch.cat(
                    images, dim=0)  # torch.Size([batch_size*2, 3, 32, 32])

                images = images.to(self.args.device)

                features = self.model(images)  # torch.Size([batch_size * 2, 128])

                logits, labels = self.info_nce_loss(features)
                loss = self.criterion(logits, labels)
                epoch_loss.update(loss.data)

                self.optimizer.zero_grad()
                if apex_support and self.args.fp16_precision:
                    with amp.scale_loss(loss, self.optimizer) as scaled_loss:
                        scaled_loss.backward()
                else:
                    loss.backward()

                self.optimizer.step()

                if n_iter % self.args.log_every_n_steps == 0:
                    top1, top5 = accuracy(logits, labels, topk=(1, 5))
                    self.writer.add_scalar('loss', loss, global_step=n_iter)
                    self.writer.add_scalar('acc/top1',
                                           top1[0],
                                           global_step=n_iter)
                    self.writer.add_scalar('acc/top5',
                                           top5[0],
                                           global_step=n_iter)
                    self.writer.add_scalar('learning_rate',
                                           self.scheduler.get_lr()[0],
                                           global_step=n_iter)

                n_iter += 1

            # warmup for the first 10 epochs
            if epoch_counter >= 10:
                self.scheduler.step()
            if epoch_counter > 10 and min_loss > epoch_loss.avg:
                # save model checkpoints
                checkpoint_name = 'best_checkpoint.pth.tar'
                save_checkpoint(
                    {
                        'epoch': self.args.epochs,
                        'arch': self.args.arch,
                        'state_dict': self.model.state_dict(),
                        'optimizer': self.optimizer.state_dict(),
                    },
                    filename=os.path.join(self.writer.log_dir,
                                          checkpoint_name))
                logging.info(
                    f"Model checkpoint and metadata has been saved at {self.writer.log_dir}."
                )

            logging.debug(
                f"Epoch: {epoch_counter}\tLoss: {epoch_loss.avg}\tTop1 accuracy: {top1[0]}"
            )

        logging.info("SimCLR Training has finished.")
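
Finally, most of the examples rely on an `accuracy(output, target, topk=...)` helper and index its result as `top1[0]`. A sketch of the conventional top-k helper (as used in the PyTorch ImageNet example) is given below; the exact helper these repositories import is not reproduced here, so treat the percent scaling as an assumption consistent with how the results are logged above.

import torch

def accuracy(output, target, topk=(1,)):
    """Return a list with the top-k accuracy (in percent, as 1-element tensors)
    for each k in `topk`, which matches how the examples index `top1[0]`."""
    with torch.no_grad():
        maxk = max(topk)
        batch_size = target.size(0)

        _, pred = output.topk(maxk, dim=1, largest=True, sorted=True)
        pred = pred.t()                                   # (maxk, batch_size)
        correct = pred.eq(target.view(1, -1).expand_as(pred))

        res = []
        for k in topk:
            correct_k = correct[:k].reshape(-1).float().sum(0, keepdim=True)
            res.append(correct_k.mul_(100.0 / batch_size))
        return res
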