Example #1
def main(cfg: DictConfig) -> None:
    """
    Run the code following a given configuration
    :param cfg: configuration file retrieved from hydra framework
    """
    main_utils.init(cfg)
    logger = TrainLogger(exp_name_prefix=cfg['main']['experiment_name_prefix'],
                         logs_dir=cfg['main']['paths']['logs'])
    logger.write(OmegaConf.to_yaml(cfg))

    # Set seed for reproducibility
    main_utils.set_seed(cfg['main']['seed'])

    # Load dataset
    train_dataset = MyDataset(path=cfg['main']['paths']['train'])
    val_dataset = MyDataset(path=cfg['main']['paths']['validation'])

    train_loader = DataLoader(train_dataset,
                              cfg['train']['batch_size'],
                              shuffle=True,
                              num_workers=cfg['main']['num_workers'])
    eval_loader = DataLoader(val_dataset,
                             cfg['train']['batch_size'],
                             shuffle=False,  # no need to shuffle validation data
                             num_workers=cfg['main']['num_workers'])

    # Init model
    model = MyModel(num_hid=cfg['train']['num_hid'],
                    dropout=cfg['train']['dropout'])

    # TODO: Add gpus_to_use
    if cfg['main']['parallel']:
        model = torch.nn.DataParallel(model)

    if torch.cuda.is_available():
        model = model.cuda()

    logger.write(main_utils.get_model_string(model))

    # Run model
    train_params = train_utils.get_train_params(cfg)

    # Report metrics and hyperparameters to TensorBoard
    metrics = train(model, train_loader, eval_loader, train_params, logger)
    hyper_parameters = main_utils.get_flatten_dict(cfg['train'])

    logger.report_metrics_hyper_params(hyper_parameters, metrics)
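
The snippet calls main_utils.set_seed(...) without showing it. A minimal sketch of such a helper, assuming the usual random/NumPy/PyTorch seeding (the actual contents of main_utils are not shown in the source):

import random

import numpy as np
import torch


def set_seed(seed: int) -> None:
    """Seed all common RNGs so that runs are reproducible."""
    random.seed(seed)
    np.random.seed(seed)
    torch.manual_seed(seed)           # also seeds the default CUDA generator
    torch.cuda.manual_seed_all(seed)  # explicit, in case of multiple GPUs
    # Optional: trade speed for determinism in cuDNN kernels
    torch.backends.cudnn.deterministic = True
    torch.backends.cudnn.benchmark = False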
Example #2
def train(model: nn.Module, train_loader: DataLoader, eval_loader: DataLoader, train_params: TrainParams,
          logger: TrainLogger) -> Metrics:
    """
    Training procedure. Change each part if needed (optimizer, loss, etc.)
    :param model:
    :param train_loader:
    :param eval_loader:
    :param train_params:
    :param logger:
    :return:
    """
    metrics = train_utils.get_zeroed_metrics_dict()
    best_eval_score = 0

    # Create optimizer
    optimizer = torch.optim.Adam(model.parameters(), lr=train_params.lr)

    # Create learning rate scheduler
    scheduler = torch.optim.lr_scheduler.StepLR(optimizer,
                                                step_size=train_params.lr_step_size,
                                                gamma=train_params.lr_gamma)

    for epoch in tqdm(range(train_params.num_epochs)):
        t = time.time()
        metrics = train_utils.get_zeroed_metrics_dict()

        for i, (x, y) in enumerate(train_loader):
            if torch.cuda.is_available():
                x = x.cuda()
                y = y.cuda()

            y_hat = model(x)

            loss = nn.functional.binary_cross_entropy_with_logits(y_hat, y)

            # Optimization step; clip gradients before the optimizer update,
            # otherwise clipping has no effect on the step
            optimizer.zero_grad()
            loss.backward()

            # clip_grad_norm_ returns the total gradient norm (pre-clipping),
            # which doubles as a metric
            metrics['total_norm'] += nn.utils.clip_grad_norm_(model.parameters(), train_params.grad_clip)
            metrics['count_norm'] += 1
            optimizer.step()

            # NOTE: this function computes scores correctly only for a one-hot representation of the logits
            batch_score = train_utils.compute_score_with_logits(y_hat, y.data).sum()
            metrics['train_score'] += batch_score.item()

            metrics['train_loss'] += loss.item() * x.size(0)

            # Report model to tensorboard
            if epoch == 0 and i == 0:
                logger.report_graph(model, x)

        # Learning rate scheduler step
        scheduler.step()

        # Calculate metrics
        metrics['train_loss'] /= len(train_loader.dataset)

        metrics['train_score'] /= len(train_loader.dataset)
        metrics['train_score'] *= 100

        norm = metrics['total_norm'] / metrics['count_norm']

        model.eval()
        metrics['eval_score'], metrics['eval_loss'] = evaluate(model, eval_loader)
        model.train()

        epoch_time = time.time() - t
        logger.write_epoch_statistics(epoch, epoch_time, metrics['train_loss'], norm,
                                      metrics['train_score'], metrics['eval_score'])

        scalars = {'Accuracy/Train': metrics['train_score'],
                   'Accuracy/Validation': metrics['eval_score'],
                   'Loss/Train': metrics['train_loss'],
                   'Loss/Validation': metrics['eval_loss']}

        logger.report_scalars(scalars, epoch)

        if metrics['eval_score'] > best_eval_score:
            best_eval_score = metrics['eval_score']
            if train_params.save_model:
                logger.save_model(model, epoch, optimizer)

    return get_metrics(best_eval_score, metrics['eval_score'], metrics['train_loss'])
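
train() relies on an evaluate(model, eval_loader) helper that is not shown. A plausible sketch consistent with how it is used above (it must return a percentage score and a mean loss; compute_score_with_logits is the same assumed helper as in the training loop):

@torch.no_grad()
def evaluate(model: nn.Module, eval_loader: DataLoader) -> tuple[float, float]:
    """Compute accuracy (in %) and mean loss over the validation set."""
    score = 0.0
    total_loss = 0.0

    for x, y in eval_loader:
        if torch.cuda.is_available():
            x, y = x.cuda(), y.cuda()

        y_hat = model(x)
        loss = nn.functional.binary_cross_entropy_with_logits(y_hat, y)

        total_loss += loss.item() * x.size(0)  # undo the 'mean' reduction
        score += train_utils.compute_score_with_logits(y_hat, y).sum().item()

    n = len(eval_loader.dataset)
    return score / n * 100, total_loss / n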
Example #3
    def run(self):
        self.model.train()
        self.head.train()
        running_loss = 0.
        step = 0
        val_acc = 0.
        val_loss = 0.

        best_step = 0
        # start from the worst possible score: +inf when minimizing, -inf when maximizing
        best_acc = float('inf')
        if self.config.max_or_min == 'max':
            best_acc *= -1

        for epoch in range(self.config.epochs):
            train_logger = TrainLogger(self.config.batch_size, self.config.frequency_log)

            if epoch + 1 in self.config.reduce_lr and not self.config.lr_plateau:
                self.reduce_lr()

            for idx, data in enumerate(self.train_loader):
                imgs, labels = data
                imgs = imgs.to(self.config.device)
                labels = labels.to(self.config.device)

                self.optimizer.zero_grad()

                embeddings = self.model(imgs)

                if self.config.attribute == 'recognition':
                    outputs = self.head(embeddings, labels)
                else:
                    outputs = self.head(embeddings)

                if self.weights is not None:
                    loss = self.config.loss(outputs, labels, weight=self.weights)
                else:
                    loss = self.config.loss(outputs, labels)

                loss.backward()
                running_loss += loss.item()

                self.optimizer.step()

                if step % self.tensorboard_loss_every == 0:
                    loss_board = running_loss / self.tensorboard_loss_every
                    self.writer.add_scalar('train_loss', loss_board, step)
                    running_loss = 0.

                if step % self.evaluate_every == 0 and step != 0:
                    if self.config.val_source is not None:
                        val_acc, val_loss = self.evaluate(step)
                        self.model.train()
                        self.head.train()
                        best_acc, best_step = self.save_model(val_acc, best_acc, step, best_step)
                        print(f'Best accuracy: {best_acc:.5f} at step {best_step}')
                    else:
                        save_state(self.model, self.head, self.optimizer, self.config, 0, step)

                train_logger(epoch, self.config.epochs, idx, len(self.train_loader), loss.item())
                step += 1

            if self.config.lr_plateau:
                self.scheduler.step(val_acc)

            if self.config.early_stop:
                self.early_stop(val_acc)
                if self.early_stop.stop:
                    print("Early stopping model...")
                    break

        val_acc, val_loss = self.evaluate(step)
        best_acc, best_step = self.save_model(val_acc, best_acc, step, best_step)
        print(f'Best accuracy: {best_acc:.5f} at step {best_step}')
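
The runner above uses self.early_stop as a callable with a .stop flag, but neither is defined in the snippet. One way to implement such a helper, assuming patience-based early stopping on the validation metric (class name, defaults, and mode handling are illustrative):

class EarlyStop:
    """Set `stop` after `patience` evaluations without improvement."""

    def __init__(self, patience: int = 5, mode: str = 'max'):
        self.patience = patience
        self.mode = mode
        self.best = float('-inf') if mode == 'max' else float('inf')
        self.counter = 0
        self.stop = False

    def __call__(self, metric: float) -> None:
        improved = metric > self.best if self.mode == 'max' else metric < self.best
        if improved:
            self.best = metric
            self.counter = 0
        else:
            self.counter += 1
            if self.counter >= self.patience:
                self.stop = True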
Example #4
def main(cfg: DictConfig, preprocess_data=True, create_images_h5_file=True):
    """
    Run the code following a given configuration
    :param cfg: configuration file retrieved from hydra framework
    """
    main_utils.init(cfg)
    logger = TrainLogger(exp_name_prefix=cfg['main']['experiment_name_prefix'],
                         logs_dir=cfg['main']['paths']['logs'])
    logger.write(OmegaConf.to_yaml(cfg))

    # Set seed for reproducibility
    main_utils.set_seed(cfg['main']['seed'])

    # ----- Run only once, to create the h5 files -----
    # create h5 files with the images; separate files for train and val
    if create_images_h5_file:
        logger.write('--------creating vision files--------')
        start = time.time()
        vision_utils.create_vision_files(cfg)
        logger.write(f'time to create the image files: {time.time() - start:.1f}s')

    if preprocess_data:
        logger.write('--------preprocess data--------')
        # Load dataset
        train_dataset = MyDataset(cfg, 'train', is_padding=True)

        w2idx, idx2w = train_dataset.w2idx, train_dataset.idx2w
        val_dataset = MyDataset(cfg, 'val', w2idx, idx2w, is_padding=True)

        # save a cPickle
        # with open(cfg['main']["paths"]['train_dataset'], 'wb') as f:
        #     cPickle.dump(train_dataset, f)
        # with open(cfg['main']["paths"]['val_dataset'], 'wb') as f:
        #     cPickle.dump(val_dataset, f)

        # save as torch pth
        train_dataset._save()
        val_dataset._save()

    else:
        logger.write("--------loading datasets--------")
        # load as cPickle
        # train_dataset = cPickle.load(open(cfg['main']["paths"]['train_dataset'], 'rb'))
        # val_dataset = cPickle.load(open(cfg['main']["paths"]['val_dataset'], 'rb'))
        # load as torch pth
        train_dataset = torch.load(cfg['main']["paths"]['train_dataset'])
        val_dataset = torch.load(cfg['main']["paths"]['val_dataset'])

    logger.write('--------create data loaders--------')
    train_loader = DataLoader(train_dataset,
                              cfg['train']['batch_size'],
                              shuffle=True,
                              num_workers=cfg['main']['num_workers'],
                              collate_fn=main_utils.collate_fn)
    val_loader = DataLoader(val_dataset,
                            cfg['train']['batch_size'],
                            shuffle=False,  # no need to shuffle validation data
                            num_workers=cfg['main']['num_workers'],
                            collate_fn=main_utils.collate_fn)

    # logger.write(f'len of train loader: {len(train_loader) * cfg["train"]["batch_size"]}, '
    #       f'len of val loader: {len(val_loader) * cfg["train"]["batch_size"]}')
    # 2127 val samples don't have answers; train num samples: 443760, val num samples: 214368

    # Init model
    logger.write('--------init model---------')
    max_q_len = train_loader.dataset.max_q_length
    num_ans = train_loader.dataset.num_of_ans
    q_name, v_name, vqa_name = cfg['main']['model_names'].values()

    for model_name in [
            'no_pretrain', 'pretrain_4_layers', 'pretrain_8_layers'
    ]:

        model = main_utils.init_models(q_name, v_name, vqa_name, cfg,
                                       max_q_len, num_ans, model_name).model

        # Adding gpus_to_use to cfg is not relevant here; we have a single GPU
        if cfg['main']['parallel']:
            model = torch.nn.DataParallel(model)

        if torch.cuda.is_available():
            model = model.cuda()

        logger.write(main_utils.get_model_string(model))

        # Run model
        logger.write('--------train model---------')
        train_params = train_utils.get_train_params(cfg)

        # Report metrics and hyperparameters to TensorBoard
        metrics = train(model, train_loader, val_loader, train_params, logger,
                        model_name)
        hyper_parameters = main_utils.get_flatten_dict(cfg['train'])

        logger.report_metrics_hyper_params(hyper_parameters, metrics)
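
Both loaders above pass main_utils.collate_fn, which is not shown. A minimal sketch of what such a function might do for variable-length questions; the (image, question, question_len, label) batch layout is an assumption based on how the loader is consumed in Example #5, and the sketch assumes each dataset item is an (image, question, label) triple of tensors:

import torch
from torch.nn.utils.rnn import pad_sequence


def collate_fn(batch):
    """Pad variable-length questions and stack the rest of the batch."""
    images, questions, labels = zip(*batch)

    question_lens = torch.tensor([q.size(0) for q in questions])
    questions = pad_sequence(questions, batch_first=True)  # pads with 0, the assumed PAD index

    return torch.stack(images), questions, question_lens, torch.stack(labels)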
Example #5
def train(model: nn.Module, train_loader: DataLoader, eval_loader: DataLoader, train_params: TrainParams,
          logger: TrainLogger) -> Metrics:
    """
    Training procedure. Change each part if needed (optimizer, loss, etc.)
    :param model:
    :param train_loader:
    :param eval_loader:
    :param train_params:
    :param logger:
    :return:
    """
    metrics = train_utils.get_zeroed_metrics_dict()
    best_eval_score = 0

    # Create optimizer
    optimizer = torch.optim.Adam(model.parameters(), lr=train_params.lr)

    # Create learning rate scheduler
    scheduler = torch.optim.lr_scheduler.StepLR(optimizer,
                                                step_size=train_params.lr_step_size,
                                                gamma=train_params.lr_gamma)

    # The loss module is stateless, so create it once outside the epoch loop
    bce_loss = nn.BCEWithLogitsLoss(reduction='sum')

    for epoch in range(train_params.num_epochs):
        print(f"####### epoch {epoch + 1} ##########")
        t = time.time()
        metrics = train_utils.get_zeroed_metrics_dict()

        for i, (image, question, question_len, label) in tqdm(enumerate(train_loader),
                                                              disable=disable_tqdm,  # assumed module-level flag
                                                              total=len(train_loader)):
            if torch.cuda.is_available():
                image = image.cuda()
                question = question.cuda()
                question_len = question_len.cuda()
                label = label.cuda()

            y_hat = model(image, question, question_len)
            y_hat_probs = nn.functional.log_softmax(y_hat, dim=1)
            # target_probs = nn.functional.softmax(label)
            loss = bce_loss(y_hat, label)

            # Optimization step; clip gradients before the optimizer update,
            # otherwise clipping has no effect on the step
            optimizer.zero_grad()
            loss.backward()

            # clip_grad_norm_ returns the total gradient norm (pre-clipping),
            # which doubles as a metric
            metrics['total_norm'] += nn.utils.clip_grad_norm_(model.parameters(), train_params.grad_clip)
            metrics['count_norm'] += 1
            optimizer.step()

            # Calculate accuracy
            batch_score = train_utils.compute_soft_accuracy(y_hat_probs, label)
            metrics['train_score'] += batch_score.item()

            metrics['train_loss'] += loss.item()

            # Report model to tensorboard
            if epoch == 0 and i == 0:
                logger.report_graph(model, (image, question, question_len))

        # Learning rate scheduler step
        scheduler.step()

        # Calculate metrics
        metrics['train_loss'] /= len(train_loader.dataset)

        metrics['train_score'] /= len(train_loader.dataset)
        metrics['train_score'] *= 100

        norm = metrics['total_norm'] / metrics['count_norm']

        model.eval()
        metrics['eval_score'], metrics['eval_loss'] = evaluate(model, eval_loader)
        model.train()

        epoch_time = time.time() - t
        logger.write_epoch_statistics(epoch, epoch_time, metrics['train_loss'], norm,
                                      metrics['train_score'], metrics['eval_score'], metrics['eval_loss'])

        scalars = {'Accuracy/Train': metrics['train_score'],
                   'Accuracy/Validation': metrics['eval_score'],
                   'Loss/Train': metrics['train_loss'],
                   'Loss/Validation': metrics['eval_loss']}

        logger.report_scalars(scalars, epoch)

        if metrics['eval_score'] > best_eval_score:
            best_eval_score = metrics['eval_score']
            if train_params.save_model:
                logger.save_model(model, epoch, optimizer)

    return get_metrics(best_eval_score, metrics['eval_score'], metrics['train_loss'])
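
train_utils.compute_soft_accuracy is not shown. In VQA-style training the labels are usually soft score vectors (min(#annotators giving the answer / 3, 1) per candidate answer), so one plausible implementation takes, for each sample, the soft score of the predicted answer:

def compute_soft_accuracy(y_hat_probs: torch.Tensor, labels: torch.Tensor) -> torch.Tensor:
    """Sum, over the batch, of the soft label score of each predicted answer.

    y_hat_probs: (batch, num_answers) predicted (log-)probabilities
    labels:      (batch, num_answers) soft VQA scores in [0, 1]
    """
    pred = y_hat_probs.argmax(dim=1, keepdim=True)   # index of the predicted answer
    return labels.gather(dim=1, index=pred).sum()    # soft score of that answer, summed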
Example #6
def main(cfg: DictConfig) -> None:
    """
    Run the code following a given configuration
    :param cfg: configuration file retrieved from hydra framework
    """
    main_utils.init(cfg)
    logger = TrainLogger(exp_name_prefix=cfg['main']['experiment_name_prefix'],
                         logs_dir=cfg['main']['paths']['logs'])
    logger.write(OmegaConf.to_yaml(cfg))

    # Set seed for reproducibility
    main_utils.set_seed(cfg['main']['seed'])

    # Load dataset
    path_image_train = '/datashare/train2014/COCO_train2014_'
    path_question_train = '/datashare/v2_OpenEnded_mscoco_train2014_questions.json'
    train_dataset = VQADataset(path_answers=cfg['main']['paths']['train'],
                               path_image=path_image_train,
                               path_questions=path_question_train)
    path_image_val = '/datashare/val2014/COCO_val2014_'
    path_question_val = '/datashare/v2_OpenEnded_mscoco_val2014_questions.json'
    val_dataset = VQADataset(path_answers=cfg['main']['paths']['validation'],
                             path_image=path_image_val,
                             path_questions=path_question_val,
                             word_dict=train_dataset.word_dict)

    train_loader = DataLoader(train_dataset,
                              cfg['train']['batch_size'],
                              shuffle=True,
                              num_workers=cfg['main']['num_workers'])
    eval_loader = DataLoader(val_dataset,
                             cfg['train']['batch_size'],
                             shuffle=False,  # no need to shuffle validation data
                             num_workers=cfg['main']['num_workers'])

    image_dim = train_dataset.pic_size
    output_dim = 2410  # number of candidate answers
    model = VQAModel(batch_size=cfg['train']['batch_size'],
                     word_vocab_size=train_dataset.vocab_size,
                     lstm_hidden=cfg['train']['num_hid'],
                     output_dim=output_dim,
                     dropout=cfg['train']['dropout'],
                     word_embedding_dim=cfg['train']['word_embedding_dim'],
                     question_output_dim=cfg['train']['question_output_dim'],
                     image_dim=image_dim,
                     last_hidden_fc_dim=cfg['train']['last_hidden_fc_dim'])

    if cfg['main']['parallel']:
        model = torch.nn.DataParallel(model)

    if torch.cuda.is_available():
        model = model.cuda()

    logger.write(main_utils.get_model_string(model))

    # Run model
    train_params = train_utils.get_train_params(cfg)

    # Report metrics and hyperparameters to TensorBoard
    metrics = train(model, train_loader, eval_loader, train_params, logger)
    hyper_parameters = main_utils.get_flatten_dict(cfg['train'])

    logger.report_metrics_hyper_params(hyper_parameters, metrics)
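
The examples repeatedly flatten cfg['train'] with main_utils.get_flatten_dict before reporting hyperparameters. A minimal recursive sketch, assuming the config has been resolved to plain nested dicts (e.g., via OmegaConf.to_container) and that keys are joined with '.':

def get_flatten_dict(cfg: dict, prefix: str = '') -> dict:
    """Flatten a nested mapping into {'a.b.c': value} form for hparam logging."""
    flat = {}
    for key, value in cfg.items():
        name = f'{prefix}.{key}' if prefix else str(key)
        if isinstance(value, dict):
            flat.update(get_flatten_dict(value, name))  # recurse into subsections
        else:
            flat[name] = value
    return flat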