Example #1
    def __init__(self, cfg, split='train', val_fold=0, transforms=None):
        self.data_root = cfg['dataset']['root']
        task = cfg['dataset']['task']
        self.transforms = transforms

        # load folds from json file
        with open(os.path.join(self.data_root, 'folds.json'), 'r') as f:
            folds = json.load(f)
        if split == 'train':
            # training uses every fold except the validation one
            ids = [v for k, v in folds.items() if int(k) != val_fold]
            ids = list(itertools.chain.from_iterable(ids))
        elif split == 'val':
            # exactly one fold matches val_fold
            ids = [v for k, v in folds.items() if int(k) == val_fold]
            ids = ids[0]
        else:
            ids = None

        # 'train' and 'val' both draw from the official training split
        self.split_name = 'training' if split in ('train', 'val') else split

        if task == 3:
            label_file = os.path.join(
                self.data_root, '{}_set_task3'.format(self.split_name),
                '{}_set_task3.txt'.format(self.split_name))
            self.class_list = read_classes('techniques_list_task3.txt')
        elif task == 1:
            label_file = os.path.join(
                self.data_root, '{}_set_task1.txt'.format(self.split_name))
            self.class_list = read_classes('techniques_list_task1-2.txt')
        else:
            raise ValueError('Unsupported task: {}'.format(task))

        with open(label_file, 'r', encoding='utf8') as f:
            self.targets = json.load(f)
        # images live in the task-3 folder; task 1 is text-only, so this
        # path is only meaningful when task == 3
        for t in self.targets:
            t['path'] = os.path.join(self.data_root,
                                     '{}_set_task3'.format(self.split_name))

        if task == 3:
            label_file_dev = os.path.join(self.data_root,
                                          'dev_set_task3_labeled',
                                          'dev_set_task3_labeled.txt')
        elif task == 1:
            label_file_dev = os.path.join(self.data_root, 'dev_set_task1.txt')
        # if the labeled dev set is available, merge it into the training data
        if os.path.isfile(label_file_dev) and self.split_name == 'training':
            with open(label_file_dev, 'r', encoding='utf8') as f:
                targets = json.load(f)
            for t in targets:
                t['path'] = os.path.join(self.data_root,
                                         'dev_set_task3_labeled')
            self.targets.extend(targets)

        # filter targets using the ids
        if split == 'train' or split == 'val':
            self.targets = [t for t in self.targets if t['id'] in ids]
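
For reference, the constructor above expects a folds.json that maps fold indices to lists of meme ids. A minimal usage sketch under that assumption (the config values here are hypothetical):

# assuming folds.json looks like {"0": ["101", "102"], "1": ["103"], ...}
cfg = {'dataset': {'root': 'data/semeval2021-task6', 'task': 3}}
train_set = SemEvalDataset(cfg, split='train', val_fold=0)  # all folds but 0
val_set = SemEvalDataset(cfg, split='val', val_fold=0)      # fold 0 only
# the two splits should never share a meme id
assert not {t['id'] for t in train_set.targets} & {t['id'] for t in val_set.targets}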
Example #2
    pred_file = args.pred_file_path
    gold_file = args.gold_file_path

    if args.log_to_file:
        output_log_file = pred_file + ".log"
        logger.info("Logging execution to file " + output_log_file)
        fileLogger = logging.FileHandler(output_log_file)
        fileLogger.setLevel(logging.DEBUG)
        fileLogger.setFormatter(formatter)
        logger.addHandler(fileLogger)
        logger.setLevel(logging.DEBUG)

    if not os.path.exists(args.classes_file_path):
        logger.error("File doesn't exist: {}".format(args.classes_file_path))
        raise ValueError("File doesn't exist: {}".format(args.classes_file_path))
    CLASSES = read_classes(args.classes_file_path)

    if args.log_to_file:
        logger.info('Reading gold file')
        logger.info('Reading predictions file')
    else:
        logger.info('Reading gold labels from file {}'.format(args.gold_file_path))
        logger.info('Reading predictions from file {}'.format(args.pred_file_path))

    if validate_files(pred_file, gold_file, CLASSES):
        logger.info('Prediction file format is correct')
        macro_f1, micro_f1 = evaluate(pred_file, gold_file, CLASSES)
        logger.info("macro-F1={:.5f}\tmicro-F1={:.5f}".format(
Example #3
def train(opt, config, val_fold=0):
    # torch.cuda.set_enabled_lms(True)
    # if (torch.cuda.get_enabled_lms()):
    #     torch.cuda.set_limit_lms(11000 * 1024 * 1024)
    #     print('[LMS=On limit=' + str(torch.cuda.get_limit_lms()) + ']')

    if 'task' not in config['dataset']:
        config['dataset']['task'] = 3  # for backward compatibility
        print('Manually assigning: task 3')

    logging.basicConfig(format='%(asctime)s %(message)s', level=logging.INFO)
    tb_logger = SummaryWriter(log_dir=opt.logger_name, comment='')
    experiment_path = tb_logger.get_logdir()

    # Dump configuration to experiment path
    copyfile(opt.config, os.path.join(experiment_path, 'config.json'))

    # Load data loaders
    test_transforms = T.Compose([T.Resize(256),
                                 T.CenterCrop(224),
                                 T.ToTensor(),
                                 T.Normalize(mean=[0.485, 0.456, 0.406],
                                             std=[0.229, 0.224, 0.225])])
    train_transforms = T.Compose([T.Resize(256),
                                  T.RandomCrop(224),
                                  T.ToTensor(),
                                  T.Normalize(mean=[0.485, 0.456, 0.406],
                                              std=[0.229, 0.224, 0.225])])

    train_dataset = SemEvalDataset(config, split='train', transforms=train_transforms, val_fold=val_fold)
    val_dataset = SemEvalDataset(config, split='val', transforms=test_transforms, val_fold=val_fold)

    # sanity check: train and val folds must not share any meme ids
    id_intersection = set(x['id'] for x in train_dataset.targets).intersection(
        x['id'] for x in val_dataset.targets)
    assert len(id_intersection) == 0, 'train/val folds overlap'

    if config['dataset']['task'] == 3:
        classes = read_classes('techniques_list_task3.txt')
    elif config['dataset']['task'] == 1:
        classes = read_classes('techniques_list_task1-2.txt')

    collate_fn = Collate(config, classes)
    if config['training'].get('balanced-sampling'):
        # build a multi-hot label matrix (num_samples x num_classes) for the sampler
        classes_ids = [[train_dataset.class_list.index(x) for x in info['labels']]
                       for info in train_dataset.targets]
        labels = np.zeros((len(classes_ids), len(train_dataset.class_list)))
        for l, c in zip(labels, classes_ids):
            l[c] = 1
        sampler = MultilabelBalancedRandomSampler(labels)
    else:
        sampler = None

    train_dataloader = DataLoader(train_dataset, batch_size=config['training']['bs'],
                                  shuffle=sampler is None, num_workers=opt.workers,
                                  collate_fn=collate_fn, sampler=sampler)
    val_dataloader = DataLoader(val_dataset, batch_size=config['training']['bs'],
                                shuffle=False, num_workers=opt.workers,
                                collate_fn=collate_fn)

    # Construct the model
    model = MemeMultiLabelClassifier(config, labels=classes)
    # when resuming, the model is moved to the GPU after the checkpoint is loaded
    if torch.cuda.is_available() and not (opt.resume or opt.load_model):
        model.cuda()

    # Construct the optimizer
    if not config['text-model']['fine-tune'] and not config['image-model']['fine-tune']:
        # backbones frozen: optimize only the joint-processing parameters
        optimizer = torch.optim.Adam(
            [p for n, p in model.named_parameters()
             if 'textual_module' not in n and 'visual_module' not in n],
            lr=config['training']['lr'])
    else:
        # fine-tune the pretrained modules with their own (lower) learning rate
        if config['dataset']['task'] == 3:
            optimizer = torch.optim.Adam([
                {'params': [p for n, p in model.named_parameters()
                            if 'textual_module' not in n and 'visual_module' not in n]},
                {'params': model.textual_module.parameters(),
                 'lr': config['training']['pretrained-modules-lr']},
                {'params': model.visual_module.parameters(),
                 'lr': config['training']['pretrained-modules-lr']}],
                lr=config['training']['lr'])
        elif config['dataset']['task'] == 1:
            # task 1 is text-only: there is no visual module to fine-tune
            optimizer = torch.optim.Adam([
                {'params': [p for n, p in model.named_parameters()
                            if 'textual_module' not in n and 'visual_module' not in n]},
                {'params': model.textual_module.parameters(),
                 'lr': config['training']['pretrained-modules-lr']}],
                lr=config['training']['lr'])
    # LR scheduler
    scheduler_name = config['training']['scheduler']
    if scheduler_name == 'steplr':
        scheduler = torch.optim.lr_scheduler.MultiStepLR(
            optimizer, gamma=config['training']['gamma'],
            milestones=config['training']['milestones'])
    elif scheduler_name is None:
        scheduler = None
    else:
        raise ValueError('{} scheduler is not available'.format(scheduler_name))


    # optionally resume from a checkpoint
    start_epoch = 0
    # if opt.resume or opt.load_model:
    #     filename = opt.resume if opt.resume else opt.load_model
    #     if os.path.isfile(filename):
    #         print("=> loading checkpoint '{}'".format(filename))
    #         checkpoint = torch.load(filename, map_location='cpu')
    #         model.load_state_dict(checkpoint['model'], strict=False)
    #         if torch.cuda.is_available():
    #             model.cuda()
    #         if opt.resume:
    #             start_epoch = checkpoint['epoch']
    #             # best_rsum = checkpoint['best_rsum']
    #             optimizer.load_state_dict(checkpoint['optimizer'])
    #             if checkpoint['scheduler'] is not None and not opt.reinitialize_scheduler:
    #                 scheduler.load_state_dict(checkpoint['scheduler'])
    #             # Eiters is used to show logs as the continuation of another
    #             # training
    #             model.Eiters = checkpoint['Eiters']
    #             print("=> loaded checkpoint '{}' (epoch {})"
    #                   .format(opt.resume, start_epoch))
    #         else:
    #             print("=> loaded only model from checkpoint '{}'"
    #                   .format(opt.load_model))
    #     else:
    #         print("=> no checkpoint found at '{}'".format(opt.resume))

    model.train()

    # Train loop
    mean_loss = 0
    progress_bar = tqdm.trange(start_epoch, opt.num_epochs)
    progress_bar.set_description('Train')
    best_f1 = 0.0
    for epoch in progress_bar:
        for it, (image, text, text_len, labels, ids) in enumerate(train_dataloader):
            global_iteration = epoch * len(train_dataloader) + it

            if torch.cuda.is_available():
                image = image.cuda() if image is not None else None
                text = text.cuda()
                labels = labels.cuda()

            # forward the model
            optimizer.zero_grad()

            loss = model(image, text, text_len, labels)
            loss.backward()
            optimizer.step()
            mean_loss += loss.item()

            if global_iteration % opt.log_step == 0:
                mean_loss /= opt.log_step
                progress_bar.set_postfix(dict(loss='{:.2}'.format(mean_loss)))
                mean_loss = 0

            tb_logger.add_scalar("Training/Epoch", epoch, global_iteration)
            tb_logger.add_scalar("Training/Loss", loss.item(), global_iteration)
            tb_logger.add_scalar("Training/Learning_Rate", optimizer.param_groups[0]['lr'], global_iteration)

            if global_iteration % opt.val_step == 0:
                # validate (using different thresholds)
                metrics = validate(val_dataloader, model, classes, thresholds=[0.3, 0.5, 0.8])
                tb_logger.add_scalars("Validation/F1", metrics, global_iteration)
                print(metrics)
                # progress_bar.set_postfix(dict(macroF1='{:.2}'.format(metrics['macroF1_thr=0.5']), microF1='{:.2}'.format(metrics['microF1_thr=0.5'])))

                # save best model (selection criterion: macroF1 + microF1 at thr=0.3)
                if metrics['macroF1_thr=0.3'] + metrics['microF1_thr=0.3'] > best_f1:
                    print('Saving best model...')
                    checkpoint = {
                        'cfg': config,
                        'epoch': epoch,
                        'model': (model.joint_processing_module.state_dict()
                                  if not config['text-model']['fine-tune']
                                  and not config['image-model']['fine-tune']
                                  else model.state_dict())}
                        # 'optimizer': optimizer.state_dict(),
                        # 'scheduler': scheduler.state_dict()}
                    latest = os.path.join(experiment_path, 'model_best_fold{}.pt'.format(val_fold))
                    torch.save(checkpoint, latest)
                    best_f1 = metrics['macroF1_thr=0.3'] + metrics['microF1_thr=0.3']

        if scheduler is not None:
            scheduler.step()
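
The validate helper called in the loop above is not shown; it returns a dict keyed like 'macroF1_thr=0.3'. A rough sketch of what such a function could look like, using sklearn's f1_score and mirroring the model call signature from the training loop (an assumption, not the original implementation):

import numpy as np
import torch
from sklearn.metrics import f1_score

def validate(dataloader, model, classes, thresholds=(0.3, 0.5, 0.8)):
    model.eval()
    all_probs, all_labels = [], []
    with torch.no_grad():
        for image, text, text_len, labels, ids in dataloader:
            if torch.cuda.is_available():
                image = image.cuda() if image is not None else None
                text = text.cuda()
            probs = model(image, text, text_len, return_probs=True)
            all_probs.append(probs.cpu().numpy())
            all_labels.append(labels.cpu().numpy())
    probs = np.concatenate(all_probs)
    gold = np.concatenate(all_labels).astype(int)
    metrics = {}
    for thr in thresholds:
        pred = (probs > thr).astype(int)
        metrics['macroF1_thr={}'.format(thr)] = f1_score(
            gold, pred, average='macro', zero_division=0)
        metrics['microF1_thr={}'.format(thr)] = f1_score(
            gold, pred, average='micro', zero_division=0)
    model.train()
    return metrics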
Example #4
def main(opt):
    checkpoint = torch.load(opt.checkpoint, map_location='cpu')
    cfg = checkpoint['cfg']
    if 'task' not in cfg['dataset']:
        cfg['dataset']['task'] = 3  # for back compatibility
        print('Manually assigning: task 3')

    if cfg['dataset']['task'] == 3:
        classes = read_classes('techniques_list_task3.txt')
    elif cfg['dataset']['task'] == 1:
        classes = read_classes('techniques_list_task1-2.txt')

    if opt.ensemble or opt.cross_validation:
        checkpoints_folder = os.path.split(opt.checkpoint)[0]
        checkpoints_files = [
            os.path.join(checkpoints_folder, f)
            for f in os.listdir(checkpoints_folder) if f.endswith('.pt')
        ]
    else:
        checkpoints_files = [opt.checkpoint]

    ensemble_models = []
    for chkp in checkpoints_files:
        model = models.MemeMultiLabelClassifier(cfg, classes)
        checkpoint = torch.load(chkp, map_location='cpu')
        # Load weights to resume from
        if not cfg['text-model']['fine-tune'] and not cfg['image-model'][
                'fine-tune']:
            # the visual and textual backbones were frozen during training,
            # so the checkpoint only holds the joint-processing module
            model.joint_processing_module.load_state_dict(checkpoint['model'])
        else:
            model.load_state_dict(checkpoint['model'])
        if torch.cuda.is_available():
            model.cuda()
        model.eval()
        ensemble_models.append(model)

    # Load data loaders
    test_transforms = T.Compose([
        T.Resize(256),
        T.CenterCrop(224),
        T.ToTensor(),
        T.Normalize(mean=[0.485, 0.456, 0.406], std=[0.229, 0.224, 0.225])
    ])
    collate_fn = Collate(cfg, classes)

    if opt.cross_validation:
        datasets = [
            SemEvalDataset(cfg,
                           split='val',
                           transforms=test_transforms,
                           val_fold=fold)
            for fold in range(len(checkpoints_files))
        ]
        dataloaders = [
            DataLoader(dataset,
                       batch_size=8,
                       shuffle=False,
                       num_workers=2,
                       collate_fn=collate_fn) for dataset in datasets
        ]
    else:
        if opt.validate:
            dataset = SemEvalDataset(cfg,
                                     split='val',
                                     transforms=test_transforms,
                                     val_fold=opt.val_fold)
        elif opt.test:
            dataset = SemEvalDataset(cfg,
                                     split='test',
                                     transforms=test_transforms,
                                     val_fold=opt.val_fold)
        else:
            dataset = SemEvalDataset(cfg,
                                     split='dev',
                                     transforms=test_transforms)

        dataloader = DataLoader(dataset,
                                batch_size=8,
                                shuffle=False,
                                num_workers=2,
                                collate_fn=collate_fn)
        dataloaders = [dataloader]

    resumed_logdir, resumed_filename = os.path.split(opt.checkpoint)
    del checkpoint  # the weights now live inside the models; free the state dict
    print('Model {} resumed from {}, saving results in this directory...'.format(
        resumed_filename, resumed_logdir))

    predictions = {} if opt.cross_validation else []
    metrics = {}
    thr = opt.threshold
    if opt.cross_validation:
        for idx, (dataloader, model) in enumerate(
                tqdm.tqdm(zip(dataloaders, ensemble_models))):
            for it, (image, text, text_len, labels,
                     ids) in enumerate(tqdm.tqdm(dataloader)):
                if torch.cuda.is_available():
                    image = image.cuda() if image is not None else None
                    text = text.cuda()
                    # labels = labels.cuda()

                # cross-validation
                with torch.no_grad():
                    pred_probs = model(image,
                                       text,
                                       text_len,
                                       return_probs=True)
                    valid_pred = pred_probs > thr
                    pred_classes = id_to_classes(valid_pred, classes)

                    # loop over every element of the batch
                    for id_, labels in zip(ids, pred_classes):
                        if idx not in predictions:
                            predictions[idx] = []
                        predictions[idx].append({'id': id_, 'labels': labels})
    else:
        for it, (image, text, text_len, labels,
                 ids) in enumerate(tqdm.tqdm(dataloader)):
            if torch.cuda.is_available():
                image = image.cuda() if image is not None else None
                text = text.cuda()
                # labels = labels.cuda()

            ensemble_predictions = []
            with torch.no_grad():
                for model in ensemble_models:
                    pred_probs = model(image,
                                       text,
                                       text_len,
                                       return_probs=True)
                    ensemble_predictions.append(pred_probs)
                prob_ensemble = torch.stack(ensemble_predictions,
                                            dim=1).mean(dim=1)
                class_ensemble = prob_ensemble > thr
                pred_classes = id_to_classes(class_ensemble, classes)

            # loop over every element of the batch
            for id_, labels in zip(ids, pred_classes):
                predictions.append({'id': id_, 'labels': labels})

    if opt.cross_validation:
        mean_macro_f1 = 0
        mean_micro_f1 = 0
        for k in range(len(predictions)):
            dataloader = dataloaders[k]
            preds = predictions[k]
            macro_f1, micro_f1 = evaluate(preds, dataloader.dataset.targets,
                                          classes)
            mean_macro_f1 += macro_f1
            mean_micro_f1 += micro_f1
        mean_micro_f1 /= len(predictions)
        mean_macro_f1 /= len(predictions)
        out_string = 'Mean-MacroF1: {}\nMean-MicroF1: {}'.format(
            mean_macro_f1, mean_micro_f1)
        out_file = os.path.join(resumed_logdir, 'cross_validation_results.log')
        with open(out_file, 'w') as f:
            f.write(out_string)
        print(out_string)

    elif opt.validate:
        macro_f1, micro_f1 = evaluate(predictions, dataloader.dataset.targets,
                                      classes)
        print('MacroF1: {}\nMicroF1: {}'.format(macro_f1, micro_f1))
    else:
        # dump predictions on json file
        out_json = os.path.join(resumed_logdir,
                                'predictions_thr{}.json'.format(thr))
        with open(out_json, 'w') as f:
            json.dump(predictions, f)

        # cross-check the dumped file against the official format checker
        if not check_format_task1_task3(out_json, CLASSES=classes):
            print('Saved file has incorrect format! Retry...')
        print('Predictions dumped to {}'.format(out_json))
        print('Num memes: {}'.format(len(predictions)))

    print('DONE!')
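
The id_to_classes helper used above maps a boolean prediction matrix back to lists of class names. A minimal sketch consistent with its call sites (an assumption, not the original code):

def id_to_classes(classes_ids, classes):
    # classes_ids: (batch, num_classes) boolean tensor/array;
    # returns one list of predicted class names per batch element
    out = []
    for row in classes_ids:
        out.append([classes[i] for i, flag in enumerate(row) if flag])
    return out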