Example #1
    def test_concat_two_non_singletons(self):
        result = ConcatDataset([[0, 1, 2, 3, 4],
                                [5, 6, 7, 8, 9]])
        self.assertEqual(10, len(result))
        self.assertEqual(0, result[0])
        self.assertEqual(5, result[5])
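
The test above pins down ConcatDataset's indexing contract. As a standalone, minimal sketch (plain Python lists stand in for real datasets, as in the test itself), the same behaviour can be reproduced and inspected via the cumulative_sizes attribute:

# Minimal sketch: ConcatDataset maps a global index onto its component
# datasets using cumulative sizes; plain lists are valid datasets here.
from torch.utils.data import ConcatDataset

result = ConcatDataset([[0, 1, 2, 3, 4],
                        [5, 6, 7, 8, 9]])
print(len(result))              # 10
print(result.cumulative_sizes)  # [5, 10]: indices 0-4 hit the first list, 5-9 the second
print(result[4], result[5])     # 4 5
print(result[-1])               # 9 (negative indices are supported)
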
Example #2
    return BenchmarkTasksets(train_tasks, valid_tasks, test_tasks)


if __name__ == '__main__':
    tasks = get_few_shot_tasksets(dataset='cifar-fs')  # unused: overwritten by the next line
    tasks = get_normal_tasksets(dataset='cifar-fs')
    # tasks = get_normal_tasksets(dataset='cifar-fc100')
    # batch = tasks.train.sample()
    # x, y = batch
    # print(x.size())
    # print(y.size())
    # print(y)
    # x, y = tasks.train[0]
    # print(x.size())
    # print(y.size())
    from torch.utils.data import ConcatDataset

    import torch.utils.data
    # Chain the two task splits into a single indexable dataset
    dataset = ConcatDataset([tasks[1], tasks[0]])
    # print(len(tasks[0]))
    # print(len(tasks[1]))
    # print(len(dataset))
    # loader = torch.utils.data.DataLoader(dataset, batch_size=100)
    # for x, y in loader:
    #     print(x.size(), y.size())
    # Wrap the concatenated dataset for meta-learning: MetaDataset builds the
    # label-to-index bookkeeping and TaskDataset samples tasks from it.
    dset = l2l.data.MetaDataset(dataset)
    tsk = l2l.data.TaskDataset(dset)
    batch = tsk.sample()
    print(batch)
Example #3
        #x, y, d, numPedsList, PedsList, target_ids

        return seq_data, seq_num_persons_list, seq_persons_list, folder_name

    def __len__(self):
        # Number of items in the dataset (one sequence per tracked person)
        return len(self.person_to_frames)


# Test Block
from os import listdir
from os.path import isfile, join
path = '../data/dataloader/'
files_list = [f for f in listdir(path) if isfile(join(path, f))]

all_datasets = ConcatDataset([PedTrajectoryDataset(join(path, file)) for file in files_list])

train_loader = DataLoader(all_datasets, batch_size=2, shuffle=False, num_workers=0, pin_memory=False, collate_fn=lambda x: x)

#print(len(train_loader.dataset.datasets))

for i, batch in enumerate(train_loader):
    print(i)
    print(batch)
    print('************************')

print("reached")

'''
d = PedTrajectoryDataset('../data/train/overfit/x.txt')
batch_size = 1
Example #4
def kitti_zhou_train(resize_height, resize_width, crop_height, crop_width,
                     batch_size, num_workers):
    """A loader that loads image sequences for depth training from the
    kitti training set.
    This loader returns sequences from the left camera, as well as from the right camera.
    """

    transforms_common = [
        tf.RandomHorizontalFlip(),
        tf.CreateScaledImage(),
        tf.Resize((resize_height, resize_width),
                  image_types=('color', 'depth', 'camera_intrinsics', 'K')),
        tf.ConvertDepth(),
        tf.CreateColoraug(new_element=True),
        tf.ColorJitter(brightness=0.2,
                       contrast=0.2,
                       saturation=0.2,
                       hue=0.1,
                       gamma=0.0,
                       fraction=0.5),
        tf.RemoveOriginals(),
        tf.ToTensor(),
        tf.NormalizeZeroMean(),
        tf.AddKeyValue('domain', 'kitti_zhou_train_depth'),
        tf.AddKeyValue('purposes', ('depth', 'domain')),
    ]

    dataset_name = 'kitti'

    cfg_common = {
        'dataset': dataset_name,
        'trainvaltest_split': 'train',
        'video_mode': 'video',
        'stereo_mode': 'mono',
        'split': 'zhou_split',
        'video_frames': (0, -1, 1),
        'disable_const_items': False
    }

    cfg_left = {'keys_to_load': ('color', ), 'keys_to_video': ('color', )}

    cfg_right = {
        'keys_to_load': ('color_right', ),
        'keys_to_video': ('color_right', )
    }

    dataset_left = StandardDataset(data_transforms=transforms_common,
                                   **cfg_left,
                                   **cfg_common)

    dataset_right = StandardDataset(data_transforms=[tf.ExchangeStereo()] +
                                    transforms_common,
                                    **cfg_right,
                                    **cfg_common)

    dataset = ConcatDataset((dataset_left, dataset_right))

    loader = DataLoader(dataset,
                        batch_size=batch_size,
                        shuffle=True,
                        num_workers=num_workers,
                        pin_memory=True,
                        drop_last=True)

    print(
        f"  - Can use {len(dataset)} images from the kitti (zhou_split) train split for depth training",
        flush=True)

    return loader
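
The docstring above describes concatenating a left-camera dataset with a right-camera dataset (the latter with an extra stereo-exchange transform) so that one shuffled loader sees both. A generic, minimal sketch of that pattern, with a hypothetical ViewDataset standing in for the project's StandardDataset and tf transform pipeline:

# Minimal sketch (ViewDataset is a stand-in, not the project's StandardDataset):
# two variants of the same data are concatenated and shuffled by one DataLoader.
import torch
from torch.utils.data import ConcatDataset, DataLoader, Dataset


class ViewDataset(Dataset):
    """Wraps a tensor of images; optionally mirrors each sample, mimicking a stereo exchange."""

    def __init__(self, images, flip=False):
        self.images = images
        self.flip = flip

    def __len__(self):
        return len(self.images)

    def __getitem__(self, idx):
        img = self.images[idx]
        return torch.flip(img, dims=[2]) if self.flip else img


images = torch.randn(8, 3, 32, 32)
dataset = ConcatDataset((ViewDataset(images), ViewDataset(images, flip=True)))
loader = DataLoader(dataset, batch_size=4, shuffle=True, drop_last=True)
for batch in loader:
    print(batch.shape)  # torch.Size([4, 3, 32, 32])
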
Example #5
def train(vocabs, char_vocab, tag_vocab, train_sets, dev_sets, test_sets,
          unlabeled_sets):
    """
    train_sets, dev_sets, test_sets: dict[lang] -> AmazonDataset
    For unlabeled langs, no train_sets are available
    """
    # dataset loaders
    train_loaders, unlabeled_loaders = {}, {}
    train_iters, unlabeled_iters, d_unlabeled_iters = {}, {}, {}
    dev_loaders, test_loaders = {}, {}
    my_collate = utils.sorted_collate if opt.model == 'lstm' else utils.unsorted_collate
    for lang in opt.langs:
        train_loaders[lang] = DataLoader(train_sets[lang],
                                         opt.batch_size,
                                         shuffle=True,
                                         collate_fn=my_collate)
        train_iters[lang] = iter(train_loaders[lang])
    for lang in opt.dev_langs:
        dev_loaders[lang] = DataLoader(dev_sets[lang],
                                       opt.batch_size,
                                       shuffle=False,
                                       collate_fn=my_collate)
        test_loaders[lang] = DataLoader(test_sets[lang],
                                        opt.batch_size,
                                        shuffle=False,
                                        collate_fn=my_collate)
    for lang in opt.all_langs:
        if lang in opt.unlabeled_langs:
            uset = unlabeled_sets[lang]
        else:
            # for labeled langs, consider which data to use as unlabeled set
            if opt.unlabeled_data == 'both':
                uset = ConcatDataset([train_sets[lang], unlabeled_sets[lang]])
            elif opt.unlabeled_data == 'unlabeled':
                uset = unlabeled_sets[lang]
            elif opt.unlabeled_data == 'train':
                uset = train_sets[lang]
            else:
                raise Exception(
                    f'Unknown options for the unlabeled data usage: {opt.unlabeled_data}'
                )
        unlabeled_loaders[lang] = DataLoader(uset,
                                             opt.batch_size,
                                             shuffle=True,
                                             collate_fn=my_collate)
        unlabeled_iters[lang] = iter(unlabeled_loaders[lang])
        d_unlabeled_iters[lang] = iter(unlabeled_loaders[lang])

    # embeddings
    emb = MultiLangWordEmb(vocabs, char_vocab, opt.use_wordemb,
                           opt.use_charemb).to(opt.device)
    # models
    F_s = None
    F_p = None
    C, D = None, None
    num_experts = len(opt.langs) + 1 if opt.expert_sp else len(opt.langs)
    if opt.model.lower() == 'lstm':
        if opt.shared_hidden_size > 0:
            F_s = LSTMFeatureExtractor(opt.total_emb_size, opt.F_layers,
                                       opt.shared_hidden_size,
                                       opt.word_dropout, opt.dropout,
                                       opt.bdrnn)
        if opt.private_hidden_size > 0:
            if not opt.concat_sp:
                assert opt.shared_hidden_size == opt.private_hidden_size, "shared dim != private dim when using add_sp!"
            F_p = nn.Sequential(
                LSTMFeatureExtractor(opt.total_emb_size, opt.F_layers,
                                     opt.private_hidden_size, opt.word_dropout,
                                     opt.dropout, opt.bdrnn),
                MixtureOfExperts(opt.MoE_layers, opt.private_hidden_size,
                                 len(opt.langs), opt.private_hidden_size,
                                 opt.private_hidden_size, opt.dropout,
                                 opt.MoE_bn, False))
    else:
        raise Exception(f'Unknown model architecture {opt.model}')

    if opt.C_MoE:
        C = SpMixtureOfExperts(
            opt.C_layers, opt.shared_hidden_size, opt.private_hidden_size,
            opt.concat_sp,
            num_experts, opt.shared_hidden_size + opt.private_hidden_size,
            len(tag_vocab), opt.mlp_dropout, opt.C_bn)
    else:
        C = SpMlpTagger(opt.C_layers, opt.shared_hidden_size,
                        opt.private_hidden_size, opt.concat_sp,
                        opt.shared_hidden_size + opt.private_hidden_size,
                        len(tag_vocab), opt.mlp_dropout, opt.C_bn)
    if opt.shared_hidden_size > 0 and opt.n_critic > 0:
        if opt.D_model.lower() == 'lstm':
            d_args = {
                'num_layers': opt.D_lstm_layers,
                'input_size': opt.shared_hidden_size,
                'hidden_size': opt.shared_hidden_size,
                'word_dropout': opt.D_word_dropout,
                'dropout': opt.D_dropout,
                'bdrnn': opt.D_bdrnn,
                'attn_type': opt.D_attn
            }
        elif opt.D_model.lower() == 'cnn':
            d_args = {
                'num_layers': 1,
                'input_size': opt.shared_hidden_size,
                'hidden_size': opt.shared_hidden_size,
                'kernel_num': opt.D_kernel_num,
                'kernel_sizes': opt.D_kernel_sizes,
                'word_dropout': opt.D_word_dropout,
                'dropout': opt.D_dropout
            }
        else:
            d_args = None

        if opt.D_model.lower() == 'mlp':
            D = MLPLanguageDiscriminator(opt.D_layers, opt.shared_hidden_size,
                                         opt.shared_hidden_size,
                                         len(opt.all_langs), opt.loss,
                                         opt.D_dropout, opt.D_bn)
        else:
            D = LanguageDiscriminator(opt.D_model, opt.D_layers,
                                      opt.shared_hidden_size,
                                      opt.shared_hidden_size,
                                      len(opt.all_langs), opt.D_dropout,
                                      opt.D_bn, d_args)

    F_s = F_s.to(opt.device) if F_s else None
    C = C.to(opt.device)
    D = D.to(opt.device) if D else None
    if F_p:
        F_p = F_p.to(opt.device)
    # optimizers
    params = itertools.chain(emb.parameters(),
                             F_s.parameters() if F_s else [],
                             C.parameters(),
                             F_p.parameters() if F_p else [])
    optimizer = optim.Adam(filter(lambda p: p.requires_grad, params),
                           lr=opt.learning_rate,
                           weight_decay=opt.weight_decay)
    if D:
        optimizerD = optim.Adam(D.parameters(),
                                lr=opt.D_learning_rate,
                                weight_decay=opt.D_weight_decay)

    # testing
    if opt.test_only:
        log.info(f'Loading model from {opt.model_save_file}...')
        if F_s:
            F_s.load_state_dict(
                torch.load(os.path.join(opt.model_save_file, f'netF_s.pth')))
        for lang in opt.all_langs:
            F_p.load_state_dict(
                torch.load(os.path.join(opt.model_save_file, f'net_F_p.pth')))
        C.load_state_dict(
            torch.load(os.path.join(opt.model_save_file, f'netC.pth')))
        if D:
            D.load_state_dict(
                torch.load(os.path.join(opt.model_save_file, f'netD.pth')))

        log.info('Evaluating validation sets:')
        acc = {}
        log.info(dev_loaders)
        log.info(vocabs)
        for lang in opt.all_langs:
            acc[lang] = evaluate(f'{lang}_dev', dev_loaders[lang],
                                 vocabs[lang], tag_vocab, emb, lang, F_s, F_p,
                                 C)
        avg_acc = sum([acc[d] for d in opt.dev_langs]) / len(opt.dev_langs)
        log.info(f'Average validation accuracy: {avg_acc}')
        log.info('Evaluating test sets:')
        test_acc = {}
        for lang in opt.all_langs:
            test_acc[lang] = evaluate(f'{lang}_test', test_loaders[lang],
                                      vocabs[lang], tag_vocab, emb, lang, F_s,
                                      F_p, C)
        avg_test_acc = sum([test_acc[d]
                            for d in opt.dev_langs]) / len(opt.dev_langs)
        log.info(f'Average test accuracy: {avg_test_acc}')
        return {'valid': acc, 'test': test_acc}

    # training
    best_acc, best_avg_acc = defaultdict(float), 0.0
    epochs_since_decay = 0
    # lambda scheduling
    if opt.lambd > 0 and opt.lambd_schedule:
        opt.lambd_orig = opt.lambd
    num_iter = int(utils.gmean([len(train_loaders[l]) for l in opt.langs]))
    # adapt max_epoch
    if opt.max_epoch > 0 and num_iter * opt.max_epoch < 15000:
        opt.max_epoch = 15000 // num_iter
        log.info(f"Setting max_epoch to {opt.max_epoch}")
    for epoch in range(opt.max_epoch):
        emb.train()
        if F_s:
            F_s.train()
        C.train()
        if D:
            D.train()
        if F_p:
            F_p.train()

        # lambda scheduling
        if hasattr(opt, 'lambd_orig') and opt.lambd_schedule:
            if epoch == 0:
                opt.lambd = opt.lambd_orig
            elif epoch == 5:
                opt.lambd = 10 * opt.lambd_orig
            elif epoch == 15:
                opt.lambd = 100 * opt.lambd_orig
            log.info(f'Scheduling lambda = {opt.lambd}')

        # training accuracy
        correct, total = defaultdict(int), defaultdict(int)
        gate_correct = defaultdict(int)
        c_gate_correct = defaultdict(int)
        # D accuracy
        d_correct, d_total = 0, 0
        for i in tqdm(range(num_iter), ascii=True):
            # D iterations
            if opt.shared_hidden_size > 0:
                utils.freeze_net(emb)
                utils.freeze_net(F_s)
                utils.freeze_net(F_p)
                utils.freeze_net(C)
                utils.unfreeze_net(D)
                # WGAN n_critic trick since D trains slower
                n_critic = opt.n_critic
                if opt.wgan_trick:
                    if opt.n_critic > 0 and ((epoch == 0 and i < 25)
                                             or i % 500 == 0):
                        n_critic = 100

                for _ in range(n_critic):
                    D.zero_grad()
                    loss_d = {}
                    lang_features = {}
                    # train on both labeled and unlabeled langs
                    for lang in opt.all_langs:
                        # targets not used
                        d_inputs, _ = utils.endless_get_next_batch(
                            unlabeled_loaders, d_unlabeled_iters, lang)
                        d_inputs, d_lengths, mask, d_chars, d_char_lengths = d_inputs
                        d_embeds = emb(lang, d_inputs, d_chars, d_char_lengths)
                        shared_feat = F_s((d_embeds, d_lengths))
                        if opt.grad_penalty != 'none':
                            lang_features[lang] = shared_feat.detach()
                        if opt.D_model.lower() == 'mlp':
                            d_outputs = D(shared_feat)
                            # if token-level D, we can reuse the gate label generator
                            d_targets = utils.get_gate_label(d_outputs,
                                                             lang,
                                                             mask,
                                                             False,
                                                             all_langs=True)
                            d_total += torch.sum(d_lengths).item()
                        else:
                            d_outputs = D((shared_feat, d_lengths))
                            d_targets = utils.get_lang_label(
                                opt.loss, lang, len(d_lengths))
                            d_total += len(d_lengths)
                        # D accuracy
                        _, pred = torch.max(d_outputs, -1)
                        # d_total += len(d_lengths)
                        d_correct += (pred == d_targets).sum().item()
                        l_d = functional.nll_loss(
                            d_outputs.view(-1, D.num_langs),
                            d_targets.view(-1),
                            ignore_index=-1)
                        l_d.backward()
                        loss_d[lang] = l_d.item()
                    # gradient penalty
                    if opt.grad_penalty != 'none':
                        gp = utils.calc_gradient_penalty(
                            D,
                            lang_features,
                            onesided=opt.onesided_gp,
                            interpolate=(opt.grad_penalty == 'wgan'))
                        gp.backward()
                    optimizerD.step()

            # F&C iteration
            utils.unfreeze_net(emb)
            if opt.use_wordemb and opt.fix_emb:
                for lang in emb.langs:
                    emb.wordembs[lang].weight.requires_grad = False
            if opt.use_charemb and opt.fix_charemb:
                emb.charemb.weight.requires_grad = False
            utils.unfreeze_net(F_s)
            utils.unfreeze_net(F_p)
            utils.unfreeze_net(C)
            utils.freeze_net(D)
            emb.zero_grad()
            if F_s:
                F_s.zero_grad()
            if F_p:
                F_p.zero_grad()
            C.zero_grad()
            # optimizer.zero_grad()
            for lang in opt.langs:
                inputs, targets = utils.endless_get_next_batch(
                    train_loaders, train_iters, lang)
                inputs, lengths, mask, chars, char_lengths = inputs
                bs, seq_len = inputs.size()
                embeds = emb(lang, inputs, chars, char_lengths)
                shared_feat, private_feat = None, None
                if opt.shared_hidden_size > 0:
                    shared_feat = F_s((embeds, lengths))
                if opt.private_hidden_size > 0:
                    private_feat, gate_outputs = F_p((embeds, lengths))
                if opt.C_MoE:
                    c_outputs, c_gate_outputs = C((shared_feat, private_feat))
                else:
                    c_outputs = C((shared_feat, private_feat))
                # targets are padded with -1
                l_c = functional.nll_loss(c_outputs.view(bs * seq_len, -1),
                                          targets.view(-1),
                                          ignore_index=-1)
                # gate loss
                if F_p:
                    gate_targets = utils.get_gate_label(
                        gate_outputs, lang, mask, False)
                    l_gate = functional.cross_entropy(
                        gate_outputs.view(bs * seq_len, -1),
                        gate_targets.view(-1),
                        ignore_index=-1)
                    l_c += opt.gate_loss_weight * l_gate
                    _, gate_pred = torch.max(
                        gate_outputs.view(bs * seq_len, -1), -1)
                    gate_correct[lang] += (
                        gate_pred == gate_targets.view(-1)).sum().item()
                if opt.C_MoE and opt.C_gate_loss_weight > 0:
                    c_gate_targets = utils.get_gate_label(
                        c_gate_outputs, lang, mask, opt.expert_sp)
                    _, c_gate_pred = torch.max(
                        c_gate_outputs.view(bs * seq_len, -1), -1)
                    if opt.expert_sp:
                        l_c_gate = functional.binary_cross_entropy_with_logits(
                            mask.unsqueeze(-1) * c_gate_outputs,
                            c_gate_targets)
                        c_gate_correct[lang] += torch.index_select(
                            c_gate_targets.view(bs * seq_len, -1), -1,
                            c_gate_pred.view(bs * seq_len)).sum().item()
                    else:
                        l_c_gate = functional.cross_entropy(
                            c_gate_outputs.view(bs * seq_len, -1),
                            c_gate_targets.view(-1),
                            ignore_index=-1)
                        c_gate_correct[lang] += (
                            c_gate_pred == c_gate_targets.view(-1)).sum().item()
                    l_c += opt.C_gate_loss_weight * l_c_gate
                l_c.backward()
                _, pred = torch.max(c_outputs, -1)
                total[lang] += torch.sum(lengths).item()
                correct[lang] += (pred == targets).sum().item()

            # update F with D gradients on all langs
            if D:
                for lang in opt.all_langs:
                    inputs, _ = utils.endless_get_next_batch(
                        unlabeled_loaders, unlabeled_iters, lang)
                    inputs, lengths, mask, chars, char_lengths = inputs
                    embeds = emb(lang, inputs, chars, char_lengths)
                    shared_feat = F_s((embeds, lengths))
                    # d_outputs = D((shared_feat, lengths))
                    if opt.D_model.lower() == 'mlp':
                        d_outputs = D(shared_feat)
                        # if token-level D, we can reuse the gate label generator
                        d_targets = utils.get_gate_label(d_outputs,
                                                         lang,
                                                         mask,
                                                         False,
                                                         all_langs=True)
                    else:
                        d_outputs = D((shared_feat, lengths))
                        d_targets = utils.get_lang_label(
                            opt.loss, lang, len(lengths))
                    l_d = functional.nll_loss(d_outputs.view(-1, D.num_langs),
                                              d_targets.view(-1),
                                              ignore_index=-1)
                    if opt.lambd > 0:
                        l_d *= -opt.lambd
                    l_d.backward()

            optimizer.step()

        # end of epoch
        log.info('Ending epoch {}'.format(epoch + 1))
        if d_total > 0:
            log.info('D Training Accuracy: {}%'.format(100.0 * d_correct /
                                                       d_total))
        log.info('Training accuracy:')
        log.info('\t'.join(opt.langs))
        log.info('\t'.join(
            [str(100.0 * correct[d] / total[d]) for d in opt.langs]))
        log.info('Gate accuracy:')
        log.info('\t'.join(
            [str(100.0 * gate_correct[d] / total[d]) for d in opt.langs]))
        log.info('Tagger Gate accuracy:')
        log.info('\t'.join(
            [str(100.0 * c_gate_correct[d] / total[d]) for d in opt.langs]))
        log.info('Evaluating validation sets:')
        acc = {}
        for lang in opt.dev_langs:
            acc[lang] = evaluate(f'{lang}_dev', dev_loaders[lang],
                                 vocabs[lang], tag_vocab, emb, lang, F_s, F_p,
                                 C)
        avg_acc = sum([acc[d] for d in opt.dev_langs]) / len(opt.dev_langs)
        log.info(f'Average validation accuracy: {avg_acc}')
        log.info('Evaluating test sets:')
        test_acc = {}
        for lang in opt.dev_langs:
            test_acc[lang] = evaluate(f'{lang}_test', test_loaders[lang],
                                      vocabs[lang], tag_vocab, emb, lang, F_s,
                                      F_p, C)
        avg_test_acc = sum([test_acc[d]
                            for d in opt.dev_langs]) / len(opt.dev_langs)
        log.info(f'Average test accuracy: {avg_test_acc}')

        if avg_acc > best_avg_acc:
            epochs_since_decay = 0
            log.info(f'New best average validation accuracy: {avg_acc}')
            best_acc['valid'] = acc
            best_acc['test'] = test_acc
            best_avg_acc = avg_acc
            with open(os.path.join(opt.model_save_file, 'options.pkl'),
                      'wb') as ouf:
                pickle.dump(opt, ouf)
            if F_s:
                torch.save(F_s.state_dict(),
                           '{}/netF_s.pth'.format(opt.model_save_file))
            torch.save(emb.state_dict(),
                       '{}/net_emb.pth'.format(opt.model_save_file))
            if F_p:
                torch.save(F_p.state_dict(),
                           '{}/net_F_p.pth'.format(opt.model_save_file))
            torch.save(C.state_dict(),
                       '{}/netC.pth'.format(opt.model_save_file))
            if D:
                torch.save(D.state_dict(),
                           '{}/netD.pth'.format(opt.model_save_file))
        else:
            epochs_since_decay += 1
            if opt.lr_decay < 1 and epochs_since_decay >= opt.lr_decay_epochs:
                epochs_since_decay = 0
                old_lr = optimizer.param_groups[0]['lr']
                optimizer.param_groups[0]['lr'] = old_lr * opt.lr_decay
                log.info(f'Decreasing LR to {old_lr * opt.lr_decay}')

    # end of training
    log.info(f'Best average validation accuracy: {best_avg_acc}')
    return best_acc
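
The training loop above relies on utils.endless_get_next_batch to keep drawing batches from the per-language loaders regardless of epoch boundaries. That helper is project-specific; a minimal sketch of how such an endless iterator is commonly written (the real utils may also handle device placement):

# Minimal sketch of an "endless" batch helper: when an iterator is exhausted,
# recreate it from its DataLoader and continue. Not the project's actual utils.
def endless_get_next_batch(loaders, iters, lang):
    try:
        batch = next(iters[lang])
    except StopIteration:
        iters[lang] = iter(loaders[lang])
        batch = next(iters[lang])
    return batch
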
Example #6
    def train(self, net, samples, optimizer, e):
        alpha = 2 * max(0, ((50 - e) / 50))
        criterion = losses.ELULovaszFocalWithLogitsLoss(alpha, 2 - alpha)

        transforms = generator.TransformationsGenerator([
            random.RandomFlipLr(),
            random.RandomAffine(image_size=101,
                                translation=lambda rs:
                                (rs.randint(-20, 20), rs.randint(-20, 20)),
                                scale=lambda rs: (rs.uniform(0.85, 1.15), 1),
                                **utils.transformations_options)
        ])

        samples_aux = list(
            set(samples).intersection(set(utils.get_aux_samples())))
        dataset_aux = datasets.ImageDataset(samples_aux, settings.train,
                                            transforms)

        dataset_pseudo = datasets.SemiSupervisedImageDataset(
            samples_test,
            settings.test,
            transforms,
            size=len(samples_test),
            test_predictions=self.test_predictions,
            momentum=0.0)

        dataset = datasets.ImageDataset(samples, settings.train, transforms)
        weight_train = len(dataset_pseudo) / len(dataset) * 2
        weight_aux = weight_train / 2
        weights = [weight_train] * len(dataset) + [weight_aux] * len(
            dataset_aux) + [1] * len(dataset_pseudo)
        dataloader = DataLoader(
            ConcatDataset([dataset, dataset_aux, dataset_pseudo]),
            num_workers=10,
            batch_size=16,
            sampler=WeightedRandomSampler(weights=weights, num_samples=3200))

        average_meter_train = meters.AverageMeter()

        with tqdm(total=len(dataloader), leave=False,
                  ascii=True) as pbar, torch.enable_grad():
            net.train()

            padding = tta.Pad((13, 14, 13, 14))

            for images, masks_targets in dataloader:
                masks_targets = masks_targets.to(gpu)
                masks_predictions = padding.transform_backward(
                    net(padding.transform_forward(images))).contiguous()

                loss = criterion(masks_predictions, masks_targets)
                loss.backward()
                optimizer.step()
                optimizer.zero_grad()

                average_meter_train.add('loss', loss.item())
                self.update_pbar(torch.sigmoid(masks_predictions),
                                 masks_targets, pbar, average_meter_train,
                                 'Training epoch {}'.format(e))

        train_stats = {
            'train_' + k: v
            for k, v in average_meter_train.get_all().items()
        }
        return train_stats
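
The loader above mixes the labeled, auxiliary, and pseudo-labeled datasets by assigning each sample a weight and letting WeightedRandomSampler draw a fixed number of samples per epoch. A minimal sketch of that pairing with synthetic TensorDatasets (not the project's datasets); the weights must follow the concatenation order:

# Minimal sketch: per-sample weights for a ConcatDataset are listed in the same
# order as the concatenated datasets. Synthetic data only.
import torch
from torch.utils.data import (ConcatDataset, DataLoader, TensorDataset,
                              WeightedRandomSampler)

d_train = TensorDataset(torch.randn(100, 3))
d_aux = TensorDataset(torch.randn(50, 3))
d_pseudo = TensorDataset(torch.randn(200, 3))

weights = [2.0] * len(d_train) + [1.0] * len(d_aux) + [1.0] * len(d_pseudo)
sampler = WeightedRandomSampler(weights=weights, num_samples=320)

loader = DataLoader(ConcatDataset([d_train, d_aux, d_pseudo]),
                    batch_size=16, sampler=sampler)
print(len(loader))  # 20 batches of 16 = 320 samples per epoch
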
Example #7
    set1 = subDataSet(wav_filename,
                      meta_filename,
                      crop_duration_s=crop_duration_s,
                      transform=composed)

    specific_name = "split1_ir0_ov1_2"
    wav_filename = os.path.join(wav_dir, specific_name + ".wav")
    meta_filename = os.path.join(meta_dir, specific_name + ".csv")
    set2 = subDataSet(wav_filename,
                      meta_filename,
                      crop_duration_s=crop_duration_s,
                      transform=composed)

    datasets.append(set1)
    datasets.append(set2)
    dataset = ConcatDataset(datasets)
    print(dataset[0]['specgram'].size())
    print(dataset[23]['specgram'].size())
    print(len(os.listdir(wav_dir)))
    # end region

    # region fulltest
    wav_dir = "C:\\Users\\jgohj\\PycharmProjects\\Jon\\data\\mic_dev_test"
    meta_dir = "C:\\Users\\jgohj\\PycharmProjects\\Jon\\data\\metadata_dev"
    wav_list = os.listdir(wav_dir)
    composed = transforms.Compose([Spectrogram1(), Binarize()])
    melcomposed = transforms.Compose([MelSpectrogram(), Binarize()])
    datasets = []
    meldatasets = []

    print("Creating Full Data set...")
Example #8
File: data_silo.py  Project: voxlogic/FARM
    def _get_dataset(self, filename, dicts=None):
        if not filename and not dicts:
            raise ValueError("You must either supply `filename` or `dicts`")

        # loading dicts from file (default)
        if dicts is None:
            dicts = list(self.processor.file_to_dicts(filename))
            # shuffle the list of dicts here if we later want a random dev set split from the train set
            if str(self.processor.train_filename) in str(filename):
                if not self.processor.dev_filename:
                    if self.processor.dev_split > 0.0:
                        random.shuffle(dicts)

        num_dicts = len(dicts)
        multiprocessing_chunk_size, num_cpus_used = calc_chunksize(
            num_dicts=num_dicts,
            max_processes=self.max_processes,
            max_chunksize=self.max_multiprocessing_chunksize,
        )

        with ExitStack() as stack:
            if self.max_processes > 1:  # use multiprocessing only when max_processes > 1
                p = stack.enter_context(mp.Pool(processes=num_cpus_used))

                logger.info(
                    f"Got ya {num_cpus_used} parallel workers to convert {num_dicts} dictionaries "
                    f"to pytorch datasets (chunksize = {multiprocessing_chunk_size})..."
                )
                log_ascii_workers(num_cpus_used, logger)

                results = p.imap(
                    partial(self._dataset_from_chunk,
                            processor=self.processor),
                    grouper(dicts, multiprocessing_chunk_size),
                    chunksize=1,
                )
            else:
                logger.info(
                    f"Multiprocessing disabled, using a single worker to convert {num_dicts}"
                    f"dictionaries to pytorch datasets.")

                results = map(
                    partial(self._dataset_from_chunk,
                            processor=self.processor),
                    grouper(dicts, num_dicts))

            datasets = []

            desc = f"Preprocessing Dataset"
            if filename:
                desc += f" {filename}"
            with tqdm(total=len(dicts), unit=' Dicts', desc=desc) as pbar:
                for dataset, tensor_names in results:
                    datasets.append(dataset)
                    # update progress bar (last step can have less dicts than actual chunk_size)
                    pbar.update(
                        min(multiprocessing_chunk_size, pbar.total - pbar.n))
            # _dataset_from_chunk can return a None in cases where downsampling has occurred
            datasets = [d for d in datasets if d]
            concat_datasets = ConcatDataset(datasets)
            return concat_datasets, tensor_names
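
The method above chunks the list of dicts, converts each chunk to a dataset (in parallel when max_processes > 1), and concatenates the results. A simplified single-process sketch of that chunk-and-concatenate pattern; grouper and dataset_from_chunk below are illustrative stand-ins, not FARM's implementations:

# Simplified sketch of the chunk-and-concatenate pattern (single process;
# grouper and dataset_from_chunk are stand-ins for FARM's helpers).
import torch
from torch.utils.data import ConcatDataset, TensorDataset


def grouper(iterable, n):
    """Yield successive chunks of size n (the last chunk may be shorter)."""
    items = list(iterable)
    for i in range(0, len(items), n):
        yield items[i:i + n]


def dataset_from_chunk(chunk):
    values = torch.tensor([d["value"] for d in chunk])
    return TensorDataset(values)


dicts = [{"value": float(i)} for i in range(10)]
datasets = [dataset_from_chunk(chunk) for chunk in grouper(dicts, 4)]
concat = ConcatDataset(datasets)
print(len(concat))  # 10
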
Example #9
File: data_silo.py  Project: voxlogic/FARM
    def _make_question_answering(cls,
                                 datasilo,
                                 sets=["train", "dev", "test"],
                                 n_splits=5,
                                 shuffle=True,
                                 random_state=None,
                                 n_neg_answers_per_question=1):
        """
        Create a number of data-silo-like fold objects which can be used for training from the
        original data silo passed in. This function takes into account the characteristics of the
        data for question answering.

        :param datasilo: the data silo that contains the original data
        :type datasilo: DataSilo
        :param sets: which sets to use to create the xval folds (strings)
        :type sets: list
        :param n_splits: number of folds to create
        :type n_splits: int
        :param shuffle: shuffle each class' samples before splitting
        :type shuffle: bool
        :param random_state: random state for shuffling
        :type random_state: int
        :param n_neg_answers_per_question: number of negative answers per question to include for training
        :type n_neg_answers_per_question: int
        """
        assert "id" in datasilo.tensor_names, f"Expected tensor 'id' in tensor names, found {datasilo.tensor_names}"
        assert "labels" in datasilo.tensor_names, f"Expected tensor 'labels' in tensor names, found {datasilo.tensor_names}"

        id_index = datasilo.tensor_names.index("id")
        label_index = datasilo.tensor_names.index("labels")

        sets_to_concat = []
        for setname in sets:
            if datasilo.data[setname]:
                sets_to_concat.extend(datasilo.data[setname])
        all_data = ConcatDataset(sets_to_concat)

        documents = []
        keyfunc = lambda x: x[id_index][0]
        all_data = sorted(all_data.datasets, key=keyfunc)
        for key, document in groupby(all_data, key=keyfunc):
            documents.append(list(document))

        xval_split = cls._split_for_qa(
            documents=documents,
            id_index=id_index,
            n_splits=n_splits,
            shuffle=shuffle,
            random_state=random_state,
        )
        silos = []

        for train_set, test_set in xval_split:
            # Each training set is further divided into actual train and dev set
            if datasilo.processor.dev_split > 0:
                dev_split = datasilo.processor.dev_split
                n_dev = int(np.ceil(dev_split * len(train_set)))
                assert n_dev > 0, f"dev split of {dev_split} is not large enough to split away a development set"
                n_actual_train = len(train_set) - n_dev
                actual_train_set = train_set[:n_actual_train]
                dev_set = train_set[n_actual_train:]
                ds_dev = [
                    sample for document in dev_set for sample in document
                ]
            else:
                ds_dev = None
                actual_train_set = train_set

            train_samples = []
            for doc in actual_train_set:
                keyfunc = lambda x: x[id_index][1]
                doc = sorted(doc, key=keyfunc)
                for key, question in groupby(doc, key=keyfunc):
                    # add all available answers to the train set
                    sample_list = list(question)
                    neg_answer_idx = []
                    for index, sample in enumerate(sample_list):
                        if sample[label_index][0][0] or sample[label_index][0][1]:
                            train_samples.append(sample)
                        else:
                            neg_answer_idx.append(index)
                    # add random n_neg_answers_per_question samples to train set
                    if len(neg_answer_idx) <= n_neg_answers_per_question:
                        train_samples.extend(
                            [sample_list[idx] for idx in neg_answer_idx])
                    else:
                        neg_answer_idx = random.sample(
                            neg_answer_idx, n_neg_answers_per_question)
                        train_samples.extend(
                            [sample_list[idx] for idx in neg_answer_idx])

            ds_train = train_samples
            ds_test = [sample for document in test_set for sample in document]
            silos.append(
                DataSiloForCrossVal(datasilo, ds_train, ds_dev, ds_test))
        return silos
Example #10
def main():
    parser = argparse.ArgumentParser()

    # Required parameters
    parser.add_argument(
        "--data_dir",
        default=None,
        type=str,
        required=True,
        help=
        "The input data dir. Should contain the .tsv files (or other data files) for the task.",
    )
    parser.add_argument(
        "--pkl_dir",
        default=None,
        type=str,
        help="The pkl data dir for training set logits.",
    )
    parser.add_argument(
        "--model_type",
        default=None,
        type=str,
        required=True,
        help="Model type selected in the list: " +
        ", ".join(MODEL_CLASSES.keys()),
    )
    parser.add_argument(
        "--model_name_or_path",
        default=None,
        type=str,
        required=True,
        help="Path to pre-trained model or shortcut name selected in the list: "
        + ", ".join(ALL_MODELS),
    )
    parser.add_argument(
        "--data_cache_name",
        default=None,
        type=str,
        help="The name of cached data",
    )
    parser.add_argument(
        "--language",
        default=None,
        type=str,
        required=True,
        help=
        "Evaluation language. Also train language if `train_language` is set to None.",
    )
    parser.add_argument("--benchmark",
                        default='xtreme',
                        type=str,
                        help="benchmark, xglue/xtreme")
    parser.add_argument(
        "--train_language",
        default=None,
        type=str,
        help="Train language if is different of the evaluation language.")
    parser.add_argument("--sample_ratio",
                        default=0.0,
                        type=float,
                        help="The training sample ratio of each language")
    parser.add_argument(
        "--task_name",
        default=None,
        type=str,
        required=True,
        help="The name of the task to train selected in the list: " +
        ", ".join(processors.keys()),
    )
    parser.add_argument(
        "--output_dir",
        default=None,
        type=str,
        required=True,
        help=
        "The output directory where the model predictions and checkpoints will be written.",
    )
    # Other parameters
    parser.add_argument("--log_dir",
                        default=None,
                        type=str,
                        help="The output log dir.")

    # Other parameters
    parser.add_argument(
        "--config_name",
        default="",
        type=str,
        help="Pretrained config name or path if not the same as model_name")
    parser.add_argument("--gpu_id", default=None, type=str, help="GPU id")
    parser.add_argument("--filter_k", type=int, default=0)
    parser.add_argument("--filter_m", type=int, default=0)
    parser.add_argument("--first_loss_only", action='store_true')
    parser.add_argument("--use_eng_logits",
                        action='store_true',
                        help='use English soft logits for the other languages')

    parser.add_argument("--alpha",
                        type=float,
                        default=0,
                        help='alpha for kd loss')
    parser.add_argument("--temperature",
                        type=float,
                        default=0.1,
                        help="temprature to soft logits")

    parser.add_argument(
        "--tokenizer_name",
        default="",
        type=str,
        help="Pretrained tokenizer name or path if not the same as model_name",
    )
    parser.add_argument(
        "--cache_dir",
        default="",
        type=str,
        help=
        "Where do you want to store the pre-trained models downloaded from s3",
    )
    parser.add_argument(
        "--hidden_dropout_prob",
        default=0.1,
        type=float,
        help="Dropout probability for the model's hidden layers.",
    )
    parser.add_argument(
        "--max_seq_length",
        default=128,
        type=int,
        help=
        "The maximum total input sequence length after tokenization. Sequences longer "
        "than this will be truncated, sequences shorter will be padded.",
    )
    parser.add_argument("--do_train",
                        action='store_true',
                        help="Whether to run training.")
    parser.add_argument("--do_eval",
                        action='store_true',
                        help="Whether to run eval on the test set.")
    parser.add_argument("--eval_checkpoints",
                        type=str,
                        default=None,
                        help="evaluation checkpoints")
    parser.add_argument("--eval_splits",
                        default='valid',
                        type=str,
                        help="eval splits")
    parser.add_argument("--eval_train",
                        action='store_true',
                        help="eval splits")
    parser.add_argument("--pkl_index",
                        default="0",
                        type=str,
                        help="pickle index for dumping training logits")

    parser.add_argument(
        "--evaluate_during_training",
        action="store_true",
        help="Rul evaluation during training at each logging step.")
    parser.add_argument(
        "--do_lower_case",
        action="store_true",
        help="Set this flag if you are using an uncased model.")

    parser.add_argument("--per_gpu_train_batch_size",
                        default=8,
                        type=int,
                        help="Batch size per GPU/CPU for training.")
    parser.add_argument("--per_gpu_eval_batch_size",
                        default=8,
                        type=int,
                        help="Batch size per GPU/CPU for evaluation.")
    parser.add_argument(
        "--gradient_accumulation_steps",
        type=int,
        default=1,
        help=
        "Number of updates steps to accumulate before performing a backward/update pass.",
    )
    parser.add_argument("--learning_rate",
                        default=5e-5,
                        type=float,
                        help="The initial learning rate for Adam.")
    parser.add_argument("--weight_decay",
                        default=0.0,
                        type=float,
                        help="Weight deay if we apply some.")
    parser.add_argument("--adam_epsilon",
                        default=1e-8,
                        type=float,
                        help="Epsilon for Adam optimizer.")
    parser.add_argument("--max_grad_norm",
                        default=1.0,
                        type=float,
                        help="Max gradient norm.")
    parser.add_argument("--num_train_epochs",
                        default=3.0,
                        type=float,
                        help="Total number of training epochs to perform.")
    parser.add_argument("--use_all_samples_per_epoch",
                        type=boolean_string,
                        default='true',
                        help="Use all samples for per epoch training")
    parser.add_argument("--max_train_samples_per_epoch",
                        default=None,
                        type=int,
                        help="Total number of training epochs to perform.")

    parser.add_argument(
        "--max_steps",
        default=-1,
        type=int,
        help=
        "If > 0: set total number of training steps to perform. Override num_train_epochs.",
    )
    parser.add_argument("--warmup_steps",
                        default=0,
                        type=int,
                        help="Linear warmup over warmup_steps.")

    parser.add_argument("--logging_steps",
                        type=int,
                        default=-1,
                        help="Log every X updates steps.")
    parser.add_argument("--logging_each_epoch",
                        action="store_true",
                        help="Whether to log after each epoch.")
    parser.add_argument("--logging_steps_in_sample",
                        type=int,
                        default=-1,
                        help="log every X samples.")
    parser.add_argument("--save_steps",
                        type=int,
                        default=-1,
                        help="Save checkpoint every X updates steps.")
    parser.add_argument(
        "--eval_all_checkpoints",
        action="store_true",
        help=
        "Evaluate all checkpoints starting with the same prefix as model_name ending and ending with step number",
    )
    parser.add_argument("--no_cuda",
                        action="store_true",
                        help="Avoid using CUDA when available")
    parser.add_argument("--overwrite_output_dir",
                        action="store_true",
                        help="Overwrite the content of the output directory")
    parser.add_argument(
        "--overwrite_cache",
        action="store_true",
        help="Overwrite the cached training and evaluation sets")
    parser.add_argument("--seed",
                        type=int,
                        default=52,
                        help="random seed for initialization")

    parser.add_argument(
        "--fp16",
        action="store_true",
        help=
        "Whether to use 16-bit (mixed) precision (through NVIDIA apex) instead of 32-bit",
    )
    parser.add_argument(
        "--fp16_opt_level",
        type=str,
        default="O1",
        help=
        "For fp16: Apex AMP optimization level selected in ['O0', 'O1', 'O2', and 'O3']."
        "See details at https://nvidia.github.io/apex/amp.html",
    )
    parser.add_argument("--local_rank",
                        type=int,
                        default=-1,
                        help="For distributed training: local_rank")
    parser.add_argument("--server_ip",
                        type=str,
                        default="",
                        help="For distant debugging.")
    parser.add_argument("--server_port",
                        type=str,
                        default="",
                        help="For distant debugging.")
    args = parser.parse_args()

    if (os.path.exists(args.output_dir) and os.listdir(args.output_dir)
            and args.do_train and not args.overwrite_output_dir):
        raise ValueError(
            "Output directory ({}) already exists and is not empty. Use --overwrite_output_dir to overcome."
            .format(args.output_dir))

    # Setup distant debugging if needed
    if args.server_ip and args.server_port:
        # Distant debugging - see https://code.visualstudio.com/docs/python/debugging#_attach-to-a-local-script
        import ptvsd

        print("Waiting for debugger attach")
        ptvsd.enable_attach(address=(args.server_ip, args.server_port),
                            redirect_output=True)
        ptvsd.wait_for_attach()

    # Setup CUDA, GPU & distributed training
    if args.gpu_id:
        os.environ["CUDA_VISIBLE_DEVICES"] = args.gpu_id

    if args.local_rank == -1 or args.no_cuda:
        device = torch.device("cuda" if torch.cuda.is_available()
                              and not args.no_cuda else "cpu")
        args.n_gpu = torch.cuda.device_count()
    else:  # Initializes the distributed backend which will take care of synchronizing nodes/GPUs
        torch.cuda.set_device(args.local_rank)
        device = torch.device("cuda", args.local_rank)
        torch.distributed.init_process_group(backend="nccl")
        args.n_gpu = 1
    args.device = device

    if args.pkl_dir is None:
        args.pkl_dir = args.data_dir

    # Setup logging
    logging.basicConfig(
        format="%(asctime)s - %(levelname)s - %(name)s -   %(message)s",
        datefmt="%m/%d/%Y %H:%M:%S",
        level=logging.INFO if args.local_rank in [-1, 0] else logging.WARN,
    )
    logger.warning(
        "Process rank: %s, device: %s, n_gpu: %s, distributed training: %s, 16-bits training: %s",
        args.local_rank,
        device,
        args.n_gpu,
        bool(args.local_rank != -1),
        args.fp16,
    )

    logger.info("Training/evaluation parameters %s", args)

    # preprocess args
    assert not (args.logging_steps != -1 and args.logging_steps_in_sample != -1
                ), "these two parameters can't both be setted"
    if args.logging_steps == -1 and args.logging_steps_in_sample != -1:
        total_batch_size = args.n_gpu * args.per_gpu_train_batch_size * args.gradient_accumulation_steps
        args.logging_steps = args.logging_steps_in_sample // total_batch_size

    # Set seed
    set_seed(args)

    if args.task_name not in processors:
        raise ValueError("Task not found: %s" % (args.task_name))
    processor = processors[args.task_name](language=args.language,
                                           train_language=args.train_language,
                                           benchmark=args.benchmark)
    args.output_mode = output_modes[args.task_name]
    label_list = processor.get_labels()
    num_labels = len(label_list)

    # Load pretrained model and tokenizer
    if args.local_rank not in [-1, 0]:
        torch.distributed.barrier(
        )  # Make sure only the first process in distributed training will download model & vocab

    args.model_type = args.model_type.lower()
    config_class, model_class, tokenizer_class = MODEL_CLASSES[args.model_type]
    config = config_class.from_pretrained(
        args.config_name if args.config_name else args.model_name_or_path,
        num_labels=num_labels,
        finetuning_task=args.task_name,
        cache_dir=args.cache_dir if args.cache_dir else None,
    )

    if args.filter_k > 0:  # there is cross attention layer
        config.first_loss_only = args.first_loss_only
        config.alpha = args.alpha
        config.temperature = args.temperature
        config.filter_m = args.filter_m
        config.hidden_dropout_prob = args.hidden_dropout_prob

        config.output_hidden_states = True
        config.filter_k = min(args.filter_k,
                              config.num_hidden_layers - args.filter_m)
        config.num_hidden_layers = args.filter_m

    tokenizer = tokenizer_class.from_pretrained(
        args.tokenizer_name
        if args.tokenizer_name else args.model_name_or_path,
        do_lower_case=args.do_lower_case,
        cache_dir=args.cache_dir if args.cache_dir else None,
    )
    model = model_class.from_pretrained(
        args.model_name_or_path,
        from_tf=bool(".ckpt" in args.model_name_or_path),
        config=config,
        cache_dir=args.cache_dir if args.cache_dir else None,
        filter_m=args.filter_m,
        filter_k=args.filter_k,
    )

    if args.local_rank == 0:
        torch.distributed.barrier(
        )  # Make sure only the first process in distributed training will download model & vocab

    model.to(args.device)
    # Training
    if args.do_train:
        # Create output directory if needed
        if not os.path.exists(args.output_dir) and args.local_rank in [-1, 0]:
            os.makedirs(args.output_dir)

        if args.filter_k > 0:  # FILTER
            train_langs = [
                "en-{}".format(lang) for lang in args.language.split(',')
            ]
        else:
            train_langs = args.train_language.split(',')

        dataset_list = []
        for lang in train_langs:
            lg_train_dataset, guids = load_and_cache_examples(args,
                                                              args.task_name,
                                                              tokenizer,
                                                              lang,
                                                              split="train")
            dataset_list.append(lg_train_dataset)
        if args.filter_k > 0:
            train_dataset = AlignDataset(
                dataset_list,
                train_langs.index('en-en'),
                is_training=True,
                use_all_samples=args.use_all_samples_per_epoch)

        else:
            train_dataset = ConcatDataset(dataset_list)

        global_step, tr_loss = train(args, train_dataset, label_list, model,
                                     tokenizer)
        logger.info(" global_step = %s, average loss = %s", global_step,
                    tr_loss)

    # Saving best-practices: if you use defaults names for the model, you can reload it using from_pretrained()
    if args.do_train and (args.local_rank == -1
                          or torch.distributed.get_rank() == 0):
        logger.info("Saving model checkpoint to %s", args.output_dir)
        # Save a trained model, configuration and tokenizer using `save_pretrained()`.
        # They can then be reloaded using `from_pretrained()`
        model_to_save = (model.module if hasattr(model, "module") else model
                         )  # Take care of distributed/parallel training
        model_to_save.save_pretrained(args.output_dir)
        tokenizer.save_pretrained(args.output_dir)

        # Good practice: save your training arguments together with the trained model
        torch.save(args, os.path.join(args.output_dir, "training_args.bin"))

    # Evaluation
    if args.do_eval and args.local_rank in [-1, 0]:
        results = {}
        if args.eval_checkpoints:
            checkpoints = [
                os.path.join(args.output_dir, ckpt)
                for ckpt in args.eval_checkpoints.split(',')
            ]
        else:
            checkpoints = [args.output_dir]
            if args.eval_all_checkpoints:
                checkpoints = list(
                    os.path.dirname(c) for c in sorted(
                        glob.glob(args.output_dir + "/**/" + WEIGHTS_NAME,
                                  recursive=True)))
        logging.getLogger("transformers.modeling_utils").setLevel(
            logging.WARN)  # Reduce logging
        logger.info("Evaluate the following checkpoints: %s", checkpoints)

        best_avg, best_checkpoint = 0, None
        task_metric = "acc" if args.task_name != "rel" else "ndcg"
        for checkpoint in checkpoints:
            prefix = checkpoint.split("-")[-1] if len(checkpoints) > 1 else ""

            tokenizer = tokenizer_class.from_pretrained(
                checkpoint, do_lower_case=args.do_lower_case)

            model = model_class.from_pretrained(checkpoint)
            model.to(args.device)
            result = evaluate(args,
                              model,
                              tokenizer,
                              label_list,
                              prefix=prefix,
                              splits=args.eval_splits.split(','))
            results[os.path.basename(checkpoint)] = result

            logger.info("{}\t{}".format(checkpoint, result))

            if best_avg < result["valid_avg"][task_metric]:
                best_avg = result["valid_avg"][task_metric]
                best_checkpoint = checkpoint

        with open(os.path.join(args.output_dir, "eval_logs.txt"),
                  'w') as log_writer:
            for key, val in results.items():
                log_writer.write("{}\t{}\n".format(key, json.dumps(val)))

    if args.eval_train and args.local_rank in [-1, 0]:
        if args.eval_checkpoints:
            # use the first one
            checkpoint = [
                os.path.join(args.output_dir, ckpt)
                for ckpt in args.eval_checkpoints.split(',')
            ][0]
        else:
            checkpoint = os.path.join(args.output_dir, 'checkpoint-best')

        assert os.path.exists(checkpoint)
        model = model_class.from_pretrained(checkpoint)
        model.to(args.device)
        evaluate(args,
                 model,
                 tokenizer,
                 label_list,
                 prefix="",
                 splits=['train'])

    logger.info("Task {0} finished!".format(args.task_name))
예제 #11
0
        os.makedirs(args.checkpoint_folder)
    logging.info("Prepare training datasets.")
    datasets = []
    for dataset_path in args.datasets:
        if args.dataset_type == 'voc':
            dataset = VOCDataset(dataset_path, transform=train_transform,
                                 target_transform=target_transform)
            label_file = os.path.join(args.checkpoint_folder, "voc-model-labels.txt")
            store_labels(label_file, dataset.class_names)
            num_classes = len(dataset.class_names)

        else:
            raise ValueError(f"Dataset tpye {args.dataset_type} is not supported.")
        datasets.append(dataset)
    logging.info(f"Stored labels into file {label_file}.")
    train_dataset = ConcatDataset(datasets)
    logging.info("Train dataset size: {}".format(len(train_dataset)))
    train_loader = DataLoader(train_dataset, args.batch_size,
                              num_workers=args.num_workers,
                              shuffle=True)
    logging.info("Prepare Validation datasets.")
    if args.dataset_type == "voc":
        val_dataset = VOCDataset(args.validation_dataset, transform=test_transform,
                                 target_transform=target_transform, is_test=True)
    logging.info("validation dataset size: {}".format(len(val_dataset)))

    val_loader = DataLoader(val_dataset, args.batch_size,
                            num_workers=args.num_workers,
                            shuffle=False)
    logging.info("Build network.")
    net = create_net(num_classes)
예제 #12
0
def train_cl(model, train_datasets, replay_mode="none", scenario="class",classes_per_task=None,iters=2000,batch_size=32,
             generator=None, gen_iters=0, gen_loss_cbs=list(), loss_cbs=list(), eval_cbs=list(), sample_cbs=list(),
             use_exemplars=True, add_exemplars=False, eval_cbs_exemplars=list()):
    '''Train a model (with a "train_a_batch" method) on multiple tasks, with replay-strategy specified by [replay_mode].

    [model]             <nn.Module> main model to optimize across all tasks
    [train_datasets]    <list> with for each task the training <DataSet>
    [replay_mode]       <str>, choice from "generative", "exact", "current", "offline" and "none"
    [scenario]          <str>, choice from "task", "domain" and "class"
    [classes_per_task]  <int>, # of classes per task
    [iters]             <int>, # of optimization-steps (i.e., # of batches) per task
    [generator]         None or <nn.Module>, if a separate generative model should be trained (for [gen_iters] per task)
    [*_cbs]             <list> of call-back functions to evaluate training-progress'''

    # Set model in training-mode
    model.train()

    # Use cuda?
    cuda = model._is_on_cuda()
    device = model._device()

    # Initiate possible sources for replay (no replay for 1st task)
    Exact = Generative = Current = False
    previous_model = None

    # Register starting param-values (needed for "intelligent synapses").
    if isinstance(model, ContinualLearner) and (model.si_c>0):
        for n, p in model.named_parameters():
            if p.requires_grad:
                n = n.replace('.', '__')
                model.register_buffer('{}_SI_prev_task'.format(n), p.data.clone())

    # Loop over all tasks.
    for task, train_dataset in enumerate(train_datasets, 1):

        # If offline replay-setting, create large database of all tasks so far
        if replay_mode=="offline" and (not scenario=="task"):
            train_dataset = ConcatDataset(train_datasets[:task])
        # -but if "offline"+"task"-scenario: all tasks so far included in 'exact replay' & no current batch
        if replay_mode=="offline" and scenario == "task":
            Exact = True
            previous_datasets = train_datasets

        # Add exemplars (if available) to current dataset (if requested)
        if add_exemplars and task > 1:
            # ---------- ADHOC SOLUTION: permMNIST needs transform to tensor, while splitMNIST does not ---------- #
            if len(train_datasets) > 6:
                target_transform = (lambda y, x=classes_per_task: torch.tensor(y % x)) if (
                        scenario == "domain"
                ) else (lambda y: torch.tensor(y))
            else:
                target_transform = (lambda y, x=classes_per_task: y % x) if scenario == "domain" else None
            # ---------------------------------------------------------------------------------------------------- #
            exemplar_dataset = ExemplarDataset(model.exemplar_sets, target_transform=target_transform)
            training_dataset = ConcatDataset([train_dataset, exemplar_dataset])
        else:
            training_dataset = train_dataset

        # Prepare <dicts> to store running importance estimates and param-values before update ("Synaptic Intelligence")
        if isinstance(model, ContinualLearner) and (model.si_c>0):
            W = {}
            p_old = {}
            for n, p in model.named_parameters():
                if p.requires_grad:
                    n = n.replace('.', '__')
                    W[n] = p.data.clone().zero_()
                    p_old[n] = p.data.clone()

        # Find [active_classes]
        active_classes = None  # -> for Domain-IL scenario, always all classes are active
        if scenario == "task":

            # -for Task-IL scenario, create <list> with for all tasks so far a <list> with the active classes
            active_classes = [list(range(classes_per_task * i, classes_per_task * (i + 1))) for i in range(task)]
        elif scenario == "class":

            # -for Class-IL scenario, create one <list> with active classes of all tasks so far
            active_classes = list(range(classes_per_task * task))

        # Reset state of optimizer(s) for every task (if requested)
        if model.optim_type == "adam_reset":
            model.optimizer = optim.Adam(model.optim_list, betas=(0.9, 0.999))
        if (generator is not None) and generator.optim_type == "adam_reset":
            generator.optimizer = optim.Adam(model.optim_list, betas=(0.9, 0.999))

        # Initialize # iters left on current data-loader(s)
        iters_left = iters_left_previous = 1
        if scenario == "task":
            up_to_task = task if replay_mode == "offline" else task-1
            iters_left_previous = [1]*up_to_task
            data_loader_previous = [None]*up_to_task

        # Define tqdm progress bar(s)
        progress = tqdm.tqdm(range(1, iters+1))
        if generator is not None:
            progress_gen = tqdm.tqdm(range(1, gen_iters+1))

        # Loop over all iterations
        iters_to_use = iters if (generator is None) else max(iters, gen_iters)
        for batch_index in range(1, iters_to_use+1):

            # Update # iters left on current data-loader(s) and, if needed, create new one(s)
            iters_left -= 1
            if iters_left==0:
                data_loader = iter(utils.get_data_loader(training_dataset, batch_size, cuda=cuda, drop_last=True))
                # NOTE:  [train_dataset]  is training-set of current task
                #      [training_dataset] is training-set of current task with stored exemplars added (if requested)
                iters_left = len(data_loader)
            if Exact:
                if scenario == "task":
                    up_to_task = task if replay_mode == "offline" else task-1
                    batch_size_replay = int(np.ceil(batch_size/up_to_task)) if (up_to_task>1) else batch_size
                    # -in Task-IL scenario, need separate replay for each task
                    for task_id in range(up_to_task):
                        batch_size_to_use = min(batch_size_replay, len(previous_datasets[task_id]))
                        iters_left_previous[task_id] -= 1
                        if iters_left_previous[task_id]==0:
                            data_loader_previous[task_id] = iter(utils.get_data_loader(
                                train_datasets[task_id], batch_size_to_use, cuda=cuda, drop_last=True
                            ))
                            iters_left_previous[task_id] = len(data_loader_previous[task_id])
                else:
                    iters_left_previous -= 1
                    if iters_left_previous == 0:
                        batch_size_to_use = min(batch_size, len(ConcatDataset(previous_datasets)))
                        data_loader_previous = iter(utils.get_data_loader(ConcatDataset(previous_datasets),
                                                                          batch_size_to_use, cuda=cuda, drop_last=True))
                        iters_left_previous = len(data_loader_previous)

            # -----------------Collect data------------------#

            # -----CURRENT BATCH----- #
            if replay_mode == "offline" and scenario == "task":
                x = y = scores = None
            else:
                x, y = next(data_loader)                                    # --> sample training data of current task
                y = y-classes_per_task*(task-1) if scenario == "task" else y  # --> ITL: adjust y-targets to 'active range'
                x, y = x.to(device), y.to(device)                           # --> transfer them to correct device
                # If --bce, --bce-distill & scenario=="class", calculate scores of current batch with previous model
                binary_distillation = hasattr(model, "binaryCE") and model.binaryCE and model.binaryCE_distill
                if binary_distillation and scenario == "class" and (previous_model is not None):
                    with torch.no_grad():
                        scores = previous_model(x)[:, :(classes_per_task * (task - 1))]
                else:
                    scores = None

            # -----REPLAYED BATCH----- #
            if not Exact and not Generative and not Current:
                x_ = y_ = scores_ = None   # -> if no replay

            # -->> Exact Replay <<-- ##
            if Exact:
                scores_ = None
                if scenario in ("domain", "class"):
                    # Sample replayed training data, move to correct device
                    x_, y_ = next(data_loader_previous)
                    x_ = x_.to(device)
                    y_ = y_.to(device) if (model.replay_targets == "hard") else None
                    # If required, get target scores (i.e., [scores_]) -- using previous model, with no_grad()
                    if model.replay_targets == "soft":
                        with torch.no_grad():
                            scores_ = previous_model(x_)
                        scores_ = scores_[:, :(classes_per_task*(task-1))] if scenario == "class" else scores_
                        # -> when scenario=="class", zero probabilities will be added in the [utils.loss_fn_kd]-function
                elif scenario == "task":
                    # Sample replayed training data, wrap in (cuda-)Variables and store in lists
                    x_ = list()
                    y_ = list()
                    up_to_task = task if replay_mode=="offline" else task-1
                    for task_id in range(up_to_task):
                        x_temp, y_temp = next(data_loader_previous[task_id])
                        x_.append(x_temp.to(device))
                        # -only keep [y_] if required (as otherwise unnecessary computations will be done)
                        if model.replay_targets == "hard":
                            y_temp = y_temp - (classes_per_task*task_id) #-> adjust y-targets to 'active range'
                            y_.append(y_temp.to(device))
                        else:
                            y_.append(None)
                    # If required, get target scores (i.e., [scores_]) -- using previous model
                    if (model.replay_targets == "soft") and (previous_model is not None):
                        scores_ = list()
                        for task_id in range(up_to_task):
                            with torch.no_grad():
                                scores_temp = previous_model(x_[task_id])
                            scores_temp = scores_temp[:, (classes_per_task*task_id):(classes_per_task*(task_id+1))]
                            scores_.append(scores_temp)

            # -->> Generative / Current Replay <<--##
            if Generative or Current:
                # Get replayed data (i.e., [x_]) -- either current data or use previous generator
                x_ = x if Current else previous_generator.sample(batch_size)

                # Get target scores and labels (i.e., [scores_] / [y_]) -- using previous model, with no_grad()
                # -if there is no task-specific mask, obtain all predicted scores at once
                if (not hasattr(previous_model, "mask_dict")) or (previous_model.mask_dict is None):
                    with torch.no_grad():
                        all_scores_ = previous_model(x_)
                # -depending on chosen scenario, collect relevant predicted scores (per task, if required)
                if scenario in ("domain", "class") and (
                        (not hasattr(previous_model, "mask_dict")) or (previous_model.mask_dict is None)
                ):
                    scores_ = all_scores_[:,:(classes_per_task * (task - 1))] if scenario == "class" else all_scores_
                    _, y_ = torch.max(scores_, dim=1)
                else:
                    # NOTE: it's possible to have scenario=domain with task-mask (so actually it's the Task-IL scenario)
                    # -[x_] needs to be evaluated according to each previous task, so make list with entry per task
                    scores_ = list()
                    y_ = list()
                    for task_id in range(task - 1):
                        # -if there is a task-mask (i.e., XdG is used), obtain predicted scores for each task separately
                        if hasattr(previous_model, "mask_dict") and previous_model.mask_dict is not None:
                            previous_model.apply_XdGmask(task=task_id + 1)
                            with torch.no_grad():
                                all_scores_ = previous_model(x_)
                        if scenario == "domain":
                            temp_scores_ = all_scores_
                        else:
                            temp_scores_ = all_scores_[:,
                                           (classes_per_task * task_id):(classes_per_task * (task_id + 1))]
                        _, temp_y_ = torch.max(temp_scores_, dim=1)
                        scores_.append(temp_scores_)
                        y_.append(temp_y_)

                # Only keep predicted y/scores if required (as otherwise unnecessary computations will be done)
                y_ = y_ if (model.replay_targets == "hard") else None
                scores_ = scores_ if (model.replay_targets == "soft") else None


            # ---> Train MAIN MODEL
            if batch_index <= iters:

                # Train the main model with this batch
                loss_dict = model.train_a_batch(x, y, x_=x_, y_=y_, scores=scores, scores_=scores_,
                                                active_classes=active_classes, task=task, rnt=1./task)

                # Update running parameter importance estimates in W
                if isinstance(model, ContinualLearner) and (model.si_c>0):
                    for n, p in model.named_parameters():
                        if p.requires_grad:
                            n = n.replace('.', '__')
                            if p.grad is not None:
                                W[n].add_(-p.grad*(p.detach()-p_old[n]))
                            p_old[n] = p.detach().clone()

                # Fire callbacks (for visualization of training-progress / evaluating performance after each task)
                for loss_cb in loss_cbs:
                    if loss_cb is not None:
                        loss_cb(progress, batch_index, loss_dict, task=task)
                for eval_cb in eval_cbs:
                    if eval_cb is not None:
                        eval_cb(model, batch_index, task=task)
                if model.label == "VAE":
                    for sample_cb in sample_cbs:
                        if sample_cb is not None:
                            sample_cb(model, batch_index, task=task)


            #---> Train GENERATOR
            if generator is not None and batch_index <= gen_iters:

                # Train the generator with this batch
                loss_dict = generator.train_a_batch(x, y, x_=x_, y_=y_, scores_=scores_, active_classes=active_classes,
                                                    task=task, rnt=1./task)

                # Fire callbacks on each iteration
                for loss_cb in gen_loss_cbs:
                    if loss_cb is not None:
                        loss_cb(progress_gen, batch_index, loss_dict, task=task)
                for sample_cb in sample_cbs:
                    if sample_cb is not None:
                        sample_cb(generator, batch_index, task=task)


        ##----------> UPON FINISHING EACH TASK...

        # Close progress bar(s)
        progress.close()
        if generator is not None:
            progress_gen.close()

        # EWC: estimate Fisher Information matrix (FIM) and update term for quadratic penalty
        if isinstance(model, ContinualLearner) and (model.ewc_lambda>0):
            # -find allowed classes
            allowed_classes = list(
                range(classes_per_task*(task-1), classes_per_task*task)
            ) if scenario=="task" else (list(range(classes_per_task*task)) if scenario=="class" else None)
            # -if needed, apply correct task-specific mask
            if model.mask_dict is not None:
                model.apply_XdGmask(task=task)
            # -estimate FI-matrix
            model.estimate_fisher(training_dataset, allowed_classes=allowed_classes)

        # SI: calculate and update the normalized path integral
        if isinstance(model, ContinualLearner) and (model.si_c>0):
            model.update_omega(W, model.epsilon)

        # EXEMPLARS: update exemplar sets
        if (add_exemplars or use_exemplars) or replay_mode=="exemplars":
            exemplars_per_class = int(np.floor(model.memory_budget / (classes_per_task*task)))
            # reduce exemplar-sets
            model.reduce_exemplar_sets(exemplars_per_class)
            # for each new class trained on, construct exemplar-set
            new_classes = list(range(classes_per_task)) if scenario=="domain" else list(range(classes_per_task*(task-1),
                                                                                              classes_per_task*task))
            for class_id in new_classes:
                start = time.time()
                # create new dataset containing only all examples of this class
                class_dataset = SubDataset(original_dataset=train_dataset, sub_labels=[class_id])
                # based on this dataset, construct new exemplar-set for this class
                model.construct_exemplar_set(dataset=class_dataset, n=exemplars_per_class)
                print("Constructed exemplar-set for class {}: {} seconds".format(class_id, round(time.time()-start)))
            model.compute_means = True
            # evaluate this way of classifying on test set
            for eval_cb in eval_cbs_exemplars:
                if eval_cb is not None:
                    eval_cb(model, iters, task=task)

        # REPLAY: update source for replay
        previous_model = copy.deepcopy(model).eval()
        if replay_mode == 'generative':
            Generative = True
            previous_generator = copy.deepcopy(generator).eval() if generator is not None else previous_model
        elif replay_mode == 'current':
            Current = True
        elif replay_mode in ('exemplars', 'exact'):
            Exact = True
            if replay_mode == "exact":
                previous_datasets = train_datasets[:task]
            else:
                if scenario == "task":
                    previous_datasets = []
                    for task_id in range(task):
                        previous_datasets.append(
                            ExemplarDataset(
                                model.exemplar_sets[
                                (classes_per_task * task_id):(classes_per_task * (task_id + 1))],
                                target_transform=lambda y, x=classes_per_task * task_id: y + x)
                        )
                else:
                    target_transform = (lambda y, x=classes_per_task: y % x) if scenario == "domain" else None
                    previous_datasets = [
                        ExemplarDataset(model.exemplar_sets, target_transform=target_transform)]
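A minimal call sketch for train_cl(), assuming per-task datasets built with the same SubDataset helper used above and a model that exposes train_a_batch(); the label split and hyper-parameters are illustrative, not taken from the example:

# Five 2-class tasks carved out of one labelled dataset (Class-IL setting).
# full_train_set and model are placeholders for any labelled torch Dataset / ContinualLearner.
train_datasets = [
    SubDataset(original_dataset=full_train_set, sub_labels=[2 * t, 2 * t + 1])
    for t in range(5)
]

train_cl(model,
         train_datasets,
         replay_mode="exact",     # keep previous tasks' data for exact replay
         scenario="class",
         classes_per_task=2,
         iters=500,
         batch_size=128)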
예제 #13
0
def create_pretokenized_dataset(paths):
    datasets = [PreTokenizedFileDataset(p) for p in paths]
    dataset = ConcatDataset(datasets)
    return dataset
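A usage sketch for the helper above; the shard paths are hypothetical, and PreTokenizedFileDataset is specific to that example:

from torch.utils.data import DataLoader

dataset = create_pretokenized_dataset(["shard0.tokens", "shard1.tokens"])
loader = DataLoader(dataset, batch_size=32, shuffle=True)
for batch in loader:
    ...  # the concatenated shards are shuffled and batched as one dataset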
예제 #14
0
 def test_concat_raises_index_error(self):
     result = ConcatDataset([[0, 1, 2, 3, 4],
                             [5, 6, 7, 8, 9]])
     with self.assertRaises(IndexError):
         # this one goes to 11
         result[11]
예제 #15
0
        verbose=False,
    )

    val_dataset = CSGODataset(
        transform=transform_multichannel,
        dataset_split='val',
        verbose=False,
    )

    test_dataset = CSGODataset(
        transform=transform_multichannel,
        dataset_split='test',
        verbose=False,
    )

    train_val_dataset = ConcatDataset([train_dataset, val_dataset])

    # implicit else
    train_loader = torch.utils.data.DataLoader(
        train_val_dataset,
        batch_size=64,
        shuffle=True,
        num_workers=0,
    )

    test_loader = torch.utils.data.DataLoader(
        test_dataset,
        batch_size=64,
        shuffle=False,
        num_workers=0,
    )
예제 #16
0
    csv_file = filenames[0].split('/')[-1]

    # Create a dataset for one particular csv file
    # essai=DoodlesDataset(csv_file, path,nrows=select_nrows, size=size_image,skiprows=range(1,10))

    # loader=DataLoader(essai,batch_size=10)
    # for image, label in loader:
    #     print(image)
    #     t1=image[0,0,:,:]
    #     #imshow(t1)
    #     print(label)

    doodles = ConcatDataset([
        DoodlesDataset(fn.split('/')[-1],
                       path,
                       nrows=select_nrows,
                       size=size_image) for fn in filenames
    ])

    loader = DataLoader(doodles, batch_size=2, shuffle=True)

    i = 0
    for image, label in loader:
        # print(image)
        t1 = image[0, 0, :, :]
        t2 = image[1, 0, :, :]
        # imshow(t1)
        # imshow(t2)
        i += 2
        print(i)
        print(label)
예제 #17
0
def get_train_loaders(config):
    """
    Returns dictionary containing the training and validation loaders
    (torch.utils.data.DataLoader) backed by the datasets.hdf5.HDF5Dataset.

    :param config: a top level configuration object containing the 'loaders' key
    :return: dict {
        'train': <train_loader>
        'val': <val_loader>
    }
    """
    assert 'loaders' in config, 'Could not find data loaders configuration'
    loaders_config = config['loaders']

    logger = get_logger('HDF5Dataset')
    logger.info('Creating training and validation set loaders...')

    # get train and validation files
    train_paths = loaders_config['train_path']
    val_paths = loaders_config['val_path']
    assert isinstance(train_paths, list)
    assert isinstance(val_paths, list)
    # get h5 internal paths for raw and label
    raw_internal_path = loaders_config['raw_internal_path']
    label_internal_path = loaders_config['label_internal_path']
    weight_internal_path = loaders_config.get('weight_internal_path', None)
    # get train/validation patch size and stride
    train_patch = tuple(loaders_config['train_patch'])
    train_stride = tuple(loaders_config['train_stride'])
    val_patch = tuple(loaders_config['val_patch'])
    val_stride = tuple(loaders_config['val_stride'])

    # get slice_builder_cls
    slice_builder_str = loaders_config.get('slice_builder', 'SliceBuilder')
    logger.info(f'Slice builder class: {slice_builder_str}')
    slice_builder_cls = _get_slice_builder_cls(slice_builder_str)

    train_datasets = []
    for train_path in train_paths:
        try:
            logger.info(f'Loading training set from: {train_path}...')
            # create H5 backed training and validation dataset with data augmentation
            train_dataset = HDF5Dataset(
                train_path,
                train_patch,
                train_stride,
                phase='train',
                transformer_config=loaders_config['transformer'],
                raw_internal_path=raw_internal_path,
                label_internal_path=label_internal_path,
                weight_internal_path=weight_internal_path,
                slice_builder_cls=slice_builder_cls)
            train_datasets.append(train_dataset)
        except Exception:
            logger.info(f'Skipping training set: {train_path}', exc_info=True)

    val_datasets = []
    for val_path in val_paths:
        try:
            logger.info(f'Loading validation set from: {val_path}...')
            val_dataset = HDF5Dataset(
                val_path,
                val_patch,
                val_stride,
                phase='val',
                transformer_config=loaders_config['transformer'],
                raw_internal_path=raw_internal_path,
                label_internal_path=label_internal_path,
                weight_internal_path=weight_internal_path)
            val_datasets.append(val_dataset)
        except Exception:
            logger.info(f'Skipping validation set: {val_path}', exc_info=True)

    num_workers = loaders_config.get('num_workers', 1)
    logger.info(f'Number of workers for train/val datasets: {num_workers}')
    # when training with volumetric data use batch_size of 1 due to GPU memory constraints
    return {
        'train':
        DataLoader(ConcatDataset(train_datasets),
                   batch_size=1,
                   shuffle=True,
                   num_workers=num_workers),
        'val':
        DataLoader(ConcatDataset(val_datasets),
                   batch_size=1,
                   shuffle=True,
                   num_workers=num_workers)
    }
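A minimal consumption sketch for the dict returned above; how a batch unpacks depends on HDF5Dataset and is therefore an assumption:

loaders = get_train_loaders(config)

for epoch in range(num_epochs):       # num_epochs supplied elsewhere in the config
    for batch in loaders['train']:    # batch_size is fixed to 1 for volumetric data
        ...                           # forward/backward pass
    for batch in loaders['val']:
        ...                           # validation pass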
예제 #18
0
def generate_random_dataset(path,
                            nb_row_valid,
                            nb_rows_test,
                            nb_rows,
                            dict_nb_lignes,
                            size_image=224,
                            encoding_dict=None,
                            filenames=None,
                            use_acc_proportionate_sampling=False):
    '''

    For each class in filenames, take nb_rows random rows from the file

    :param path:
    :param nb_row_valid:
    :param nb_rows_test:
    :param nb_rows:
    :param size_image:
    :param encoding_dict:
    :param filenames:
    :return:
    '''

    if filenames is None:
        filenames = os.listdir(path)

    if use_acc_proportionate_sampling:
        if os.path.isfile("saves_obj/dict_acc_per_class_valid.pk"):
            dict_acc_class = load_object(
                "saves_obj/dict_acc_per_class_valid.pk")
        else:
            print(
                "No per-class accuracy dictionary found; using uniform sampling"
            )
            use_acc_proportionate_sampling = False

    nb_lignes_skip = nb_row_valid + nb_rows_test
    list_dataset = []

    dict_nb_row_used_per_class = {}

    for fn in filenames:
        n = dict_nb_lignes[fn]
        skip = list(range(1, nb_lignes_skip)) + sorted(
            random.sample(range(nb_lignes_skip, n),
                          n - nb_rows - nb_lignes_skip))

        if use_acc_proportionate_sampling:
            acc = dict_acc_class[fn[:-4]]
            new_rows = round((1.1 - acc) * nb_rows)

        else:
            new_rows = nb_rows
        dict_nb_row_used_per_class[fn] = new_rows

        data_set = DoodlesDataset(fn,
                                  path,
                                  nrows=new_rows,
                                  size=size_image,
                                  skiprows=skip,
                                  encoding_dict=encoding_dict,
                                  mode="train")
        list_dataset.append(data_set)

    doodles = ConcatDataset(list_dataset)

    print(
        "Number of training rows used per class (total: {}):".format(
            sum(dict_nb_row_used_per_class.values())),
        dict_nb_row_used_per_class)

    return doodles
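The skiprows construction above reserves the first validation/test rows of each class file and then skips a random complement so that about nb_rows rows are left for training. A worked sketch of the arithmetic with illustrative numbers (assuming line 0 of the csv is the header):

# A class file with n = 1000 lines, nb_row_valid = 50, nb_rows_test = 50, nb_rows = 100:
nb_lignes_skip = 50 + 50          # 100 lines reserved for the validation/test split
# skip = lines 1..99 (the reserved block)
#        + a random sample of n - nb_rows - nb_lignes_skip = 800 lines from lines 100..999,
# which leaves roughly nb_rows = 100 unskipped lines for the training DoodlesDataset
# (nrows=new_rows then bounds the count when accuracy-proportionate sampling changes it).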
예제 #19
0
def get_loaders(train_paths,
                val_paths,
                raw_internal_path,
                label_internal_path,
                label_dtype,
                train_patch,
                train_stride,
                val_patch,
                val_stride,
                transformer,
                pixel_wise_weight=False,
                curriculum_learning=False,
                ignore_index=None):
    """
    Returns dictionary containing the training and validation loaders
    (torch.utils.data.DataLoader) backed by the datasets.hdf5.HDF5Dataset

    :param train_paths: list of paths to the H5 files containing the training sets
    :param val_paths: list of paths to the H5 files containing the validation sets
    :param raw_internal_path:
    :param label_internal_path:
    :param label_dtype: target type of the label dataset
    :param train_patch:
    :param train_stride:
    :param val_patch:
    :param val_stride:
    :param transformer:
    :return: dict {
        'train': <train_loader>
        'val': <val_loader>
    }
    """
    transformers = {
        'LabelToBoundaryTransformer': LabelToBoundaryTransformer,
        'RandomLabelToBoundaryTransformer': RandomLabelToBoundaryTransformer,
        'AnisotropicRotationTransformer': AnisotropicRotationTransformer,
        'IsotropicRotationTransformer': IsotropicRotationTransformer,
        'StandardTransformer': StandardTransformer,
        'BaseTransformer': BaseTransformer
    }

    assert transformer in transformers

    if curriculum_learning:
        slice_builder_cls = CurriculumLearningSliceBuilder
    else:
        slice_builder_cls = SliceBuilder

    train_datasets = []
    for train_path in train_paths:
        # create H5 backed training and validation dataset with data augmentation
        train_dataset = HDF5Dataset(train_path,
                                    train_patch,
                                    train_stride,
                                    phase='train',
                                    label_dtype=label_dtype,
                                    raw_internal_path=raw_internal_path,
                                    label_internal_path=label_internal_path,
                                    transformer=transformers[transformer],
                                    weighted=pixel_wise_weight,
                                    ignore_index=ignore_index,
                                    slice_builder_cls=slice_builder_cls)
        train_datasets.append(train_dataset)

    val_datasets = []
    for val_path in val_paths:
        val_dataset = HDF5Dataset(val_path,
                                  val_patch,
                                  val_stride,
                                  phase='val',
                                  label_dtype=label_dtype,
                                  raw_internal_path=raw_internal_path,
                                  label_internal_path=label_internal_path,
                                  transformer=transformers[transformer],
                                  weighted=pixel_wise_weight,
                                  ignore_index=ignore_index)
        val_datasets.append(val_dataset)

    # shuffle only if curriculum_learning scheme is not used
    return {
        'train':
        DataLoader(ConcatDataset(train_datasets),
                   batch_size=1,
                   shuffle=not curriculum_learning),
        'val':
        DataLoader(ConcatDataset(val_datasets),
                   batch_size=1,
                   shuffle=not curriculum_learning)
    }
예제 #20
0
                #transforms.RandomAffine(180, translate=(10, 10)),
                #transforms.Normalize((0.1307,), (0.3081,))
            ]))

        # false data augmentation
        tf_combinations = get_transform_combination2()
        for tf in tf_combinations:
            tf1 = []
            tf1.extend(tf)
            tf1.append(transforms.CenterCrop(IMG_SIZE))
            tf1.append(transforms.ToTensor())
            false_aug = ImageDataset(input_file_path,
                                     DATA_ROOT_DIR,
                                     0,
                                     transform=transforms.Compose(tf1))
            false_img_dataset = ConcatDataset([false_img_dataset, false_aug])

        kfold = KFold(n_splits=KFOLD)

        true_dataset_fold = kfold.split(true_img_dataset)
        false_dataset_fold = kfold.split(false_img_dataset)
        accuracy = []

        #model training and test prediction with k fold cross validation
        for fold_idx, ( (true_train_idx, true_test_idx), (false_train_idx, false_test_idx) ) in\
                enumerate( zip(true_dataset_fold, false_dataset_fold) ):

            true_train_data = [true_img_dataset[i] for i in true_train_idx]
            true_test_data = [true_img_dataset[i] for i in true_test_idx]
            false_train_data = [false_img_dataset[i] for i in false_train_idx]
            false_test_data = [false_img_dataset[i] for i in false_test_idx]
예제 #21
0
def load_data(data_root,
              dataset,
              phase,
              batch_size,
              sampler_dic=None,
              num_workers=8,
              cifar_imb_ratio=None,
              test_open=False,
              shuffle=True):

    txt = './data/%s/%s_%s.txt' % (dataset, dataset, (
        phase if phase not in ['train_plain', 'tail'] else 'train'))

    if dataset != 'iNaturalist18':
        key = 'default'
    else:
        key = 'iNaturalist18'

    rgb_mean, rgb_std = RGB_statistics[key]['mean'], RGB_statistics[key]['std']

    if phase not in ['train', 'val']:
        transform = get_data_transform('test', rgb_mean, rgb_std, key)
    else:
        transform = get_data_transform(phase, rgb_mean, rgb_std, key)

    # print('Use data transformation:', transform)
    if dataset == 'CIFAR10_LT':
        print('====> CIFAR10 Imbalance Ratio: ', cifar_imb_ratio)
        set_ = IMBALANCECIFAR10(phase,
                                imbalance_ratio=cifar_imb_ratio,
                                root=data_root)
    elif dataset == 'CIFAR100_LT':
        print('====> CIFAR100 Imbalance Ratio: ', cifar_imb_ratio)
        set_ = IMBALANCECIFAR100(phase,
                                 imbalance_ratio=cifar_imb_ratio,
                                 root=data_root)
    else:
        print('Loading data from %s' % (txt))
        set_ = LT_Dataset(data_root, txt, transform, phase)

    if phase == 'test' and test_open:
        open_txt = './data/%s/%s_open.txt' % (dataset, dataset)
        print('Testing with opensets from %s' % (open_txt))
        open_set_ = LT_Dataset('./data/%s/%s_open' % (dataset, dataset),
                               open_txt, transform)
        set_ = ConcatDataset([set_, open_set_])

    if sampler_dic and (phase == 'train' or phase == 'train_drw'):
        print('Using sampler.')
        # print('Sample %s samples per-class.' % sampler_dic['num_samples_cls'])
        print('Sampler parameters: ', sampler_dic['params'])
        return DataLoader(dataset=set_,
                          batch_size=batch_size,
                          sampler=sampler_dic['sampler'](
                              set_, **sampler_dic['params']),
                          num_workers=num_workers)
    else:
        print('No sampler.')
        print('Shuffle is %s.' % (shuffle))
        return DataLoader(dataset=set_,
                          batch_size=batch_size,
                          shuffle=shuffle,
                          num_workers=num_workers)
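A call sketch for the loader factory above; the dataset names, paths, batch size and imbalance ratio are illustrative values, not taken from the example:

train_loader = load_data(data_root='./data/CIFAR10',
                         dataset='CIFAR10_LT',
                         phase='train',
                         batch_size=128,
                         cifar_imb_ratio=0.01)

# test_open=True appends the open-set txt to the test split via ConcatDataset
open_test_loader = load_data(data_root='./data/ImageNet_LT',
                             dataset='ImageNet_LT',
                             phase='test',
                             batch_size=128,
                             test_open=True,
                             shuffle=False)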
예제 #22
0
def main():

    parser = argparse.ArgumentParser()

    parser.add_argument('--device',
                        type=str,
                        default='gpu',
                        help='For cpu: \'cpu\', for gpu: \'gpu\'')
    parser.add_argument('--chunk_size',
                        type=int,
                        default=36,
                        help='chunk size(sequence length)')
    parser.add_argument('--step_size',
                        type=int,
                        default=1,
                        help='sequence split step')
    parser.add_argument('--lr', type=float, default=5e-4, help='learning rate')
    parser.add_argument('--weight_decay',
                        type=argtype.check_float,
                        default='1e-2',
                        help='weight_decay')
    parser.add_argument('--epoch',
                        type=argtype.epoch,
                        default='inf',
                        help='the number of epoch for training')
    parser.add_argument('--batch_size',
                        type=int,
                        default=256,
                        help='size of batches for training')
    parser.add_argument('--val_ratio',
                        type=float,
                        default=.3,
                        help='validation set ratio')
    parser.add_argument('--model_name',
                        type=str,
                        default='main_model',
                        help='model name to save')
    parser.add_argument('--transfer',
                        type=argtype.boolean,
                        default=False,
                        help='whether fine tuning or not')
    parser.add_argument('--oversample_times',
                        type=int,
                        default=30,
                        help='number of times to oversample for fine tuning')
    parser.add_argument('--patience',
                        type=int,
                        default=20,
                        help='patience for early stopping')
    parser.add_argument('--c_loss',
                        type=argtype.boolean,
                        default=True,
                        help='whether using custom loss or not')
    parser.add_argument('--predict',
                        type=argtype.boolean,
                        default=False,
                        help='predict and save csv file or not')
    parser.add_argument('--filename',
                        type=str,
                        default='submission',
                        help='csv file name to save predict result')
    parser.add_argument('--Y_list',
                        type=argtype.str_to_list,
                        default='Y12,Y15',
                        help='target Y for pre-training')
    parser.add_argument('--window_size',
                        type=int,
                        default=1,
                        help='window size for moving average')
    parser.add_argument('--attention',
                        type=argtype.boolean,
                        default=True,
                        help='select model using attention mechanism')

    args = parser.parse_args()

    data_dir = './data'

    if args.device == 'gpu':
        args.device = 'cuda'
    device = torch.device(args.device)

    chunk_size = args.chunk_size
    step_size = args.step_size
    lr = args.lr
    weight_decay = args.weight_decay
    EPOCH = args.epoch
    batch_size = args.batch_size
    val_ratio = args.val_ratio
    model_name = args.model_name
    transfer_learning = args.transfer
    times = args.oversample_times
    patience = args.patience
    c_loss = args.c_loss
    pred = args.predict
    filename = args.filename
    Y_list = args.Y_list
    window_size = args.window_size
    attention = args.attention

    params = {
        'chunk_size': chunk_size,
        'step_size': step_size,
        'learning_rate': lr,
        'weight_decay': weight_decay,
        'epoch size': EPOCH,
        'batch_size': batch_size,
        'valid_ratio': val_ratio,
        'model_name': model_name,
        'transfer_learning': transfer_learning,
        'oversample_times': times,
        'early_stopping_patience': patience,
        'c_loss': c_loss,
        'pred': pred,
        'filename': filename,
        'Y_list': Y_list,
        'window_size': window_size,
        'attention': attention
    }

    Y = ''
    for y in Y_list:
        Y += y

    model_name = f'{model_name}/{Y}'

    Dataframe = dataframe.Dataframe(data_dir=data_dir)
    input_size = len(Dataframe.feature_cols)

    if attention:
        model = regressor.Attention_Regressor(input_size).to(device)
    else:
        model = regressor.BiLSTM_Regressor().to(device)

    checkpoint = Checkpoint(model_name=model_name,
                            transfer_learning=transfer_learning)
    early_stopping = Early_stopping(patience=patience)
    vis = Custom_Visdom(model_name, transfer_learning)
    vis.print_params(params)

    if transfer_learning:

        dataset_list = []

        if attention:

            pre_df = Dataframe.get_pretrain_df()\
                    .iloc[-chunk_size+1:][Dataframe.feature_cols]
            df = Dataframe.get_y18_df()

            df = pd.concat([pre_df, df], axis=0)

        else:
            df = Dataframe.get_y18_df()

        train_dataset = datasets.CustomSequenceDataset(chunk_size=chunk_size,
                                                       df=df,
                                                       Y='Y18',
                                                       step_size=step_size,
                                                       noise=True,
                                                       times=times)

        dataset_list.append(train_dataset)

        dataset = ConcatDataset(dataset_list)

        train_loader, valid_loader = datasets.split_dataset(
            dataset=dataset,
            batch_size=batch_size,
            val_ratio=val_ratio,
            shuffle=True)

        checkpoint.load_model(model)

    else:

        dataset_list = []

        for y in Y_list:

            df = Dataframe.get_pretrain_df()
            df[y] = df[y].rolling(window=window_size, min_periods=1).mean()

            dataset = datasets.CustomSequenceDataset(chunk_size=chunk_size,
                                                     df=df,
                                                     Y=y,
                                                     step_size=step_size,
                                                     noise=False,
                                                     times=1)

            dataset_list.append(dataset)

        dataset = ConcatDataset(dataset_list)

        train_loader, valid_loader = datasets.split_dataset(
            dataset=dataset,
            batch_size=batch_size,
            val_ratio=val_ratio,
            shuffle=True)

    optimizer = Adam(model.parameters(),
                     lr=lr,
                     weight_decay=float(weight_decay))

    if c_loss:
        criterion = custom_loss.mse_AIFrenz_torch
    else:
        criterion = nn.MSELoss()

    training_time = time.time()
    epoch = 0
    y_df = Dataframe.get_pretrain_df()[Y_list]
    y18_df = Dataframe.get_y18_df()[['Y18']]

    while epoch < EPOCH:

        print(f'\r Y: {Y} \
              chunk size: {chunk_size} \
              transfer: {transfer_learning}')

        epoch += 1
        train_loss_per_epoch, train_loss_list_per_batch, batch_list = train(
            model=model,
            train_loader=train_loader,
            criterion=criterion,
            optimizer=optimizer,
            epoch=epoch,
            transfer_learning=transfer_learning,
            attention=attention,
            freeze_name='transfer_layer')

        valid_loss = valid(model=model,
                           valid_loader=valid_loader,
                           criterion=criterion,
                           attention=attention)

        iter_time = time.time() - training_time

        print(
            f'\r Epoch: {epoch:3d}/{str(EPOCH):3s}\t',
            f'train time: {int(iter_time//60):2d}m {iter_time%60:5.2f}s\t'
            f'avg train loss: {train_loss_per_epoch:7.3f}\t'
            f'valid loss: {valid_loss:7.3f}')

        checkpoint.save_log(batch_list, epoch, train_loss_list_per_batch,
                            train_loss_per_epoch, valid_loss)

        early_stop, is_best = early_stopping(valid_loss)
        checkpoint.save_checkpoint(model, optimizer, is_best)

        vis.print_training(EPOCH, epoch, training_time, train_loss_per_epoch,
                           valid_loss, patience, early_stopping.counter)
        vis.loss_plot(checkpoint)

        print('-----' * 17)

        y_true, y_pred, y_idx = predict.trainset_predict(
            model=model,
            data_dir=data_dir,
            Y=Y_list[0],
            chunk_size=chunk_size,
            attention=attention,
            window_size=window_size)

        y18_true, y18_pred, y18_idx = predict.trainset_predict(
            model=model,
            data_dir=data_dir,
            Y='Y18',
            chunk_size=chunk_size,
            attention=attention,
            window_size=window_size)

        y_df['pred'] = y_pred
        y18_df['pred'] = y18_pred

        vis.predict_plot(y_df, 'pre')
        vis.predict_plot(y18_df, 'trans')
        vis.print_error()

        if early_stop:

            break

    if transfer_learning:
        checkpoint.load_model(model, transfer_learningd=True)
    else:
        checkpoint.load_model(model, transfer_learningd=False)

    y_true, y_pred, y_idx = predict.trainset_predict(model=model,
                                                     data_dir=data_dir,
                                                     Y=Y_list[0],
                                                     chunk_size=chunk_size,
                                                     attention=attention,
                                                     window_size=window_size)

    y18_true, y18_pred, y18_idx = predict.trainset_predict(
        model=model,
        data_dir=data_dir,
        Y='Y18',
        chunk_size=chunk_size,
        attention=attention,
        window_size=window_size)

    y_df['pred'] = y_pred
    y18_df['pred'] = y18_pred

    vis.predict_plot(y_df, 'pre')
    vis.predict_plot(y18_df, 'trans')
    vis.print_error()

    if pred:

        predict.test_predict(model=model,
                             chunk_size=chunk_size,
                             filename=filename,
                             attention=attention)
예제 #23
0
    whole_corpus = datasets.TableFeatures(corpus,
                                            sherlock_feature_groups, 
                                            topic_feature=topic, 
                                            label_enc=label_enc, 
                                            id_filter=None,
                                            max_col_count=MAX_COL_COUNT)

    if args.mode!='eval':
        train = copy.copy(whole_corpus).set_filter(train_ids)
        train_list.append(train)

    test = copy.copy(whole_corpus).set_filter(test_ids)
    test_list.append(test)

if args.mode!='eval':
    training_data = ConcatDataset(train_list)

testing_data = ConcatDataset(test_list)


print('----------------------------------')
end_loading = time.time()
print("Loading done:", end_loading - start_loading)
time_record['Load'] = end_loading - start_loading



model = CRF(len(valid_types) , batch_first=True).to(device)

####################
# Training 
예제 #24
0
    def __init__(self, config):

        print('Batch size: ', config.batch_size)

        print('read background_dataset!' + '\n')

        background_dataset = BackgroundDataset(
            [config.PRW_img_path, config.CUHK_SYSU_path])
        self.background_dataloader = DataLoader(dataset=background_dataset,
                                                batch_size=config.batch_size,
                                                shuffle=True,
                                                num_workers=config.worker_num,
                                                drop_last=True)

        print('read surreal_dataset dataset!' + '\n')

        # load the real UV maps
        surreal_dataset = RealTextureDataset(pkl_path=config.texture_pkl_path)
        self.surreal_dataloader = DataLoader(dataset=surreal_dataset,
                                             batch_size=config.batch_size,
                                             shuffle=True,
                                             num_workers=config.worker_num,
                                             drop_last=True)

        print('read reid_dataset dataset!' + '\n')

        print('read market_dataset dataset!' + '\n')

        dataset = Market1501Dataset()

        if config.triplet:

            print('4*4!')

            trainloader = DataLoader(ImageData(dataset.train),
                                     sampler=RandomIdentitySampler(
                                         dataset.train, config.num_instance),
                                     batch_size=config.batch_size,
                                     num_workers=config.worker_num,
                                     drop_last=True)

            queryloader = DataLoader(ImageData(dataset.query),
                                     sampler=RandomIdentitySampler(
                                         dataset.query, config.num_instance),
                                     batch_size=config.batch_size,
                                     num_workers=config.worker_num,
                                     drop_last=True)

            galleryloader = DataLoader(ImageData(dataset.gallery),
                                       sampler=RandomIdentitySampler(
                                           dataset.gallery,
                                           config.num_instance),
                                       batch_size=config.batch_size,
                                       num_workers=config.worker_num,
                                       drop_last=True)

            self.reid_dataloader = [trainloader, queryloader, galleryloader]
            '''
            prw_dataset = PRWDataset(pkl_path = config.frames_mat_pkl_path,num_instance=4)
            market_dataset = Market1501Dataset(pkl_path = config.Market_all_pkl,num_instance=4)
        
            reid_dataset = ConcatDataset([market_dataset, prw_dataset])
            
            
            #market_dataset = Market1501Dataset(pkl_path = '/unsullied/sharefs/zhongyunshan/isilon-home/datasets/Texture/market_1501_train.pkl',num_instance=4)
            
            market_dataset = Market1501Dataset(pkl_path = config.Market_all_pkl,num_instance=4)
            reid_dataset = market_dataset
            
            
            self.reid_dataloader = DataLoader(dataset=reid_dataset, batch_size=int(config.batch_size/config.num_instance),
                                          shuffle=True, num_workers=config.worker_num, drop_last=True)
            '''
        else:

            print('16*1!')
            prw_dataset = PRWDataset(pkl_path=config.frames_mat_pkl_path,
                                     num_instance=1)
            market_dataset = Market1501Dataset(pkl_path=config.Market_all_pkl,
                                               num_instance=1)

            reid_dataset = ConcatDataset([market_dataset, prw_dataset])

            self.reid_dataloader = DataLoader(dataset=reid_dataset,
                                              batch_size=config.batch_size,
                                              shuffle=True,
                                              num_workers=config.worker_num,
                                              drop_last=True)

        # read the mask of face and hand
        texture_mask = TextureMask(size=64)  # load masks at 64*64 resolution
        self.face_mask = texture_mask.get_mask('face')
        self.hand_mask = texture_mask.get_mask('hand')
        self.mask = self.face_mask + self.hand_mask

        self.gpu_available = torch.cuda.is_available()
        if self.gpu_available:
            print('Use GPU! GPU num: ', config.gpu_nums)
            gpu_ids = [i for i in range(config.gpu_nums)]

        # load the pretrained model
        if config.pretrained_model_path is None:
            print('No resume train model!')
            self.generator = UNet(input_channels=3,
                                  output_channels=3,
                                  gpu_ids=gpu_ids)

        else:
            print('resume train model!')
            print(config.epoch_now)
            self.generator = torch.load(config.pretrained_model_path)

        if config.reid_model == 'reid_loss_market1501':
            print('origin model!')
            from loss.reid_loss_market1501 import ReIDLoss
            config.num_classes = 1501
            self.reid_loss = ReIDLoss(model_path=config.reid_weight_path,
                                      num_classes=config.num_classes,
                                      gpu_ids=gpu_ids,
                                      margin=config.margin)

        elif config.reid_model == 'PCB_intern_loss':
            print('PCB_intern_loss!')

            from loss.PCB_intern_loss import ReIDLoss
            self.reid_loss = ReIDLoss(model_path=config.reid_weight_path,
                                      num_classes=config.num_classes,
                                      gpu_ids=gpu_ids,
                                      margin=config.margin)

        elif config.reid_model == 'ImageNet_Resnet':
            print('ImageNet_Resnet!')
            print('layer: ', config.layer)
            from loss.ImageNet_Resnet import ReIDLoss
            self.reid_loss = ReIDLoss(gpu_ids=gpu_ids)

        elif config.reid_model == 'PCB_MiddleFeature':
            print('PCB_MiddleFeature!')
            print('layer: ', config.layer)
            from loss.PCB_MiddleFeature import ReIDLoss
            self.reid_loss = ReIDLoss(model_path=config.reid_weight_path,
                                      num_classes=config.num_classes,
                                      gpu_ids=gpu_ids,
                                      margin=config.margin,
                                      layer=config.layer)

        elif config.reid_model == 'NoPCB_Resnet':
            print('NoPCB_Resnet!')
            print('layer: ', config.layer)
            from loss.NoPCB_Resnet import ReIDLoss
            self.reid_loss = ReIDLoss(gpu_ids=gpu_ids)

        elif config.reid_model == 'NoPCB_Resnet_deepfashion':
            print('NoPCB_Resnet_deepfashion!')
            print('layer: ', config.layer)
            from loss.NoPCB_Resnet_deepfashion import ReIDLoss
            self.reid_loss = ReIDLoss(gpu_ids=gpu_ids)

        elif config.reid_model == 'PCB_softmax':
            print('PCB_softmax!')
            from loss.PCB_softmax_loss import ReIDLoss
            config.num_classes = 1501
            self.reid_loss = ReIDLoss(model_path=config.reid_weight_path,
                                      num_classes=config.num_classes,
                                      gpu_ids=gpu_ids,
                                      margin=config.margin)

        elif config.reid_model == 'PCB_PerLoss':
            print('PCB_PerLoss!')

            from loss.PCB_PerLoss import ReIDLoss
            self.reid_loss = ReIDLoss(model_path=config.reid_weight_path,
                                      num_classes=config.num_classes,
                                      gpu_ids=gpu_ids)

        elif config.reid_model == 'PCB_AllCat':
            print('PCB_AllCat!')

            from loss.PCB_AllCat import ReIDLoss
            self.reid_loss = ReIDLoss(model_path=config.reid_weight_path,
                                      num_classes=config.num_classes,
                                      gpu_ids=gpu_ids,
                                      margin=config.margin)

        else:
            raise KeyError('{} not in keys!'.format(config.reid_model))

        if self.gpu_available:

            self.generator = nn.DataParallel(self.generator)  # multi-GPU

            self.generator = self.generator.cuda()
            self.reid_loss = self.reid_loss.cuda()

            self.mask = self.mask.cuda()

        self.texture2img = TextureToImage(action_npz=config.action_npz,
                                          batch_size=config.batch_size,
                                          use_gpu=self.gpu_available)

        # Shared loss for the face and hand regions: mean squared error
        self.face_loss = nn.MSELoss()

        # Unet optimizer
        self.generator_optimizer = Adam(params=self.generator.parameters(),
                                        lr=config.learning_rate,
                                        weight_decay=config.weight_decay)

        configure(
            os.path.join(
                config.runs_log_path, config.log_name +
                str(datetime.datetime.now()).replace(' ', '_')))

        self.model_save_dir = os.path.join(
            config.model_log_path,
            config.log_name + str(datetime.datetime.now()).replace(' ', '_'))
        if not os.path.exists(self.model_save_dir):
            os.mkdir(self.model_save_dir)
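The constructor above only wires the pieces together (generator, selected re-ID loss, MSE face/hand loss, Adam optimizer, logging paths); the training loop itself is not part of this snippet. Purely as a hypothetical illustration of how such attributes are usually consumed in one optimisation step, a sketch is given below; the call signatures, the batch fields and the equal loss weighting are assumptions, not taken from the original code.

def hypothetical_train_step(trainer, textures, target_images, pred_faces, target_faces):
    # Assumed interfaces: trainer.generator refines textures, trainer.texture2img
    # renders them to images, trainer.reid_loss compares rendered and target
    # images, trainer.face_loss is the nn.MSELoss created above.
    refined = trainer.generator(textures)
    rendered = trainer.texture2img(refined)
    loss = trainer.reid_loss(rendered, target_images) \
         + trainer.face_loss(pred_faces, target_faces)

    trainer.generator_optimizer.zero_grad()
    loss.backward()
    trainer.generator_optimizer.step()
    return loss.item()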
예제 #25
0
def _create_dataloaders(config,
                        dataset_class,
                        tf1,
                        tf2,
                        partitions,
                        target_transform=None,
                        shuffle=False):
    train_imgs_list = []
    for train_partition in partitions:
        if "STL10" == config.dataset:
            train_imgs_curr = dataset_class(root=config.dataset_root,
                                            transform=tf1,
                                            split=train_partition,
                                            target_transform=target_transform)
        else:
            train_imgs_curr = dataset_class(download=True,
                                            root=config.dataset_root,
                                            transform=tf1,
                                            train=train_partition,
                                            target_transform=target_transform)

        if hasattr(config, "mix_train"):
            if config.mix_train and (train_partition == "train+unlabeled"):
                train_imgs_curr = reorder_train_deterministic(train_imgs_curr)
        train_imgs_list.append(train_imgs_curr)

    train_imgs = ConcatDataset(train_imgs_list)
    train_dataloader = torch.utils.data.DataLoader(
        train_imgs,
        batch_size=int(config.dataloader_batch_sz),
        shuffle=shuffle,
        num_workers=6,
        drop_last=False,
        persistent_workers=True)

    if not shuffle:
        assert (isinstance(train_dataloader.sampler,
                           torch.utils.data.sampler.SequentialSampler))
    dataloaders = [train_dataloader]

    for d_i in range(config.num_dataloaders):
        print("Creating auxiliary dataloader ind %d out of %d time %s" %
              (d_i, config.num_dataloaders, datetime.now()))
        sys.stdout.flush()

        train_tf_imgs_list = []
        for train_partition in partitions:
            if "STL10" == config.dataset:
                train_imgs_tf_curr = dataset_class(
                    download=True,
                    root=config.dataset_root,
                    transform=tf2,  # random per call
                    split=train_partition,
                    target_transform=target_transform)
            else:
                train_imgs_tf_curr = dataset_class(
                    download=True,
                    root=config.dataset_root,
                    transform=tf2,
                    train=train_partition,
                    target_transform=target_transform)

            if hasattr(config, "mix_train"):
                if config.mix_train and (train_partition == "train+unlabeled"):
                    train_imgs_tf_curr = reorder_train_deterministic(
                        train_imgs_tf_curr)
            train_tf_imgs_list.append(train_imgs_tf_curr)

        train_imgs_tf = ConcatDataset(train_tf_imgs_list)
        train_tf_dataloader = \
            torch.utils.data.DataLoader(train_imgs_tf,
                                        batch_size=int(config.dataloader_batch_sz),
                                        shuffle=shuffle,
                                        num_workers=6,
                                        drop_last=False, persistent_workers=True)

        if not shuffle:
            assert (isinstance(train_tf_dataloader.sampler,
                               torch.utils.data.sampler.SequentialSampler))
        assert (len(train_dataloader) == len(train_tf_dataloader))
        dataloaders.append(train_tf_dataloader)

    num_train_batches = len(dataloaders[0])
    print("Length of datasets vector %d" % len(dataloaders))
    print("Number of batches per epoch: %d" % num_train_batches)
    sys.stdout.flush()

    return dataloaders
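Every loader returned above shares the batch size and length, and when shuffle=False each one sits on a SequentialSampler, so the base (tf1) loader and the auxiliary (tf2) loaders stay index-aligned batch by batch. A minimal consumption sketch, assuming the loaders were built with shuffle=False and using a hypothetical process_batch callback that is not part of the snippet above:

def run_epoch(dataloaders, process_batch):
    # dataloaders[0] yields the tf1 views; dataloaders[1:] yield tf2 views of
    # the same underlying images, kept in step by the sequential ordering.
    for batches in zip(*dataloaders):
        base_batch, aux_batches = batches[0], batches[1:]
        process_batch(base_batch, aux_batches)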
예제 #26
0
def dataset_loader(dataset_name,
                   data_dir,
                   categories,
                   raw_input_dims,
                   split,
                   text_dim,
                   text_feat,
                   max_text_words,
                   max_expert_tokens,
                   vocab,
                   attr_vocab,
                   use_val=False):
    dataset_classes = {"CE": CE}
    if len(categories) > 1 and split == 'train':
        dataset_list = []
        for cat in categories:
            dataset = dataset_classes[dataset_name](
                data_dir=data_dir,
                text_dim=text_dim,
                category=cat,
                raw_input_dims=raw_input_dims,
                split=split,
                text_feat=text_feat,
                max_text_words=max_text_words,
                max_expert_tokens=max_expert_tokens,
                vocab=vocab,
                attr_vocab=attr_vocab,
                transforms=transforms.Compose([
                    transforms.RandomCrop(224),
                    transforms.RandomHorizontalFlip(),
                    transforms.ToTensor(),
                    transforms.Normalize((0.485, 0.456, 0.406),
                                         (0.229, 0.224, 0.225))
                ]))
            dataset_list.append(dataset)

        if use_val:
            for cat in categories:
                dataset = dataset_classes[dataset_name](
                    data_dir=data_dir,
                    text_dim=text_dim,
                    category=cat,
                    raw_input_dims=raw_input_dims,
                    split='val',
                    text_feat=text_feat,
                    max_text_words=max_text_words,
                    max_expert_tokens=max_expert_tokens,
                    vocab=vocab,
                    attr_vocab=attr_vocab,
                    transforms=transforms.Compose([
                        transforms.RandomCrop(224),
                        transforms.RandomHorizontalFlip(),
                        transforms.ToTensor(),
                        transforms.Normalize((0.485, 0.456, 0.406),
                                             (0.229, 0.224, 0.225))
                    ]))
                dataset_list.append(dataset)

        dataset = ConcatDataset(dataset_list)
    # elif len(categories) > 1 and (split in ['val', 'val_trg', 'test', 'test_trg']):
    elif split in ['val', 'val_trg', 'test', 'test_trg']:
        dataset_list = []
        for cat in categories:
            dataset = dataset_classes[dataset_name](
                data_dir=data_dir,
                text_dim=text_dim,
                category=cat,
                raw_input_dims=raw_input_dims,
                split=split,
                text_feat=text_feat,
                max_text_words=max_text_words,
                max_expert_tokens=max_expert_tokens,
                vocab=vocab,
                attr_vocab=attr_vocab,
                transforms=transforms.Compose([
                    transforms.CenterCrop(224),
                    transforms.ToTensor(),
                    transforms.Normalize((0.485, 0.456, 0.406),
                                         (0.229, 0.224, 0.225))
                ]))
            dataset_list.append(dataset)
        dataset = dataset_list
    else:
        dataset = dataset_classes[dataset_name](
            data_dir=data_dir,
            text_dim=text_dim,
            category=categories[0],
            raw_input_dims=raw_input_dims,
            split=split,
            text_feat=text_feat,
            max_text_words=max_text_words,
            max_expert_tokens=max_expert_tokens,
            vocab=vocab,
            attr_vocab=attr_vocab,
            transforms=transforms.Compose([
                transforms.RandomCrop(224),
                transforms.RandomHorizontalFlip(),
                transforms.ToTensor(),
                transforms.Normalize((0.485, 0.456, 0.406),
                                     (0.229, 0.224, 0.225))
            ]))
    return dataset
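Note that the return type depends on the arguments: multi-category training returns a single ConcatDataset, the evaluation splits return a list of per-category datasets, and everything else returns one dataset. A caller therefore has to branch before building dataloaders; a minimal sketch, in which the batch size and worker count are illustrative assumptions:

from torch.utils.data import DataLoader


def wrap_in_loaders(dataset_or_list, batch_size=64, num_workers=4):
    # Evaluation splits come back as a list of per-category datasets: one loader each.
    if isinstance(dataset_or_list, list):
        return [DataLoader(d, batch_size=batch_size, shuffle=False,
                           num_workers=num_workers)
                for d in dataset_or_list]
    # Training returns a single (possibly concatenated) dataset.
    return DataLoader(dataset_or_list, batch_size=batch_size, shuffle=True,
                      num_workers=num_workers)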
예제 #27
0
cancer_targets = np.ones((cancer_set.shape[0])).astype(np.int64)
not_cancer_targets = np.zeros((not_cancer_set.shape[0])).astype(np.int64)
not_cancer_dataset = TensorDataset(
    torch.from_numpy(not_cancer_set.swapaxes(1, 3).swapaxes(2, 3)).float(),
    torch.from_numpy(not_cancer_targets), torch.from_numpy(seg_set))
del not_cancer_set
del seg_set

cancer_dataset = TensorDataset(
    torch.from_numpy(cancer_set.swapaxes(1, 3).swapaxes(2, 3)).float(),
    torch.from_numpy(cancer_targets),
    torch.from_numpy(np.zeros((len(cancer_set), 299, 299), dtype=bool)))
del cancer_set

gc.collect()
complete_dataset = ConcatDataset((not_cancer_dataset, cancer_dataset))
num_total = len(complete_dataset)
num_train = int(0.8 * num_total)
num_val = int(0.1 * num_total)
num_test = num_total - num_train - num_val
torch.manual_seed(0)
train_dataset, test_dataset, val_dataset = torch.utils.data.random_split(
    complete_dataset, [num_train, num_test, num_val])
datasets = {'train': train_dataset, 'test': test_dataset, 'val': val_dataset}
dataset_sizes = {
    'train': len(train_dataset),
    'test': len(test_dataset),
    'val': len(val_dataset)
}
dataloaders = {
    x: torch.utils.data.DataLoader(datasets[x],
예제 #28
0
def build_dataset(cfg, transforms, split='train', num_tta=0):
    assert split in ['train', 'valid', 'test']
    dataset_config = cfg['dataset']

    num_class = dataset_config['num_class']
    fold = dataset_config['fold']
    batch_size = dataset_config['batch_size']
    num_workers = dataset_config['num_workers']
    use_upsampling = dataset_config['upsampling']
    is_test = split == 'test'

    if split == 'test':
        df = pd.read_csv(dataset_map['test'])
        image_dir = dataset_map['test_images']
    else:
        df = pd.read_csv(dataset_map['fold'])
        image_dir = dataset_map['train_images']

    if dataset_config['use_original']:
        if split == 'train':
            df = df[df['fold'] != fold]
            if use_upsampling:
                df = upsampling(df)
        elif split == 'valid':
            df = df[df['fold'] == fold]

    if split == 'valid':
        if not dataset_config['valid_with_both']:
            if dataset_config['valid_with_large']:
                df = df[df['large']]
            elif dataset_config['valid_with_small']:
                df = df[~df['large']]
    sampler_df = [df]
    dataset = BlindDataset(image_dir=image_dir,
                           df=df,
                           transforms=transforms,
                           num_class=num_class,
                           is_test=is_test,
                           num_tta=num_tta)

    if split == 'train' and dataset_config['use_diabetic_retinopathy']:
        diabetic_df = pd.read_csv(diabetic_retinopathy_map['train'],
                                  index_col='Unnamed: 0')
        del diabetic_df['Unnamed: 0.1']
        if use_upsampling:
            diabetic_df = upsampling(diabetic_df)  # up sampling for diabetic
        diabetic_dataset = BlindDataset(
            image_dir=diabetic_retinopathy_map['train_images'],
            df=diabetic_df,
            transforms=transforms,
            num_class=num_class,
            is_test=is_test)

        if not dataset_config['use_original']:
            dataset = diabetic_dataset
            sampler_df = [diabetic_df]
        else:
            sampler_df += [diabetic_df]
            dataset = ConcatDataset([dataset, diabetic_dataset])

    if split == 'train' and \
        (dataset_config['use_class_ratio'] or dataset_config['use_dataset_ratio']):
        sampler = get_sampler(dataset_config['use_class_ratio'],
                              dataset_config['use_dataset_ratio'],
                              dataset_config['class_ratio'],
                              dataset_config['dataset_ratio'], sampler_df)
    else:
        sampler = None

    data_loader = DataLoader(
        dataset,
        shuffle=True if sampler is None else False,
        batch_size=batch_size,
        num_workers=num_workers,
        sampler=sampler,
    )
    return data_loader
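DataLoader rejects shuffle=True when an explicit sampler is supplied, which is why the call above switches shuffling off whenever get_sampler returns one. A standalone sketch of the same guard, with a toy dataset and uniform weights standing in for the class/dataset ratio logic used above:

import torch
from torch.utils.data import DataLoader, TensorDataset, WeightedRandomSampler

# Toy stand-ins: 100 samples over 5 classes; real weights would come from
# the class_ratio / dataset_ratio configuration handled by get_sampler.
toy = TensorDataset(torch.randn(100, 3), torch.randint(0, 5, (100,)))
weights = torch.ones(len(toy))
sampler = WeightedRandomSampler(weights, num_samples=len(toy), replacement=True)

loader = DataLoader(
    toy,
    batch_size=16,
    sampler=sampler,   # with an explicit sampler...
    shuffle=False,     # ...shuffle must stay False
)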
예제 #29
0
def main():
    parser = argparse.ArgumentParser()

    # Required parameters
    parser.add_argument(
        "--data_dir",
        default=None,
        type=str,
        required=True,
        help=
        "The input data dir. Should contain the .tsv files (or other data files) for the task.",
    )
    parser.add_argument(
        "--model_type",
        default=None,
        type=str,
        required=True,
        help="Model type selected in the list: " +
        ", ".join(MODEL_CLASSES.keys()),
    )
    parser.add_argument(
        "--model_name_or_path",
        default=None,
        type=str,
        required=True,
        help="Path to pre-trained model or shortcut name selected in the list: "
        + ", ".join(ALL_MODELS),
    )
    parser.add_argument(
        "--data_cache_name",
        default=None,
        type=str,
        help="The name of cached data",
    )
    parser.add_argument(
        "--language",
        default=None,
        type=str,
        required=True,
        help=
        "Evaluation language. Also train language if `train_language` is set to None.",
    )
    parser.add_argument(
        "--train_language",
        default=None,
        type=str,
        help="Train language if is different of the evaluation language.")

    parser.add_argument("--train_tasks",
                        default=None,
                        type=str,
                        help="Training tasks in together finetuning.")

    parser.add_argument(
        "--task_name",
        default=None,
        type=str,
        required=True,
        help="The name of the task to train selected in the list: " +
        ", ".join(processors.keys()),
    )
    parser.add_argument(
        "--output_dir",
        default=None,
        type=str,
        required=True,
        help=
        "The output directory where the model predictions and checkpoints will be written.",
    )

    # Other parameters
    parser.add_argument(
        "--config_name",
        default="",
        type=str,
        help="Pretrained config name or path if not the same as model_name")

    parser.add_argument("--gpu_id", default="", type=str, help="GPU id")

    parser.add_argument(
        "--tokenizer_name",
        default="",
        type=str,
        help="Pretrained tokenizer name or path if not the same as model_name",
    )
    parser.add_argument(
        "--cache_dir",
        default="",
        type=str,
        help=
        "Where do you want to store the pre-trained models downloaded from s3",
    )

    parser.add_argument(
        "--task_ratio",
        default=1.0,
        type=float,
        help="ratio of tasks between 0-1",
    )
    parser.add_argument(
        "--max_seq_length",
        default=128,
        type=int,
        help=
        "The maximum total input sequence length after tokenization. Sequences longer "
        "than this will be truncated, sequences shorter will be padded.",
    )
    parser.add_argument("--do_train",
                        action="store_true",
                        help="Whether to run training.")
    parser.add_argument("--do_eval",
                        action="store_true",
                        help="Whether to run eval on the test set.")
    parser.add_argument(
        "--evaluate_during_training",
        action="store_true",
        help="Rul evaluation during training at each logging step.")
    parser.add_argument(
        "--do_lower_case",
        action="store_true",
        help="Set this flag if you are using an uncased model.")

    parser.add_argument("--per_gpu_train_batch_size",
                        default=8,
                        type=int,
                        help="Batch size per GPU/CPU for training.")
    parser.add_argument("--per_gpu_eval_batch_size",
                        default=8,
                        type=int,
                        help="Batch size per GPU/CPU for evaluation.")
    parser.add_argument(
        "--gradient_accumulation_steps",
        type=int,
        default=1,
        help=
        "Number of updates steps to accumulate before performing a backward/update pass.",
    )
    parser.add_argument("--learning_rate",
                        default=5e-5,
                        type=float,
                        help="The initial learning rate for Adam.")
    parser.add_argument("--weight_decay",
                        default=0.0,
                        type=float,
                        help="Weight deay if we apply some.")
    parser.add_argument("--adam_epsilon",
                        default=1e-8,
                        type=float,
                        help="Epsilon for Adam optimizer.")
    parser.add_argument("--max_grad_norm",
                        default=1.0,
                        type=float,
                        help="Max gradient norm.")
    parser.add_argument("--num_train_epochs",
                        default=3.0,
                        type=float,
                        help="Total number of training epochs to perform.")
    parser.add_argument(
        "--max_steps",
        default=-1,
        type=int,
        help=
        "If > 0: set total number of training steps to perform. Override num_train_epochs.",
    )
    parser.add_argument(
        "--break_steps",
        default=-1,
        type=int,
        help=
        "If > 0: set total number of training steps to perform. Override num_train_epochs.",
    )

    parser.add_argument("--warmup_steps",
                        default=0,
                        type=int,
                        help="Linear warmup over warmup_steps.")

    parser.add_argument("--logging_steps",
                        type=int,
                        default=50,
                        help="Log every X updates steps.")
    parser.add_argument("--logging_each_epoch",
                        action="store_true",
                        help="Whether to log after each epoch.")
    parser.add_argument("--logging_steps_in_sample",
                        type=int,
                        default=-1,
                        help="log every X samples.")
    parser.add_argument("--save_steps",
                        type=int,
                        default=50,
                        help="Save checkpoint every X updates steps.")
    parser.add_argument(
        "--eval_all_checkpoints",
        action="store_true",
        help=
        "Evaluate all checkpoints starting with the same prefix as model_name ending and ending with step number",
    )
    parser.add_argument("--no_cuda",
                        action="store_true",
                        help="Avoid using CUDA when available")
    parser.add_argument("--overwrite_output_dir",
                        action="store_true",
                        help="Overwrite the content of the output directory")
    parser.add_argument(
        "--overwrite_cache",
        action="store_true",
        help="Overwrite the cached training and evaluation sets")
    parser.add_argument("--seed",
                        type=int,
                        default=42,
                        help="random seed for initialization")

    parser.add_argument(
        "--fp16",
        action="store_true",
        help=
        "Whether to use 16-bit (mixed) precision (through NVIDIA apex) instead of 32-bit",
    )
    parser.add_argument(
        "--fp16_opt_level",
        type=str,
        default="O1",
        help=
        "For fp16: Apex AMP optimization level selected in ['O0', 'O1', 'O2', and 'O3']."
        "See details at https://nvidia.github.io/apex/amp.html",
    )
    parser.add_argument("--local_rank",
                        type=int,
                        default=-1,
                        help="For distributed training: local_rank")
    parser.add_argument("--server_ip",
                        type=str,
                        default="",
                        help="For distant debugging.")
    parser.add_argument("--server_port",
                        type=str,
                        default="",
                        help="For distant debugging.")
    args = parser.parse_args()

    if (os.path.exists(args.output_dir) and os.listdir(args.output_dir)
            and args.do_train and not args.overwrite_output_dir):
        raise ValueError(
            "Output directory ({}) already exists and is not empty. Use --overwrite_output_dir to overcome."
            .format(args.output_dir))

    # Setup distant debugging if needed
    if args.server_ip and args.server_port:
        # Distant debugging - see https://code.visualstudio.com/docs/python/debugging#_attach-to-a-local-script
        import ptvsd

        print("Waiting for debugger attach")
        ptvsd.enable_attach(address=(args.server_ip, args.server_port),
                            redirect_output=True)
        ptvsd.wait_for_attach()

    # Setup CUDA, GPU & distributed training
    os.environ["CUDA_VISIBLE_DEVICES"] = args.gpu_id
    if args.local_rank == -1 or args.no_cuda:
        device = torch.device("cuda" if torch.cuda.is_available()
                              and not args.no_cuda else "cpu")
        args.n_gpu = torch.cuda.device_count()
    else:  # Initializes the distributed backend which will take care of synchronizing nodes/GPUs
        torch.cuda.set_device(args.local_rank)
        device = torch.device("cuda", args.local_rank)
        torch.distributed.init_process_group(backend="nccl")
        args.n_gpu = 1
    args.device = device

    # Setup logging
    logging.basicConfig(
        format="%(asctime)s - %(levelname)s - %(name)s -   %(message)s",
        datefmt="%m/%d/%Y %H:%M:%S",
        level=logging.INFO if args.local_rank in [-1, 0] else logging.WARN,
    )
    logger.warning(
        "Process rank: %s, device: %s, n_gpu: %s, distributed training: %s, 16-bits training: %s",
        args.local_rank,
        device,
        args.n_gpu,
        bool(args.local_rank != -1),
        args.fp16,
    )

    # preprocess args
    if args.train_language is None or args.train_language == "all":
        args.train_language = args.language

    assert not (args.logging_steps != -1 and args.logging_steps_in_sample != -1
                ), "these two parameters can't both be setted"
    if args.logging_steps == -1 and args.logging_steps_in_sample != -1:
        total_batch_size = args.n_gpu * args.per_gpu_train_batch_size * args.gradient_accumulation_steps
        args.logging_steps = args.logging_steps_in_sample // total_batch_size

    # Set seed
    set_seed(args)

    # Prepare XNLI task
    #    args.task_name = "xnli"
    if args.task_name not in processors:
        raise ValueError("Task not found: %s" % (args.task_name))

    num_labels = []
    for task_name in args.train_tasks.split(","):
        processor = processors[task_name](language=TASK_LANGS[task_name],
                                          train_language=args.train_language)
        args.output_mode = output_modes[args.task_name]
        label_list = processor.get_labels()
        num_labels.append(len(label_list))

    # Load pretrained model and tokenizer
    if args.local_rank not in [-1, 0]:
        torch.distributed.barrier(
        )  # Make sure only the first process in distributed training will download model & vocab

    args.model_type = args.model_type.lower()
    config_class, model_class, tokenizer_class = MODEL_CLASSES[args.model_type]
    config = config_class.from_pretrained(
        args.config_name if args.config_name else args.model_name_or_path,
        num_labels=num_labels,
        finetuning_task=args.task_name,
        cache_dir=args.cache_dir if args.cache_dir else None,
    )
    tokenizer = tokenizer_class.from_pretrained(
        args.tokenizer_name
        if args.tokenizer_name else args.model_name_or_path,
        do_lower_case=args.do_lower_case,
        cache_dir=args.cache_dir if args.cache_dir else None,
    )
    model = model_class.from_pretrained(
        args.model_name_or_path,
        from_tf=bool(".ckpt" in args.model_name_or_path),
        config=config,
        cache_dir=args.cache_dir if args.cache_dir else None,
    )

    if args.local_rank == 0:
        torch.distributed.barrier(
        )  # Make sure only the first process in distributed training will download model & vocab

    model.to(args.device)

    logger.info("Training/evaluation parameters %s", args)

    # Training
    if args.do_train:
        task_dataset_list = []

        train_tasks = args.train_tasks.split(",")
        for task_name in train_tasks:

            train_langs = args.train_language.split(',')
            dataset_list = []
            for lang in train_langs:
                lg_train_dataset, guids = load_and_cache_examples(
                    args, task_name, tokenizer, lang, split="train")
                dataset_list.append(lg_train_dataset)
            train_dataset = ConcatDataset(dataset_list)
            task_dataset_list.append(train_dataset)
        # train_dataset = load_and_cache_examples(args, args.task_name, tokenizer, args.train_language, split="train")

        global_step, tr_loss = train(args, task_dataset_list, model, tokenizer)
        logger.info(" global_step = %s, average loss = %s", global_step,
                    tr_loss)

    # Saving best-practices: if you use defaults names for the model, you can reload it using from_pretrained()
    if args.do_train and (args.local_rank == -1
                          or torch.distributed.get_rank() == 0):
        # Create output directory if needed
        if not os.path.exists(args.output_dir) and args.local_rank in [-1, 0]:
            os.makedirs(args.output_dir)

        logger.info("Saving model checkpoint to %s", args.output_dir)
        # Save a trained model, configuration and tokenizer using `save_pretrained()`.
        # They can then be reloaded using `from_pretrained()`
        model_to_save = (model.module if hasattr(model, "module") else model
                         )  # Take care of distributed/parallel training
        model_to_save.save_pretrained(args.output_dir)
        tokenizer.save_pretrained(args.output_dir)

        # Good practice: save your training arguments together with the trained model
        torch.save(args, os.path.join(args.output_dir, "training_args.bin"))

        # Load a trained model and vocabulary that you have fine-tuned
        model = model_class.from_pretrained(args.output_dir)
        tokenizer = tokenizer_class.from_pretrained(args.output_dir)
        model.to(args.device)

    # Evaluation
    results = {}

    return results
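In the training branch, the per-language datasets of each task are merged with ConcatDataset, whose indices simply walk the member datasets in the order they were appended. A toy illustration, with hypothetical tensor datasets standing in for the output of load_and_cache_examples:

import torch
from torch.utils.data import ConcatDataset, TensorDataset

en = TensorDataset(torch.zeros(3, 4))   # pretend: 3 English examples
de = TensorDataset(torch.ones(5, 4))    # pretend: 5 German examples

combined = ConcatDataset([en, de])
assert len(combined) == 8               # 3 + 5
assert combined[0][0].sum() == 0        # indices 0-2 come from `en`
assert combined[3][0].sum() == 4        # index 3 is the first `de` example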
예제 #30
0
 def test_concat_two_singletons(self):
     result = ConcatDataset([[0], [1]])
     self.assertEqual(2, len(result))
     self.assertEqual(0, result[0])
     self.assertEqual(1, result[1])