def datasets(self, params: SemiSupervisedParams):
    """Build the semi-supervised dataloaders.

    Splits the train set into supervised / unlabelled / validation parts,
    wraps each in a DataLoader (the unlabelled loader yields a weak and a
    strong view per sample) and registers train/eval/test bundles.
    """
    dataset_fn = datasets[params.dataset]
    test_x, test_y = dataset_fn(False)
    train_x, train_y = dataset_fn(True)

    sup_idx, unsup_idx, val_idx = splits.semi_split(
        train_y,
        n_percls=params.n_percls,
        val_size=params.val_size,
        repeat_sup=False)
    self.logger.info('sup/unsup/val : {}'.format(
        (len(sup_idx), len(unsup_idx), len(val_idx))))

    mean, std = norm_val.get(params.dataset, [None, None])
    to_tensor = ToNormTensor(mean, std)
    weak = Weak(mean, std)
    strong = Strong(mean, std)

    # supervised subset; pad it virtually if smaller than one batch
    sup_set = (DatasetBuilder(train_x, train_y)
               .add_x(transform=weak)
               .add_y()
               .subset(sup_idx))
    if len(sup_set) < params.batch_size:
        sup_set.virtual_sample(params.batch_size)

    # unlabelled subset: sample id + weak view + strong view + label
    unsup_set = (DatasetBuilder(train_x, train_y)
                 .toggle_id()
                 .add_x(transform=weak)
                 .add_x(transform=strong)
                 .add_y()
                 .subset(unsup_idx))
    self.cl_set = unsup_set

    sup_dataloader = sup_set.DataLoader(batch_size=params.batch_size,
                                        num_workers=params.num_workers,
                                        shuffle=True)
    self.sup_dataloader = sup_dataloader

    # the unlabelled loader uses a uratio-times larger batch
    unsup_dataloader = unsup_set.DataLoader(
        batch_size=params.batch_size * params.uratio,
        num_workers=1,
        shuffle=True)
    self.unsup_dataloader = DataBundler().add(unsup_dataloader).to(self.device)

    val_dataloader = (DatasetBuilder(train_x[val_idx], train_y[val_idx])
                      .add_x(transform=to_tensor)
                      .add_y()
                      .DataLoader(batch_size=params.batch_size,
                                  num_workers=params.num_workers,
                                  shuffle=True))
    test_dataloader = (DatasetBuilder(test_x, test_y)
                      .add_x(transform=to_tensor)
                      .add_y()
                      .DataLoader(batch_size=params.batch_size,
                                  num_workers=params.num_workers,
                                  shuffle=True))

    # the (smaller) supervised loader is cycled against the unlabelled one
    train_bundler = (DataBundler()
                     .cycle(sup_dataloader)
                     .add(unsup_dataloader)
                     .zip_mode())
    self.regist_databundler(train=train_bundler,
                            eval=val_dataloader,
                            test=test_dataloader)
    self.to(self.device)
def datasets(self, params: GlobalParams):
    """Build train/val/test dataloaders where each train sample yields two
    independently augmented views (sample id + view1 + view2 + label).

    Fix: the original ended with four bare debug ``print()`` calls; they are
    routed through ``self.logger`` for consistency with the other dataset
    builders in this file.  Local typo ``val_datalaoder`` renamed.
    """
    from torchvision import transforms
    dataset_fn = datasets[params.dataset]
    test_x, test_y = dataset_fn(False)
    train_x, train_y = dataset_fn(True)
    train_idx, val_idx = splits.train_val_split(train_y,
                                                val_size=params.val_size)

    mean, std = norm_val.get(params.dataset, [None, None])
    toTensor = ToNormTensor(mean, std)
    # random crop / flip / jitter / grayscale augmentation pipeline
    train_transform = transforms.Compose([
        transforms.RandomResizedCrop(size=32, scale=(0.2, 1.)),
        transforms.RandomHorizontalFlip(),
        transforms.RandomApply(
            [transforms.ColorJitter(0.4, 0.4, 0.4, 0.1)], p=0.8),
        transforms.RandomGrayscale(p=0.2),
        transforms.ToTensor(),
        transforms.Normalize(mean=mean, std=std),
    ])

    test_dataloader = (DatasetBuilder(test_x, test_y)
                       .add_x(transform=toTensor)
                       .add_y()
                       .DataLoader(batch_size=params.batch_size,
                                   num_workers=params.num_workers))

    # two add_x calls with the same stochastic transform -> two distinct views
    train_set = (DatasetBuilder(train_x, train_y)
                 .toggle_id()
                 .add_x(transform=train_transform)
                 .add_x(transform=train_transform)
                 .add_y()
                 .subset(train_idx))
    if params.distributed:
        from torch.utils.data.distributed import DistributedSampler
        sampler = DistributedSampler(train_set)
    else:
        sampler = None
    self.train_size = len(train_set)
    # shuffle and sampler are mutually exclusive in torch's DataLoader
    train_dataloader = train_set.DataLoader(
        batch_size=params.batch_size,
        num_workers=params.num_workers,
        sampler=sampler,
        shuffle=not params.distributed)

    val_dataloader = (DatasetBuilder(train_x, train_y)
                      .add_x(transform=toTensor)
                      .add_y()
                      .subset(val_idx)
                      .DataLoader(batch_size=params.batch_size,
                                  num_workers=params.num_workers))

    self.regist_databundler(train=train_dataloader,
                            eval=val_dataloader,
                            test=test_dataloader)
    self.logger.info('dataloader in rank {}'.format(self.params.local_rank))
    self.logger.info(self.params.local_rank, self.train_dataloader)
    self.logger.info(self.params.local_rank, self._databundler_dict)
    self.logger.info(self.params.local_rank, train_dataloader)
    self.to(self.device)
def datasets(self, params: DivideMixParams):
    """Build DivideMix dataloaders: a shuffled double-batch noisy train
    loader, an unshuffled per-sample evaluation loader over the noisy
    labels, and a test loader.
    """
    from data.dataxy import datasets
    dataset_fn = datasets[params.dataset]
    test_x, test_y = dataset_fn(False)
    train_x, train_y = dataset_fn(True)

    mean, std = norm_val.get(params.dataset, [None, None])
    to_tensor = ToNormTensor(mean, std)
    weak = Weak(mean, std)
    strong = Strong(mean, std)

    # corrupt labels according to the configured noise model
    if params.noisy_type == 'asymmetric':
        from data.noisy import asymmetric_noisy
        noisy_y = asymmetric_noisy(train_y, params.noisy_ratio,
                                   n_classes=params.n_classes)
    elif params.noisy_type == 'symmetric':
        from data.noisy import symmetric_noisy
        noisy_y = symmetric_noisy(train_y, params.noisy_ratio,
                                  n_classes=params.n_classes)
    else:
        assert False, params.noisy_type

    self.train_set_pack = [train_x, np.array(train_y), noisy_y]
    self.logger.info('noisy acc = {}'.format((train_y == noisy_y).mean()))

    # id + weak view + strong view + true label + noisy label
    train_set = (DatasetBuilder(train_x, train_y)
                 .add_labels(noisy_y, 'noisy_y')
                 .toggle_id()
                 .add_x(transform=weak)
                 .add_x(transform=strong)
                 .add_y()
                 .add_y(source='noisy_y'))
    train_dataloader = train_set.DataLoader(
        batch_size=params.batch_size * 2,
        num_workers=params.num_workers,
        shuffle=True)

    from thexp import DataBundler
    # deterministic order (shuffle=False) so per-sample stats line up with ids
    eval_train_loader = (DatasetBuilder(train_x, noisy_y)
                         .toggle_id()
                         .add_x(transform=to_tensor)
                         .add_y()
                         .DataLoader(batch_size=params.batch_size,
                                     num_workers=params.num_workers // 2,
                                     shuffle=False))
    self.eval_train_dataloader = (DataBundler()
                                  .add(eval_train_loader)
                                  .to(self.device))

    test_dataloader = (DatasetBuilder(test_x, test_y)
                       .add_x(transform=to_tensor)
                       .add_y()
                       .DataLoader(batch_size=params.batch_size,
                                   num_workers=params.num_workers // 2,
                                   shuffle=False))

    self.regist_databundler(train=train_dataloader, test=test_dataloader)
    self.to(self.device)
def datasets(self, params: SemiSupervisedParams):
    """Build semi-supervised dataloaders where each unlabelled sample is
    drawn as K independently weakly-augmented views.
    """
    dataset_fn = datasets[params.dataset]
    test_x, test_y = dataset_fn(False)
    train_x, train_y = dataset_fn(True)

    sup_idx, unsup_idx, val_idx = splits.semi_split(
        train_y, n_percls=params.n_percls, val_size=5000, repeat_sup=False)

    mean, std = norm_val.get(params.dataset, [None, None])
    to_tensor = ToNormTensor(mean, std)
    weak = Weak(mean, std)

    sup_set = (DatasetBuilder(train_x, train_y)
               .add_x(transform=weak)
               .add_y()
               .subset(sup_idx))

    params.K = params.default(2, True)
    unsup_set = DatasetBuilder(train_x, train_y)
    for _ in range(params.K):  # K stochastic weak views per sample
        unsup_set.add_x(transform=weak)
    unsup_set = unsup_set.add_y().subset(unsup_idx)

    sup_loader = sup_set.DataLoader(batch_size=params.batch_size,
                                    num_workers=params.num_workers,
                                    shuffle=True)
    unsup_loader = unsup_set.DataLoader(batch_size=params.batch_size,
                                        num_workers=params.num_workers,
                                        shuffle=True)
    val_loader = (DatasetBuilder(train_x[val_idx], train_y[val_idx])
                  .add_x(transform=to_tensor)
                  .add_y()
                  .DataLoader(batch_size=params.batch_size,
                              num_workers=params.num_workers,
                              shuffle=True))
    test_loader = (DatasetBuilder(test_x, test_y)
                   .add_x(transform=to_tensor)
                   .add_y()
                   .DataLoader(batch_size=params.batch_size,
                               num_workers=params.num_workers,
                               shuffle=True))

    # cycle the small supervised loader against the unlabelled loader
    bundler = DataBundler().cycle(sup_loader).add(unsup_loader).zip_mode()
    self.regist_databundler(train=bundler,
                            eval=val_loader,
                            test=test_loader)
    self.to(self.device)
def datasets(self, params: GlobalParams):
    """Build train/val/test dataloaders; each train sample yields a weak and
    a strong augmented view (sample id + weak + strong + label).

    Fix: the original ended with four bare debug ``print()`` calls; they are
    routed through ``self.logger`` for consistency with the other dataset
    builders in this file.  Local typo ``val_datalaoder`` renamed.
    """
    dataset_fn = datasets[params.dataset]
    test_x, test_y = dataset_fn(False)
    train_x, train_y = dataset_fn(True)
    train_idx, val_idx = splits.train_val_split(train_y,
                                                val_size=params.val_size)

    mean, std = norm_val.get(params.dataset, [None, None])
    toTensor = ToNormTensor(mean, std)
    weak = Weak(mean, std)
    strong = Strong(mean, std)

    test_dataloader = (DatasetBuilder(test_x, test_y)
                       .add_x(transform=toTensor)
                       .add_y()
                       .DataLoader(batch_size=params.batch_size,
                                   num_workers=params.num_workers))

    train_set = (DatasetBuilder(train_x, train_y)
                 .toggle_id()
                 .add_x(transform=weak)
                 .add_x(transform=strong)
                 .add_y()
                 .subset(train_idx))
    if params.distributed:
        from torch.utils.data.distributed import DistributedSampler
        sampler = DistributedSampler(train_set)
    else:
        sampler = None
    self.train_size = len(train_set)
    # shuffle and sampler are mutually exclusive in torch's DataLoader
    train_dataloader = train_set.DataLoader(
        batch_size=params.batch_size,
        num_workers=params.num_workers,
        sampler=sampler,
        shuffle=not params.distributed)

    val_dataloader = (DatasetBuilder(train_x, train_y)
                      .add_x(transform=toTensor)
                      .add_y()
                      .subset(val_idx)
                      .DataLoader(batch_size=params.batch_size,
                                  num_workers=params.num_workers))

    self.regist_databundler(train=train_dataloader,
                            eval=val_dataloader,
                            test=test_dataloader)
    self.logger.info('dataloader in rank {}'.format(self.params.local_rank))
    self.logger.info(self.params.local_rank, self.train_dataloader)
    self.logger.info(self.params.local_rank, self._databundler_dict)
    self.logger.info(self.params.local_rank, train_dataloader)
    self.to(self.device)
def datasets(self, params: NoisyParams):
    """Build two alternative noisy-label training sets and their loaders.

    Labels are corrupted symmetrically; samples are then partitioned into
    clean ones and two halves of the corrupted ones.  The "first" set pairs
    clean samples with half of the corrupted ones, the "second" set pairs
    clean samples with the other half.  Training starts on the first set
    (``self.second = False``, ``self.cur_set = 0``).

    Fixes vs. original:
    - ``clean_x + raw_x`` / ``clean_x + change_x`` performed elementwise
      ndarray addition (and raises on unequal lengths); the label arrays
      already used ``np.concatenate``, so the sample arrays now do too.
    - ``self.second_dataloader`` was constructed twice with identical
      arguments; the duplicate is removed.
    - dropped the unused function-local ``Strong`` import.
    """
    self.rnd.mark('kk')
    params.noisy_type = params.default('symmetric', True)
    params.noisy_ratio = params.default(0.2, True)

    from data.constant import norm_val
    mean, std = norm_val.get(params.dataset, [None, None])
    from data.transforms import ToNormTensor
    toTensor = ToNormTensor(mean, std)
    from data.transforms import Weak
    weak = Weak(mean, std)

    dataset_fn = datasets.datasets[params.dataset]
    train_x, train_y = dataset_fn(True)
    train_y = np.array(train_y)

    from thexp import DatasetBuilder
    from data.noisy import symmetric_noisy

    noisy_y = symmetric_noisy(train_y, params.noisy_ratio,
                              n_classes=params.n_classes)
    clean_mask = (train_y == noisy_y)
    noisy_mask = np.logical_not(clean_mask)
    noisy_mask = np.where(noisy_mask)[0]
    # split the corrupted samples into two halves
    nmask_a = noisy_mask[:len(noisy_mask) // 2]
    nmask_b = noisy_mask[len(noisy_mask) // 2:]

    clean_x, clean_y = train_x[clean_mask], noisy_y[clean_mask]
    clean_true_y = train_y[clean_mask]
    raw_x, raw_true_y = train_x[nmask_a], train_y[nmask_a]
    raw_y = noisy_y[nmask_a]
    change_x, change_true_y, change_y = train_x[nmask_b], train_y[
        nmask_b], noisy_y[nmask_b]

    first_x, first_y, first_true_y = (
        np.concatenate([clean_x, raw_x]),
        np.concatenate([clean_y, raw_y]),
        np.concatenate([clean_true_y, raw_true_y]),
    )
    second_x, second_y, second_true_y = (
        np.concatenate([clean_x, change_x]),
        np.concatenate([clean_y, change_y]),
        np.concatenate([clean_true_y, change_true_y]),
    )

    # id + weak view + true label + noisy label
    first_set = (DatasetBuilder(first_x, first_true_y)
                 .add_labels(first_y, 'noisy_y')
                 .toggle_id()
                 .add_x(transform=weak)
                 .add_y()
                 .add_y(source='noisy_y'))
    second_set = (DatasetBuilder(second_x, second_true_y)
                  .add_labels(second_y, 'noisy_y')
                  .toggle_id()
                  .add_x(transform=weak)
                  .add_y()
                  .add_y(source='noisy_y'))

    self.first_dataloader = first_set.DataLoader(
        batch_size=params.batch_size,
        num_workers=params.num_workers,
        drop_last=True,
        shuffle=True)
    self.second_dataloader = second_set.DataLoader(
        batch_size=params.batch_size,
        num_workers=params.num_workers,
        drop_last=True,
        shuffle=True)

    self.second = False
    self.regist_databundler(train=self.first_dataloader)
    self.cur_set = 0
    self.to(self.device)
def datasets(self, params: MnistImblanceParams):
    """Build an imbalanced two-class MNIST setup.

    Keeps only ``params.train_classes``, relabels them to 0..n-1, splits off
    a validation set, then shrinks the second class by
    ``params.train_proportion`` to create the imbalance.
    """
    super().datasets(params)
    from data.dataxy import mnist

    test_x, test_y = mnist(False)
    train_x, train_y = mnist(True)
    train_y = np.array(train_y, dtype=np.float32)
    test_y = np.array(test_y, dtype=np.float32)

    # collect the indices of the selected classes (e.g. [4, 9])
    train_cls_idx = [np.where(train_y == i)[0] for i in params.train_classes]
    test_cls_idx = [np.where(test_y == i)[0] for i in params.train_classes]

    # relabel the selected classes to 0, 1, ...
    for new_cls, _ in enumerate(params.train_classes):
        train_y[train_cls_idx[new_cls]] = new_cls
        test_y[test_cls_idx[new_cls]] = new_cls

    keep_train = np.concatenate(train_cls_idx)
    keep_test = np.concatenate(test_cls_idx)
    test_x, test_y = test_x[keep_test], test_y[keep_test]
    train_x, train_y = train_x[keep_train], train_y[keep_train]

    # split train/val dataset
    train_ids, val_ids = splits.train_val_split(train_y,
                                                val_size=params.val_size)
    train_x, val_x = train_x[train_ids], train_x[val_ids]
    train_y, val_y = train_y[train_ids], train_y[val_ids]

    # shrink the second class relative to the first to create imbalance
    train_cls_idx = [
        np.where(train_y == i)[0] for i in range(len(params.train_classes))
    ]
    sec_cls_size = int(
        (1 - params.train_proportion) * len(train_cls_idx[0]))
    train_cls_idx[1] = train_cls_idx[1][:sec_cls_size]
    keep_train = np.concatenate(train_cls_idx)
    train_x, train_y = train_x[keep_train], train_y[keep_train]

    to_tensor = ToNormTensor((0.1307, ), (0.3081, ))

    train_loader = (DatasetBuilder(train_x, train_y)
                    .add_x(to_tensor)
                    .add_y()
                    .DataLoader(batch_size=params.batch_size,
                                num_workers=params.num_workers,
                                shuffle=True))
    val_loader = (DatasetBuilder(val_x, val_y)
                  .add_x(transform=to_tensor)
                  .add_y()
                  .DataLoader(batch_size=params.batch_size,
                              num_workers=params.num_workers,
                              shuffle=True))
    test_loader = (DatasetBuilder(test_x, test_y)
                   .add_x(transform=to_tensor)
                   .add_y()
                   .DataLoader(batch_size=params.batch_size,
                               num_workers=params.num_workers,
                               shuffle=True))

    # zip train batches with cycled validation batches
    train_bundler = (DataBundler()
                     .add(train_loader)
                     .cycle(val_loader)
                     .zip_mode())
    self.regist_databundler(train=train_bundler,
                            eval=val_loader,
                            test=test_loader)
    self.to(self.device)
def datasets(self, params: NoisyParams):
    """Build noisy-label dataloaders with an extra clean "query" split.

    ``semi_split`` carves the train set into a query split, the noisy train
    split and an eval split; train labels are then corrupted.  The train
    loader is zipped with a cycled query loader.

    Fix: in the distributed branch for the query loader, the original built
    ``DistributedSampler(train_set, ...)`` — but ``train_set`` is already a
    DataLoader at that point; the sampler must cover ``query_set``.
    """
    params.noisy_type = params.default('symmetric', True)
    params.noisy_ratio = params.default(0.2, True)
    import numpy as np

    dataset_fn = datasets[params.dataset]
    test_x, test_y = dataset_fn(False)
    train_x, train_y = dataset_fn(True)

    query_ids, train_ids, eval_ids = splits.semi_split(
        train_y,
        params.query_size // params.n_classes,
        val_size=params.val_size,
        repeat_sup=False)
    self.train_size = len(train_ids)
    train_x, query_x, eval_x = train_x[train_ids], train_x[
        query_ids], train_x[eval_ids]
    train_y, query_y, eval_y = train_y[train_ids], train_y[
        query_ids], train_y[eval_ids]

    mean, std = norm_val.get(params.dataset, [None, None])
    toTensor = ToNormTensor(mean, std)
    weak = Weak(mean, std)
    strong = Strong(mean, std)

    # corrupt only the train split's labels
    if params.noisy_type == 'asymmetric':
        from data.noisy import asymmetric_noisy
        noisy_y = asymmetric_noisy(train_y, params.noisy_ratio,
                                   n_classes=params.n_classes)
    elif params.noisy_type == 'symmetric':
        from data.noisy import symmetric_noisy
        noisy_y = symmetric_noisy(train_y, params.noisy_ratio,
                                  n_classes=params.n_classes)
    else:
        assert False, params.noisy_type
    self.logger.info('noisy acc = {}'.format((train_y == noisy_y).mean()))

    self.rnd.shuffle()
    self.logger.info(len(train_y), len(train_x), len(noisy_y))

    # id + strong view (+ K extra weak views) + true label + noisy label
    train_set = (DatasetBuilder(train_x, train_y)
                 .add_labels(noisy_y, 'noisy_y')
                 .toggle_id()
                 .add_x(transform=strong))
    params.K = params.default(0, True)
    for _ in range(params.K):
        train_set.add_x(transform=weak)
    train_set = train_set.add_y().add_y(source='noisy_y')

    if params.distributed:
        from torch.utils.data.distributed import DistributedSampler
        # NOTE(review): num_replicas hard-coded to 4 — confirm world size
        sampler = DistributedSampler(train_set, num_replicas=4)
        self.sampler_a = sampler
    else:
        sampler = None
    train_set = train_set.DataLoader(batch_size=params.batch_size,
                                     num_workers=params.num_workers,
                                     sampler=sampler,
                                     shuffle=not params.distributed)

    query_set = (DatasetBuilder(query_x,
                                query_y).add_x(transform=strong).add_y())
    if params.distributed:
        from torch.utils.data.distributed import DistributedSampler
        # BUG FIX: was DistributedSampler(train_set, ...) — train_set is a
        # DataLoader here; the query loader must be sampled from query_set.
        sampler = DistributedSampler(query_set, num_replicas=4)
        self.sampler_b = sampler
    else:
        sampler = None
    query_set = query_set.DataLoader(batch_size=params.batch_size,
                                     num_workers=params.num_workers,
                                     sampler=sampler,
                                     shuffle=not params.distributed)

    val_dataloader = (
        DatasetBuilder(eval_x, eval_y).add_x(transform=toTensor).add_y().
        DataLoader(
            batch_size=params.batch_size,
            shuffle=False,  # no shuffle for probe, so a batch is class balanced.(?)
            num_workers=params.num_workers))

    train_dataloader = DataBundler().add(train_set).cycle(
        query_set).zip_mode()

    test_dataloader = (DatasetBuilder(test_x, test_y)
                       .add_x(transform=toTensor)
                       .add_y()
                       .DataLoader(batch_size=params.batch_size,
                                   num_workers=params.num_workers,
                                   shuffle=True))

    self.regist_databundler(train=train_dataloader,
                            eval=val_dataloader,
                            test=test_dataloader)
    self.to(self.device)
def datasets(self, params: NoisyParams):
    """Build noisy-label train/val/test dataloaders, optionally ordering the
    train set with a ``NoisySampler`` over the clean mask.

    Fix: when ``params.order_sampler`` supplied a custom sampler, the
    original still passed ``shuffle=True`` — torch's DataLoader raises a
    ValueError when both ``sampler`` and ``shuffle`` are set.  Shuffling now
    happens only when no sampler is given, which is the same behavior in the
    default (``sampler is None``) path.
    """
    self.rnd.mark('fix_noisy')
    params.noisy_type = params.default('symmetric', True)
    params.noisy_ratio = params.default(0.2, True)

    dataset_fn = datasets[params.dataset]
    test_x, test_y = dataset_fn(False)
    train_x, train_y = dataset_fn(True)

    train_ids, val_ids = splits.train_val_split(train_y, val_size=5000)
    train_x, val_x = train_x[train_ids], train_x[val_ids]
    train_y, val_y = train_y[train_ids], train_y[val_ids]

    mean, std = norm_val.get(params.dataset, [None, None])
    toTensor = ToNormTensor(mean, std)
    weak = Weak(mean, std)
    strong = Strong(mean, std)

    # corrupt labels according to the configured noise model
    if params.noisy_type == 'asymmetric':
        from data.noisy import asymmetric_noisy
        noisy_y = asymmetric_noisy(train_y, params.noisy_ratio,
                                   n_classes=params.n_classes)
    elif params.noisy_type == 'symmetric':
        from data.noisy import symmetric_noisy
        noisy_y = symmetric_noisy(train_y, params.noisy_ratio,
                                  n_classes=params.n_classes)
    else:
        assert False, params.noisy_type

    clean_mask = train_y == noisy_y
    self.logger.info('noisy acc = {}'.format((train_y == noisy_y).mean()))
    self.rnd.shuffle()

    # id + weak view + strong view + true label + noisy label
    train_set = (DatasetBuilder(train_x, train_y)
                 .add_labels(noisy_y, 'noisy_y')
                 .toggle_id()
                 .add_x(transform=weak)
                 .add_x(transform=strong)
                 .add_y()
                 .add_y(source='noisy_y'))

    from thextra.noisy_sampler import NoisySampler
    sampler = None
    if params.order_sampler:
        sampler = NoisySampler(train_set, clean_mask)

    self.train_size = len(train_set)
    # BUG FIX: shuffle must be False when a custom sampler is supplied
    train_dataloader = train_set.DataLoader(batch_size=params.batch_size,
                                            num_workers=params.num_workers,
                                            drop_last=True,
                                            sampler=sampler,
                                            shuffle=sampler is None)

    val_dataloader = (DatasetBuilder(val_x, val_y)
                      .add_x(transform=toTensor)
                      .add_y()
                      .DataLoader(batch_size=params.batch_size,
                                  num_workers=params.num_workers,
                                  shuffle=True))
    test_dataloader = (DatasetBuilder(test_x, test_y)
                       .add_x(transform=toTensor)
                       .add_y()
                       .DataLoader(batch_size=params.batch_size,
                                   num_workers=params.num_workers,
                                   shuffle=True))

    self.regist_databundler(train=train_dataloader,
                            eval=val_dataloader,
                            test=test_dataloader)
    self.to(self.device)
def datasets(self, params: EvalParams):
    """Build the two evaluation training sets selected by params.eval_mode.

    The "first" set is either the clean-labelled subset (modes 'full',
    'same_epoch', 'same_acc') or the whole noisy set (modes 'mix', 'raw',
    'direct'); the "second" set always holds the corrupted samples.
    Training starts on the first set; in 'direct' mode the dataset is
    switched immediately via ``self.change_dataset()``.
    """
    params.noisy_type = params.default('symmetric', True)
    params.noisy_ratio = params.default(0.2, True)

    from data.constant import norm_val
    mean, std = norm_val.get(params.dataset, [None, None])
    from data.transforms import ToNormTensor
    toTensor = ToNormTensor(mean, std)
    from data.transforms import Weak
    weak = Weak(mean, std)
    from data.transforms import Strong

    dataset_fn = datasets.datasets[params.dataset]
    train_x, train_y = dataset_fn(True)
    train_y = np.array(train_y)

    from thexp import DatasetBuilder
    from data.noisy import symmetric_noisy

    # corrupt labels, then partition samples by whether the label survived
    noisy_y = symmetric_noisy(train_y, params.noisy_ratio,
                              n_classes=params.n_classes)
    clean_mask = (train_y == noisy_y)
    noisy_mask = np.logical_not(clean_mask)

    if params.eval_mode in ['full', 'same_epoch', 'same_acc']:
        first_x, first_y = train_x[clean_mask], noisy_y[clean_mask]
        first_true_y = train_y[clean_mask]
    elif params.eval_mode in ['mix', 'raw', 'direct']:
        first_x, first_y = train_x, noisy_y
        first_true_y = train_y
    else:
        assert False

    second_x, second_true_y = train_x[noisy_mask], train_y[noisy_mask]
    second_y = noisy_y[noisy_mask]

    self.logger.info('noisy acc = {}'.format(
        (first_true_y == first_y).mean()))
    self.logger.info('noisy acc = {}'.format(
        (second_true_y == second_y).mean()))
    self.rnd.shuffle()

    def _build_set(xs, true_ys, noisy_ys):
        # id + weak view + true label + noisy label
        return (DatasetBuilder(xs, true_ys)
                .add_labels(noisy_ys, 'noisy_y')
                .toggle_id()
                .add_x(transform=weak)
                .add_y()
                .add_y(source='noisy_y'))

    first_set = _build_set(first_x, first_true_y, first_y)
    noisy_set = _build_set(second_x, second_true_y, second_y)

    first_dataloader = first_set.DataLoader(batch_size=params.batch_size,
                                            num_workers=params.num_workers,
                                            drop_last=True,
                                            shuffle=True)
    self.second_dataloader = noisy_set.DataLoader(
        batch_size=params.batch_size,
        num_workers=params.num_workers,
        drop_last=True,
        shuffle=True)

    self.second = False
    self.regist_databundler(train=first_dataloader)
    self.to(self.device)
    if params.eval_mode == 'direct':
        self.change_dataset()