def __init__(self, r, transform, mode, pred=[], probability=[], log=''):
        """Populate the dataset from the CheXpert loaders according to ``mode``.

        Args:
            r: noise ratio; forwarded to ``get_chexpert_loaders``.
            transform: stored on the instance as ``self.transform``.
            mode: 'test', 'all', 'labeled' or 'unlabeled'. Any other value
                leaves the instance without data attributes.
            pred: per-sample mask; non-zero entries form the 'labeled' subset
                and zero entries the 'unlabeled' one (presumably a 0/1 NumPy
                array -- confirm against the caller).
            probability: per-sample scores, indexed by sample position.
            log: writable object; only used in 'labeled' mode.
        """
        self.r = r  # noise ratio
        self.transform = transform
        self.mode = mode
        train_loader, val_loader = get_chexpert_loaders(r, batch_size=32)

        # Test split: nothing from the training loader is needed.
        if self.mode == 'test':
            self.test_data = val_loader.get_all_samples()
            self.test_label = val_loader.get_all_real_ground_truth()
            return

        clean_labels = train_loader.get_all_real_ground_truth()
        samples = train_loader.get_all_samples()
        noisy_labels = train_loader.get_all_labels()

        # Whole training set, untouched.
        if self.mode == 'all':
            self.train_data = samples
            self.noise_label = noisy_labels
            return

        if self.mode == 'labeled':
            keep = pred.nonzero()[0]
            self.probability = [probability[k] for k in keep]

            # Report how well `probability` separates clean from noisy labels.
            is_clean = np.array(noisy_labels) == np.array(clean_labels)
            meter = AUCMeter()
            meter.reset()
            meter.add(probability, is_clean)
            auc, _, _ = meter.value()
            log.write('Numer of labeled samples:%d   AUC:%.3f\n' %
                      (pred.sum(), auc))
            log.flush()
        elif self.mode == "unlabeled":
            keep = (1 - pred).nonzero()[0]
        else:
            # Unrecognised mode: nothing further is stored.
            return

        # Shared tail for the two subset modes.
        self.train_data = samples[keep]
        self.noise_label = noisy_labels[keep]
        print("%s data has a size of %d" %
              (self.mode, len(self.noise_label)))
예제 #2
0
    def __init__(self, dataset, r, noise_mode, root_dir, transform, mode, noise_file='', pred=[], probability=[], log='', teacher_idx=None, truncate_mode=None, refinement=None):
        """Load a CIFAR split and, for training modes, its noisy labels.

        Args:
            dataset: 'cifar10' or 'cifar100'; selects which pickled files
                under ``root_dir`` are read.
            r: noise ratio; ``int(r * 50000)`` training labels are re-drawn
                when noise has to be injected.
            noise_mode: 'sym' (label drawn uniformly from all classes, so it
                may equal the true label) or 'asym' (label mapped through
                ``self.transition``).
            root_dir: directory holding the pickled CIFAR batches.
            transform: stored on the instance as ``self.transform``.
            mode: 'test', 'all', 'labeled', 'unlabeled', 'labeled_svd' or
                'unlabeled_svd'.
            noise_file: JSON file with the noisy labels; read when it exists,
                otherwise generated and written.
            pred: per-sample mask; non-zero entries are treated as clean
                (presumably a 0/1 NumPy array -- confirm against the caller).
            probability: per-sample scores, indexed by sample position.
            log: writable object used for the labeled-sample report.
            teacher_idx: sample indices used by the truncate / SVD modes; must
                provide ``tolist()`` where those modes use it.
            truncate_mode: when 'initial', subsets are restricted through
                ``teacher_idx``.
            refinement: when truthy, the SVD modes intersect ``pred`` with
                ``teacher_idx``.
        """
        self.r = r  # noise ratio
        self.transform = transform
        self.mode = mode
        # class transition for asymmetric noise
        self.transition = {0: 0, 2: 0, 4: 7, 7: 7, 1: 1, 9: 1, 3: 5, 5: 3, 6: 6, 8: 8}

        # For distill test
        self.teacher_idx = teacher_idx
        self.truncate_mode = truncate_mode
        self.train_label = None
        self.refinement = refinement

        if self.mode == 'test':
            if dataset == 'cifar10':
                test_dic = unpickle('%s/test_batch' % root_dir)
                self.test_data = test_dic['data']
                self.test_data = self.test_data.reshape((10000, 3, 32, 32))
                self.test_data = self.test_data.transpose((0, 2, 3, 1))
                self.test_label = test_dic['labels']
            elif dataset == 'cifar100':
                test_dic = unpickle('%s/test' % root_dir)
                self.test_data = test_dic['data']
                self.test_data = self.test_data.reshape((10000, 3, 32, 32))
                self.test_data = self.test_data.transpose((0, 2, 3, 1))
                self.test_label = test_dic['fine_labels']
        else:
            train_data = []
            train_label = []
            if dataset == 'cifar10':
                for n in range(1, 6):
                    dpath = '%s/data_batch_%d' % (root_dir, n)
                    data_dic = unpickle(dpath)
                    train_data.append(data_dic['data'])
                    train_label = train_label + data_dic['labels']
                train_data = np.concatenate(train_data)
            elif dataset == 'cifar100':
                train_dic = unpickle('%s/train' % root_dir)
                train_data = train_dic['data']
                train_label = train_dic['fine_labels']
            # (N, C, H, W) -> (N, H, W, C)
            train_data = train_data.reshape((50000, 3, 32, 32))
            train_data = train_data.transpose((0, 2, 3, 1))
            self.train_label = train_label

            if os.path.exists(noise_file):
                # Context manager so the handle is closed deterministically.
                with open(noise_file, "r") as noise_fh:
                    noise_label = json.load(noise_fh)
            else:    # inject noise
                fix_seed()
                noise_label = []
                idx = list(range(50000))
                random.shuffle(idx)
                num_noise = int(self.r * 50000)
                # A set gives O(1) membership tests in the loop below; the
                # list slice made every test an O(n) scan.
                noise_idx = set(idx[:num_noise])
                for i in range(50000):
                    if i in noise_idx:
                        if noise_mode == 'sym':
                            if dataset == 'cifar10':
                                noiselabel = random.randint(0, 9)
                            elif dataset == 'cifar100':
                                noiselabel = random.randint(0, 99)
                            noise_label.append(noiselabel)
                        elif noise_mode == 'asym':
                            noiselabel = self.transition[train_label[i]]
                            noise_label.append(noiselabel)
                    else:
                        noise_label.append(train_label[i])
                print("save noisy labels to %s ..." % noise_file)
                with open(noise_file, "w") as noise_fh:
                    json.dump(noise_label, noise_fh)

            if self.mode == 'all':
                self.train_data = train_data
                self.noise_label = noise_label
                if self.truncate_mode == 'initial':
                    # Keep only the teacher-selected samples.
                    self.train_data = self.train_data[teacher_idx]
                    self.noise_label = [noise_label[i] for i in teacher_idx]
            else:
                if self.mode == "labeled":
                    pred_idx = pred.nonzero()[0]
                    if self.truncate_mode == 'initial':
                        # labeled = predicted-clean AND teacher-selected
                        pred_idx = pred_idx.tolist()
                        teacher_idx = teacher_idx.tolist()
                        pred_idx = list(set(pred_idx) & set(teacher_idx))
                        pred_idx = torch.tensor(pred_idx)

                    self.probability = [probability[i] for i in pred_idx]

                    # AUC of `probability` against the true clean/noisy split.
                    clean = (np.array(noise_label) == np.array(train_label))
                    auc_meter = AUCMeter()
                    auc_meter.reset()
                    auc_meter.add(probability, clean)
                    auc, _, _ = auc_meter.value()
                    log.write('Numer of labeled samples:%d   AUC:%.3f\n' % (pred.sum(), auc))
                    log.flush()

                elif self.mode == "unlabeled":
                    pred_idx = (1 - pred).nonzero()[0]
                    if self.truncate_mode == 'initial':
                        # unlabeled = (everything outside teacher_idx)
                        #             OR predicted-noisy
                        whole_idx = list(range(50000))
                        pred_idx = pred_idx.tolist()
                        teacher_idx = teacher_idx.tolist()
                        tmp_set = set(whole_idx) - set(teacher_idx)
                        tmp_set = tmp_set | set(pred_idx)
                        pred_idx = torch.tensor(list(tmp_set))

                elif self.mode == "labeled_svd":
                    if self.refinement:
                        # labeled = predicted-clean AND teacher-selected
                        pred_idx = pred.nonzero()[0]
                        pred_idx_set = set(pred_idx.tolist())
                        teacher_idx_set = set(teacher_idx.tolist())
                        pred_idx = torch.tensor(list(pred_idx_set & teacher_idx_set))
                        self.probability = [probability[i] for i in pred_idx]

                        clean = (np.array(noise_label) == np.array(train_label))
                        auc_meter = AUCMeter()
                        auc_meter.reset()
                        auc_meter.add(probability, clean)
                        auc, _, _ = auc_meter.value()
                        log.write('Numer of labeled samples:%d   AUC:%.3f\n' % (pred.sum(), auc))
                        log.flush()
                    else:
                        # Without refinement the teacher selection is used
                        # as-is and every kept sample gets probability 1.
                        pred_idx = teacher_idx
                        probability = torch.ones(50000,)
                        self.probability = [probability[i] for i in pred_idx]

                        log.write('Number of labeled samples (by svd) : %d' % teacher_idx.shape[0])

                elif self.mode == "unlabeled_svd":
                    if self.refinement:
                        # Complement of (predicted-clean AND teacher-selected).
                        clean_pred_idx = pred.nonzero()[0]
                        clean_pred_idx_set = set(clean_pred_idx.tolist())
                        teacher_idx_set = set(teacher_idx.tolist())
                        all_idx_set = set(range(50000))
                        pred_idx = torch.tensor(list(all_idx_set - (clean_pred_idx_set & teacher_idx_set)))
                    else:
                        # Complement of the teacher selection.
                        pred_idx = torch.arange(0, 50000)
                        pred_idx_set = set(pred_idx.tolist()) - set(teacher_idx.tolist())
                        pred_idx = torch.tensor(list(pred_idx_set))

                self.train_data = train_data[pred_idx]
                self.noise_label = [noise_label[i] for i in pred_idx]
                print("%s data has a size of %d" % (self.mode, len(self.noise_label)))
예제 #3
0
    def __init__(self, dataset, r, noise_mode, root_dir, transform, mode, noise_file='', pred=[], probability=[], log=''):
        """Load a CIFAR split (with a 45000/5000 train/eval cut) and noisy labels.

        Args:
            dataset: 'cifar10' or 'cifar100'; selects which pickled files
                under ``root_dir`` are read.
            r: noise ratio; ``int(r * size)`` labels are re-drawn when noise
                has to be injected.
            noise_mode: 'sym' (label drawn uniformly from all classes, so it
                may equal the true label) or 'asym' (label mapped through
                ``self.transition``).
            root_dir: directory holding the pickled CIFAR batches.
            transform: stored on the instance as ``self.transform``.
            mode: 'test'/'test_average', 'eval'/'eval_average' (last 5000
                training samples with clean labels),
                'all'/'benchmark_all'/'benchmark_all_average' (all 50000),
                'train'/'benchmark'/'benchmark_average' (first 45000),
                'labeled' or 'unlabeled'.
            noise_file: JSON file with the noisy labels; read when it exists,
                otherwise generated and written.
            pred: per-sample mask; non-zero entries form the 'labeled' subset
                (presumably a 0/1 NumPy array -- confirm against the caller).
            probability: per-sample scores, indexed by sample position.
            log: writable object; only used in 'labeled' mode.
        """
        self.r = r  # noise ratio
        self.transform = transform
        self.mode = mode
        # class transition for asymmetric noise
        self.transition = {0: 0, 2: 0, 4: 7, 7: 7, 1: 1, 9: 1, 3: 5, 5: 3, 6: 6, 8: 8}

        if self.mode == 'test' or self.mode == 'test_average':
            if dataset == 'cifar10':
                test_dic = unpickle('%s/test_batch' % root_dir)
                self.test_data = test_dic['data']
                self.test_data = self.test_data.reshape((10000, 3, 32, 32))
                self.test_data = self.test_data.transpose((0, 2, 3, 1))
                self.test_label = test_dic['labels']
            elif dataset == 'cifar100':
                test_dic = unpickle('%s/test' % root_dir)
                self.test_data = test_dic['data']
                self.test_data = self.test_data.reshape((10000, 3, 32, 32))
                self.test_data = self.test_data.transpose((0, 2, 3, 1))
                self.test_label = test_dic['fine_labels']

        else:
            train_data = []
            train_label = []
            if dataset == 'cifar10':
                for n in range(1, 6):
                    dpath = '%s/data_batch_%d' % (root_dir, n)
                    data_dic = unpickle(dpath)
                    train_data.append(data_dic['data'])
                    train_label = train_label + data_dic['labels']
                train_data = np.concatenate(train_data)
            elif dataset == 'cifar100':
                train_dic = unpickle('%s/train' % root_dir)
                train_data = train_dic['data']
                train_label = train_dic['fine_labels']
            # (N, C, H, W) -> (N, H, W, C)
            train_data = train_data.reshape((50000, 3, 32, 32))
            train_data = train_data.transpose((0, 2, 3, 1))

            if self.mode == 'eval' or self.mode == 'eval_average':
                # Held-out tail of the training set, with clean labels.
                self.eval_data = train_data[45000:]
                self.eval_label = train_label[45000:]

            else:
                if os.path.exists(noise_file):
                    # Context manager so the handle is closed deterministically.
                    with open(noise_file, "r") as noise_fh:
                        noise_label = json.load(noise_fh)
                else:    # inject noise
                    noise_label = []
                    # NOTE: `size` is only bound for the two mode groups
                    # below, so 'labeled' / 'unlabeled' need an existing
                    # noise_file; otherwise the next use of `size` raises.
                    if self.mode in ['all', 'benchmark_all', 'benchmark_all_average']:
                        size = 50000
                    elif self.mode in ['train', 'benchmark', 'benchmark_average']:
                        size = 45000
                    idx = list(range(size))
                    random.shuffle(idx)
                    num_noise = int(self.r * size)
                    # A set gives O(1) membership tests in the loop below; the
                    # list slice made every test an O(n) scan.
                    noise_idx = set(idx[:num_noise])
                    for i in range(size):
                        if i in noise_idx:
                            if noise_mode == 'sym':
                                if dataset == 'cifar10':
                                    noiselabel = random.randint(0, 9)
                                elif dataset == 'cifar100':
                                    noiselabel = random.randint(0, 99)
                                noise_label.append(noiselabel)
                            elif noise_mode == 'asym':
                                noiselabel = self.transition[train_label[i]]
                                noise_label.append(noiselabel)
                        else:
                            noise_label.append(train_label[i])
                    print("save noisy labels to %s ..." % noise_file)
                    with open(noise_file, "w") as noise_fh:
                        json.dump(noise_label, noise_fh)

                if self.mode in ['all', 'benchmark_all', 'benchmark_all_average']:
                    self.train_data = train_data
                    self.noise_label = noise_label
                    self.clean_label = train_label

                elif self.mode in ['train', 'benchmark', 'benchmark_average']:
                    self.train_data = train_data[:45000]
                    self.noise_label = noise_label[:45000]
                    self.clean_label = train_label[:45000]

                else:
                    if self.mode == "labeled":
                        pred_idx = pred.nonzero()[0]
                        self.probability = [probability[i] for i in pred_idx]

                        # AUC of `probability` against the true clean/noisy split.
                        clean = (np.array(noise_label) == np.array(train_label))
                        auc_meter = AUCMeter()
                        auc_meter.reset()
                        auc_meter.add(probability, clean)
                        auc, _, _ = auc_meter.value()
                        log.write('Numer of labeled samples:%d   AUC:%.3f\n' % (pred.sum(), auc))
                        log.flush()

                    elif self.mode == "unlabeled":
                        pred_idx = (1 - pred).nonzero()[0]

                    self.train_data = train_data[pred_idx]
                    self.noise_label = [noise_label[i] for i in pred_idx]
                    print("%s data has a size of %d" % (self.mode, len(self.noise_label)))
    def __init__(self,
                 dataset,
                 noisy_dataset,
                 r,
                 on,
                 noise_mode,
                 root_dir,
                 noise_data_dir,
                 transform,
                 mode,
                 noise_file='',
                 pred=[],
                 probability=[],
                 log='',
                 targets=None):
        """Load CIFAR with a mix of open-set (image) and closed-set (label) noise.

        Args:
            dataset: 'cifar10' or 'cifar100'; selects which pickled files
                under ``root_dir`` are read ('clean' mode only loads cifar10).
            noisy_dataset: source of the open-set images. 'imagenet32' reads
                individual PNG files from ``noise_data_dir``; anything else
                unpickles ``<noise_data_dir>/train``.
            r: total noise ratio; ``int(r * 50000)`` samples are corrupted.
            on: proportion of the noisy samples that are open-set noise.
            noise_mode: 'sym' (label drawn uniformly from all classes) or
                'asym' (label mapped through ``self.transition``); applies to
                the closed-set part only.
            root_dir: directory holding the pickled CIFAR batches.
            noise_data_dir: directory holding the open-set image source.
            transform: stored on the instance as ``self.transform``.
            mode: 'test', 'clean', 'all', 'labeled' or 'unlabeled'.
            noise_file: JSON file holding 'noise_labels', 'open_noise' and
                'closed_noise'; read when it exists, otherwise generated and
                written ('clean' mode requires it to exist).
            pred: per-sample mask; non-zero entries select the subset
                (presumably a 0/1 NumPy array -- confirm against the caller).
            probability: per-sample scores, indexed by sample position.
            log: accepted for signature parity; not used by this method.
            targets: if given, replaces the noisy labels in 'all' mode.
        """
        self.r = r  # total noise ratio
        self.on = on  # proportion of open noise
        self.transform = transform
        self.mode = mode
        self.transition = {
            0: 0,
            2: 0,
            4: 7,
            7: 7,
            1: 1,
            9: 1,
            3: 5,
            5: 3,
            6: 6,
            8: 8
        }  # class transition for asymmetric noise
        self.open_noise = None
        self.closed_noise = None

        if self.mode == 'test':
            if dataset == 'cifar10':
                test_dic = unpickle('%s/test_batch' % root_dir)
                self.test_data = test_dic['data']
                self.test_data = self.test_data.reshape((10000, 3, 32, 32))
                self.test_data = self.test_data.transpose((0, 2, 3, 1))
                self.test_label = test_dic['labels']
            elif dataset == 'cifar100':
                test_dic = unpickle('%s/test' % root_dir)
                self.test_data = test_dic['data']
                self.test_data = self.test_data.reshape((10000, 3, 32, 32))
                self.test_data = self.test_data.transpose((0, 2, 3, 1))
                self.test_label = test_dic['fine_labels']

        elif self.mode == 'clean':
            if not os.path.exists(noise_file):
                print('Noise not defined')
                return

            # Both attributes were set to None just above, so this condition
            # currently always holds and the file is always read.
            if self.open_noise is None or self.closed_noise is not None:
                # Context manager so the handle is closed deterministically.
                with open(noise_file, "r") as noise_fh:
                    noise = json.load(noise_fh)
                noise_labels = noise['noise_labels']
                self.open_noise = noise['open_noise']
                self.closed_noise = noise['closed_noise']

            train_data = []
            train_label = []
            if dataset == 'cifar10':
                for n in range(1, 6):
                    dpath = '%s/data_batch_%d' % (root_dir, n)
                    data_dic = unpickle(dpath)
                    train_data.append(data_dic['data'])
                    train_label = train_label + data_dic['labels']
                train_data = np.concatenate(train_data)
            train_data = train_data.reshape((50000, 3, 32, 32))
            train_data = train_data.transpose((0, 2, 3, 1))
            # Clean samples = everything touched by neither kind of noise.
            open_noise = [item[0] for item in self.open_noise]
            clean_indices = list(
                set(range(50000)) - set(open_noise) - set(self.closed_noise))
            self.clean_data = train_data[clean_indices]
            self.clean_label = np.asarray(train_label)[clean_indices]

        else:
            train_data = []
            train_label = []
            if dataset == 'cifar10':
                for n in range(1, 6):
                    dpath = '%s/data_batch_%d' % (root_dir, n)
                    data_dic = unpickle(dpath)
                    train_data.append(data_dic['data'])
                    train_label = train_label + data_dic['labels']
                train_data = np.concatenate(train_data)
            elif dataset == 'cifar100':
                train_dic = unpickle('%s/train' % root_dir)
                train_data = train_dic['data']
                train_label = train_dic['fine_labels']
            # (N, C, H, W) -> (N, H, W, C)
            train_data = train_data.reshape((50000, 3, 32, 32))
            train_data = train_data.transpose((0, 2, 3, 1))
            if noisy_dataset == 'imagenet32':
                # Images are read one by one from PNG files further down.
                noise_data = None
            else:
                noise_data = unpickle(
                    '%s/train' % noise_data_dir)['data'].reshape(
                        (50000, 3, 32, 32)).transpose((0, 2, 3, 1))

            if os.path.exists(noise_file):
                with open(noise_file, "r") as noise_fh:
                    noise = json.load(noise_fh)
                noise_labels = noise['noise_labels']
                self.open_noise = noise['open_noise']
                self.closed_noise = noise['closed_noise']
                # Re-apply the recorded open-set image substitutions.
                for cleanIdx, noisyIdx in noise['open_noise']:
                    if noisy_dataset == 'imagenet32':
                        train_data[cleanIdx] = np.asarray(
                            Image.open('{}/{}.png'.format(
                                noise_data_dir,
                                str(noisyIdx + 1).zfill(7)))).reshape(
                                    (32, 32, 3))
                    else:
                        train_data[cleanIdx] = noise_data[noisyIdx]
            else:
                #inject noise
                noise_labels = []  # all labels (some noisy, some clean)
                idx = list(range(50000))  # indices of cifar dataset
                random.shuffle(idx)
                num_total_noise = int(self.r * 50000)  # total amount of noise
                num_open_noise = int(
                    self.on *
                    num_total_noise)  # total amount of noisy/openset images
                if noisy_dataset == 'imagenet32':  # indices of openset source images
                    target_noise_idx = list(range(1281149))
                else:
                    target_noise_idx = list(range(50000))
                random.shuffle(target_noise_idx)
                self.open_noise = list(
                    zip(idx[:num_open_noise], target_noise_idx[:num_open_noise]
                        ))  # clean sample -> openset sample mapping
                self.closed_noise = idx[
                    num_open_noise:num_total_noise]  # closed set noise indices
                # The attribute stays a list (it is serialised below); the set
                # is only for O(1) membership tests in the loop.
                closed_noise_lookup = set(self.closed_noise)
                # populate noise_labels
                for i in range(50000):
                    if i in closed_noise_lookup:
                        if noise_mode == 'sym':
                            if dataset == 'cifar10':
                                noiselabel = random.randint(0, 9)
                            elif dataset == 'cifar100':
                                noiselabel = random.randint(0, 99)
                            noise_labels.append(noiselabel)
                        elif noise_mode == 'asym':
                            noiselabel = self.transition[train_label[i]]
                            noise_labels.append(noiselabel)
                    else:
                        noise_labels.append(train_label[i])
                # populate openset noise images
                for cleanIdx, noisyIdx in self.open_noise:
                    if noisy_dataset == 'imagenet32':
                        train_data[cleanIdx] = np.asarray(
                            Image.open('{}/{}.png'.format(
                                noise_data_dir,
                                str(noisyIdx + 1).zfill(7)))).reshape(
                                    (32, 32, 3))
                    else:
                        train_data[cleanIdx] = noise_data[noisyIdx]
                # write noise to a file, to re-use
                noise = {
                    'noise_labels': noise_labels,
                    'open_noise': self.open_noise,
                    'closed_noise': self.closed_noise
                }
                print("save noise to %s ..." % noise_file)
                with open(noise_file, "w") as noise_fh:
                    json.dump(noise, noise_fh)

            if self.mode == 'all':
                self.train_data = train_data
                if targets is None:
                    self.noise_labels = noise_labels
                else:
                    self.noise_labels = targets
            else:
                if self.mode == "labeled":
                    pred_idx = pred.nonzero()[0]
                    self.probability = [probability[i] for i in pred_idx]

                    clean = (np.array(noise_labels) == np.array(train_label))
                    auc_meter = AUCMeter()
                    auc_meter.reset()
                    auc_meter.add(probability, clean)
                    # note: If all the labels are clean, the following will return NaN
                    auc, _, _ = auc_meter.value()

                elif self.mode == "unlabeled":
                    # NOTE(review): this selects the non-zero entries of
                    # `pred`, the same expression as 'labeled'. Confirm the
                    # caller passes an already-inverted mask for this mode.
                    pred_idx = pred.nonzero()[0]

                self.train_data = train_data[pred_idx]
                self.noise_labels = [noise_labels[i] for i in pred_idx]
                print("%s data has a size of %d" %
                      (self.mode, len(self.noise_labels)))
예제 #5
0
    def __init__(self, dataset, r, noise_mode, root_dir, transform, mode, noise_file='', pred=[], probability=[], log=''):
        """Load a CIFAR split and noisy labels drawn from a corruption matrix.

        Args:
            dataset: 'cifar10' or 'cifar100'; selects which pickled files
                under ``root_dir`` are read.
            r: noise ratio; passed to the corruption-matrix builder.
            noise_mode: 'sym' (matrix from ``uniform_mix_C``) or 'asym'
                (matrix from ``flip_labels_C``).
            root_dir: directory holding the pickled CIFAR batches.
            transform: stored on the instance as ``self.transform``.
            mode: 'test', 'all', 'labeled' or 'unlabeled'.
            noise_file: JSON file with the noisy labels; read when it exists,
                otherwise generated and written.
            pred: per-sample mask; non-zero entries form the 'labeled' subset
                and zero entries the 'unlabeled' one (presumably a 0/1 NumPy
                array -- confirm against the caller).
            probability: per-sample scores, indexed by sample position.
            log: writable object; only used in 'labeled' mode.
        """
        self.r = r  # noise ratio
        self.transform = transform
        self.mode = mode
        # class transition for asymmetric noise
        self.transition = {0: 0, 2: 0, 4: 4, 7: 7, 1: 1, 9: 1, 3: 5, 5: 3, 6: 6, 8: 8}

        if self.mode == 'test':
            if dataset == 'cifar10':
                test_dic = unpickle('%s/test_batch' % root_dir)
                self.test_data = test_dic['data']
                self.test_data = self.test_data.reshape((10000, 3, 32, 32))
                self.test_data = self.test_data.transpose((0, 2, 3, 1))
                self.test_label = test_dic['labels']
            elif dataset == 'cifar100':
                test_dic = unpickle('%s/test' % root_dir)
                self.test_data = test_dic['data']
                self.test_data = self.test_data.reshape((10000, 3, 32, 32))
                self.test_data = self.test_data.transpose((0, 2, 3, 1))
                self.test_label = test_dic['fine_labels']
        else:
            train_data = []
            train_label = []
            if dataset == 'cifar10':
                for n in range(1, 6):
                    dpath = '%s/data_batch_%d' % (root_dir, n)
                    data_dic = unpickle(dpath)
                    train_data.append(data_dic['data'])
                    train_label = train_label + data_dic['labels']
                train_data = np.concatenate(train_data)
            elif dataset == 'cifar100':
                train_dic = unpickle('%s/train' % root_dir)
                train_data = train_dic['data']
                train_label = train_dic['fine_labels']
            # (N, C, H, W) -> (N, H, W, C)
            train_data = train_data.reshape((50000, 3, 32, 32))
            train_data = train_data.transpose((0, 2, 3, 1))

            # Build (or reload) the noisy labels.
            if os.path.exists(noise_file):
                # Context manager so the handle is closed deterministically.
                with open(noise_file, "r") as noise_fh:
                    noise_label = json.load(noise_fh)
            else:    # inject noise
                # Every sample is re-drawn from its class row of C, so no
                # index subset is selected by `r` here; `r` only enters
                # through the corruption matrix. The shuffle result is unused
                # but the call is kept so the `random` module state advances
                # exactly as before.
                idx = list(range(50000))
                random.shuffle(idx)

                num_classes = 10 if dataset == 'cifar10' else 100

                # Any other noise_mode leaves C unbound and fails below.
                if noise_mode == 'sym':
                    C = uniform_mix_C(self.r, num_classes)
                elif noise_mode == 'asym':
                    C = flip_labels_C(self.r, num_classes)

                # int(...) keeps each label JSON-serialisable even if NumPy
                # hands back a NumPy integer scalar.
                noise_label = [
                    int(np.random.choice(num_classes, p=C[train_label[i]]))
                    for i in range(50000)
                ]
                print("save noisy labels to %s ..." % noise_file)
                with open(noise_file, "w") as noise_fh:
                    json.dump(noise_label, noise_fh)

            # Whole training set.
            if self.mode == 'all':
                self.train_data = train_data
                self.noise_label = noise_label
            else:
                if self.mode == "labeled":
                    pred_idx = pred.nonzero()[0]
                    self.probability = [probability[i] for i in pred_idx]

                    # AUC of `probability` against the true clean/noisy split.
                    clean = (np.array(noise_label) == np.array(train_label))
                    auc_meter = AUCMeter()
                    auc_meter.reset()
                    auc_meter.add(probability, clean)
                    auc, _, _ = auc_meter.value()
                    log.write('Numer of labeled samples:%d   AUC:%.3f\n' % (pred.sum(), auc))
                    log.flush()

                elif self.mode == "unlabeled":
                    pred_idx = (1 - pred).nonzero()[0]

                self.train_data = train_data[pred_idx]
                self.noise_label = [noise_label[i] for i in pred_idx]
                print("%s data has a size of %d" % (self.mode, len(self.noise_label)))
예제 #6
0
    def __init__(self, dataset, r, noise_mode, root_dir, transform, mode, noise_file='', pred=[], probability=[], log=''):
        """Load a CIFAR split with optional two-class relabelling and noisy labels.

        Args:
            dataset: 'cifar10' or 'cifar100'; selects which pickled files
                under ``root_dir`` are read.
            r: noise ratio; determines how many training labels are re-drawn.
            noise_mode: 'sym' (for cifar10 a random class different from the
                true one; for cifar100 any random class) or
                'asym_two_unbalanced_classes' (cifar10 labels are first
                collapsed to 1 / not-1, then flipped through
                ``self.transition``).
            root_dir: directory holding the pickled CIFAR batches.
            transform: stored on the instance as ``self.transform``.
            mode: 'test', 'all', 'labeled' or 'unlabeled'.
            noise_file: JSON file with the noisy labels; read when it exists,
                otherwise generated and written.
            pred: per-sample mask; non-zero entries form the 'labeled' subset
                and zero entries the 'unlabeled' one (presumably a 0/1 NumPy
                array -- confirm against the caller).
            probability: per-sample scores, indexed by sample position.
            log: writable object; only used in 'labeled' mode.
        """
        self.r = r  # noise ratio
        self.transform = transform
        self.mode = mode
        #self.transition = {0:0,2:0,4:7,7:7,1:1,9:1,3:5,5:3,6:6,8:8} # class transition for asymmetric noise
        #self.transition = {0: 1, 1: 2, 2: 3, 3: 4, 4: 5, 5: 6, 6: 7, 7: 8, 9: 9, 9: 0}  # ten classes
        self.transition = {0: 1, 1: 0}  # two classes

        if self.mode == 'test':
            if dataset == 'cifar10':
                test_dic = unpickle('%s/test_batch' % root_dir)
                self.test_data = test_dic['data']
                self.test_data = self.test_data.reshape((10000, 3, 32, 32))
                self.test_data = self.test_data.transpose((0, 2, 3, 1))
                self.test_label = test_dic['labels']
                if noise_mode == 'asym_two_unbalanced_classes':
                    # Collapse to a binary problem: class 1 vs. everything else.
                    for i in range(len(self.test_label)):
                        if self.test_label[i] != 1:
                            self.test_label[i] = 0
            elif dataset == 'cifar100':
                test_dic = unpickle('%s/test' % root_dir)
                self.test_data = test_dic['data']
                self.test_data = self.test_data.reshape((10000, 3, 32, 32))
                self.test_data = self.test_data.transpose((0, 2, 3, 1))
                self.test_label = test_dic['fine_labels']
        else:
            train_data = []
            train_label = []
            if dataset == 'cifar10':
                for n in range(1, 6):
                    dpath = '%s/data_batch_%d' % (root_dir, n)
                    data_dic = unpickle(dpath)
                    train_data.append(data_dic['data'])
                    train_label = train_label + data_dic['labels']
                train_data = np.concatenate(train_data)
                if noise_mode == 'asym_two_unbalanced_classes':
                    # Collapse to a binary problem: class 1 vs. everything else.
                    for i in range(len(train_label)):
                        if train_label[i] != 1:
                            train_label[i] = 0
            elif dataset == 'cifar100':
                train_dic = unpickle('%s/train' % root_dir)
                train_data = train_dic['data']
                train_label = train_dic['fine_labels']
            # (N, C, H, W) -> (N, H, W, C)
            train_data = train_data.reshape((50000, 3, 32, 32))
            train_data = train_data.transpose((0, 2, 3, 1))

            # Reuse the labels generated on the first initialisation whenever
            # the noise file already exists.
            if os.path.exists(noise_file):
                # Context manager so the handle is closed deterministically.
                with open(noise_file, "r") as noise_fh:
                    noise_label = json.load(noise_fh)
            else:    # inject noise
                noise_label = []
                idx = list(range(50000))
                random.shuffle(idx)
                #num_noise = int(self.r*50000)
                if noise_mode == 'sym':
                    # NOTE(review): the constant 9 looks tied to 10 classes --
                    # confirm before using this with cifar100.
                    num_noise = int((self.r / 9) / (1 - self.r + self.r / 9) * 50000)
                else:
                    num_noise = int(self.r * 50000)
                # A set gives O(1) membership tests in the loop below; the
                # list slice made every test an O(n) scan.
                noise_idx = set(idx[:num_noise])
                for i in range(50000):
                    if i in noise_idx:
                        if noise_mode == 'sym':
                            if dataset == 'cifar10':
                                # Re-draw until the label differs from the
                                # true one.
                                noiselabel = random.randint(0, 9)
                                while noiselabel == train_label[i]:
                                    noiselabel = random.randint(0, 9)
                            elif dataset == 'cifar100':
                                noiselabel = random.randint(0, 99)
                            noise_label.append(noiselabel)
                        elif noise_mode == 'asym_two_unbalanced_classes':
                            noiselabel = self.transition[train_label[i]]
                            noise_label.append(noiselabel)
                    else:
                        noise_label.append(train_label[i])
                print("save noisy labels to %s ..." % noise_file)
                with open(noise_file, "w") as noise_fh:
                    json.dump(noise_label, noise_fh)

            if self.mode == 'all':
                self.train_data = train_data
                self.noise_label = noise_label
            else:
                if self.mode == "labeled":
                    # nonzero() returns a tuple of index arrays; [0] takes the
                    # sample indices along the first axis.
                    pred_idx = pred.nonzero()[0]
                    self.probability = [probability[i] for i in pred_idx]

                    # AUC of `probability` against the true clean/noisy split.
                    clean = (np.array(noise_label) == np.array(train_label))
                    auc_meter = AUCMeter()
                    auc_meter.reset()
                    auc_meter.add(probability, clean)
                    auc, _, _ = auc_meter.value()
                    log.write('Numer of labeled samples:%d   AUC:%.3f\n' % (pred.sum(), auc))
                    log.flush()

                elif self.mode == "unlabeled":
                    pred_idx = (1 - pred).nonzero()[0]

                # On each initialisation these hold either the labeled or the
                # unlabeled subset, depending on the mode.
                self.train_data = train_data[pred_idx]
                self.noise_label = [noise_label[i] for i in pred_idx]
                print("%s data has a size of %d" % (self.mode, len(self.noise_label)))
예제 #7
0
class AllInOneMeter(object):
    """
    Combined meter: per-column AUC, loss history and per-class Jaccard.

    Owns ten ``AUCMeter`` objects named ``out{1,2}auc{1..5}`` (two indicator
    outputs times five columns), four lists of appended loss values, and
    running five-element intersection / union sums used for the Jaccard
    scores reported by ``value()``.
    """

    # Attribute names of the AUC meters, in creation and reporting order.
    _AUC_METER_NAMES = tuple('out%dauc%d' % (head, col)
                             for head in (1, 2) for col in range(1, 6))
    # Attribute names (and metric keys) of the loss histories.
    _LOSS_NAMES = ('loss1', 'loss2', 'loss3', 'loss')

    def __init__(self, device):
        """Create the meters and accumulators; tensors are put on ``device``."""
        self.device = device
        for meter_name in self._AUC_METER_NAMES:
            setattr(self, meter_name, AUCMeter())
        self._clear_accumulators()
        self.reset()

    def _clear_accumulators(self):
        """Empty the loss/jaccard lists and zero the intersection/union sums."""
        for list_name in self._LOSS_NAMES:
            setattr(self, list_name, [])
        self.jaccard = []
        self.intersection = torch.zeros([5],
                                        dtype=torch.float,
                                        device=self.device)
        self.union = torch.zeros([5], dtype=torch.float, device=self.device)

    @staticmethod
    def _per_class_sum(tensor):
        """Sum over dim -2, then dim -1, then dim 0 (in that order)."""
        return tensor.sum(dim=-2).sum(dim=-1).sum(dim=0)

    def reset(self):
        """Reset every AUC meter and clear all accumulated state."""
        for meter_name in self._AUC_METER_NAMES:
            getattr(self, meter_name).reset()
        self._clear_accumulators()

    def add(self, mask_prob, true_mask, mask_ind_prob1, mask_ind_prob2,
            true_mask_ind, loss1, loss2, loss3, loss):
        """
        Accumulate one batch.

        Column ``k`` of ``mask_ind_prob1`` / ``mask_ind_prob2`` is fed to
        meter ``out1auc{k+1}`` / ``out2auc{k+1}`` together with column ``k``
        of ``true_mask_ind``.  The four loss values are appended to their
        lists.  ``mask_prob`` is thresholded at 0.3 and compared against
        ``true_mask`` to update the intersection / union sums.
        """
        indicator_outputs = (mask_ind_prob1, mask_ind_prob2)
        for head, indicator_prob in enumerate(indicator_outputs, 1):
            for col in range(5):
                meter = getattr(self, 'out%dauc%d' % (head, col + 1))
                meter.add(indicator_prob[:, col].data,
                          true_mask_ind[:, col].data)

        new_losses = (loss1, loss2, loss3, loss)
        for list_name, loss_value in zip(self._LOSS_NAMES, new_losses):
            getattr(self, list_name).append(loss_value)

        # Binarise the predicted mask in the dtype of the reference mask.
        binarised = (mask_prob > 0.3).type(true_mask.dtype)
        self.intersection += self._per_class_sum(binarised * true_mask)
        # "union" holds |true| + |pred|; value() subtracts the intersection.
        self.union += (self._per_class_sum(true_mask) +
                       self._per_class_sum(binarised))

    def value(self):
        """
        Return a dict of all metrics.

        Keys, in order: the ten AUC meter names (first element of each
        meter's ``value()``), ``loss1``/``loss2``/``loss3``/``loss`` (mean of
        the appended values), ``jaccard`` (mean over the five classes) and
        ``jaccard1``..``jaccard5``.
        """
        per_class = self.intersection / (self.union - self.intersection)
        overall = per_class.mean()

        metrics = {}
        for meter_name in self._AUC_METER_NAMES:
            metrics[meter_name] = getattr(self, meter_name).value()[0]
        for list_name in self._LOSS_NAMES:
            metrics[list_name] = np.mean(getattr(self, list_name))
        metrics['jaccard'] = overall.item()
        for pos in range(5):
            metrics['jaccard%d' % (pos + 1)] = per_class[pos].item()
        return metrics
예제 #8
0
    def __init__(self,
                 dataset,
                 r,
                 noise_mode,
                 root_dir,
                 transform,
                 mode,
                 noise_file='',
                 clean_file='',
                 pred=[],
                 probability=[],
                 log=''):
        """Load a CIFAR split and, for training modes, inject label noise.

        Noise is feature-dependent: within each class, the images closest
        (Euclidean distance on raw pixels) to one fixed reference image of
        that class are relabelled as class ``(c + 1) % nb_classes``.  The
        last ``5000 // nb_classes`` images of every class are never
        relabelled.

        Args:
            dataset: ``'cifar10'`` or ``'cifar100'``.
            r: noise ratio; ``int(50000 / nb_classes * r)`` images per class
                are relabelled.
            noise_mode: stored on the instance; not otherwise used here.
            root_dir: directory containing ``data/cifar-10-batches-py`` or
                ``data/cifar-100-python``.
            transform: stored on the instance.
            mode: ``'test'``, ``'all'``, ``'small'`` (every 100th training
                sample), ``'labeled'`` or ``'unlabeled'``.
            noise_file: not used by this constructor.
            clean_file: not used by this constructor.
            pred: per-sample mask supporting ``.nonzero()`` and ``1 - pred``
                (presumably a NumPy 0/1 array -- confirm with callers).
                Non-zero entries form the labeled subset, zero entries the
                unlabeled subset.  Only read in those two modes.
            probability: per-sample scores indexable by sample position;
                only read in ``'labeled'`` mode.
            log: object with ``write`` and ``flush``; only used in
                ``'labeled'`` mode (the ``''`` default does not work there).

        Raises:
            ValueError: if ``dataset`` is unknown in a training mode, or if
                ``mode`` is not one of the supported values.
        """
        self.r = r  # noise ratio
        self.transform = transform
        self.noise_mode = noise_mode
        self.mode = mode
        # class transition for asymmetric noise
        self.transition = {
            0: 0,
            2: 0,
            4: 7,
            7: 7,
            1: 1,
            9: 1,
            3: 5,
            5: 3,
            6: 6,
            8: 8
        }

        if self.mode == 'test':
            if dataset == 'cifar10':
                test_dic = unpickle('%s/data/cifar-10-batches-py/test_batch' %
                                    root_dir)
                self.test_data = test_dic['data']
                # flat rows -> (N, 3, 32, 32) -> channels-last (N, 32, 32, 3)
                self.test_data = self.test_data.reshape((10000, 3, 32, 32))
                self.test_data = self.test_data.transpose((0, 2, 3, 1))
                self.test_label = test_dic['labels']
            elif dataset == 'cifar100':
                test_dic = unpickle('%s/data/cifar-100-python/test' % root_dir)
                self.test_data = test_dic['data']
                self.test_data = self.test_data.reshape((10000, 3, 32, 32))
                self.test_data = self.test_data.transpose((0, 2, 3, 1))
                self.test_label = test_dic['fine_labels']
        else:
            if dataset == 'cifar10':
                nb_classes = 10
                batches = []
                train_label = []
                for n in range(1, 6):
                    dpath = '%s/data/cifar-10-batches-py/data_batch_%d' % (
                        root_dir, n)
                    data_dic = unpickle(dpath)
                    batches.append(data_dic['data'])
                    train_label = train_label + data_dic['labels']
                train_data = np.concatenate(batches)
            elif dataset == 'cifar100':
                nb_classes = 100
                train_dic = unpickle('%s/data/cifar-100-python/train' %
                                     root_dir)
                train_data = train_dic['data']
                train_label = train_dic['fine_labels']
            else:
                raise ValueError("unsupported dataset: %r" % (dataset,))

            # flat rows -> (N, 3, 32, 32) -> channels-last (N, 32, 32, 3)
            train_data = train_data.reshape((50000, 3, 32, 32))
            train_data = train_data.transpose((0, 2, 3, 1))
            train_label = np.array(train_label)
            noise_label = train_label.copy()

            # Images per class that always keep their true label
            # (cifar10: 500, cifar100: 50).
            clean_per_class = int(5000 / nb_classes)
            noise_per_class = int(50000 / nb_classes * r)

            for cls in range(nb_classes):
                class_idx = np.where(train_label == cls)[0]
                # Everything except the trailing clean_per_class samples is
                # a candidate for relabelling.
                candidates = class_idx[:-clean_per_class]
                # Fixed (not random) reference image: the 11th candidate.
                # Cast to float before subtracting so that unsigned integer
                # pixel data cannot wrap around and corrupt the distances.
                center = train_data[candidates[10]].astype(np.float64)
                distance = np.array([
                    np.linalg.norm(train_data[k].astype(np.float64) - center)
                    for k in candidates
                ])
                # Relabel the noise_per_class candidates nearest the center.
                nearest = candidates[distance.argsort()[:noise_per_class]]
                noise_label[nearest] = (cls + 1) % nb_classes

            if self.mode == 'all':
                self.train_data = train_data
                self.noise_label = noise_label
            elif self.mode == 'small':
                self.train_data = train_data[::100]
                self.noise_label = noise_label[::100]
            else:
                if self.mode == "labeled":
                    pred_idx = pred.nonzero()[0]
                    self.probability = [probability[i] for i in pred_idx]

                    # AUC of `probability` at telling clean labels from
                    # noisy ones, logged for monitoring.
                    clean = (noise_label == train_label)
                    auc_meter = AUCMeter()
                    auc_meter.reset()
                    auc_meter.add(probability, clean)
                    auc, _, _ = auc_meter.value()
                    log.write('Numer of labeled samples:%d   AUC:%.3f\n' %
                              (pred.sum(), auc))
                    log.flush()

                elif self.mode == "unlabeled":
                    pred_idx = (1 - pred).nonzero()[0]

                else:
                    # Previously fell through to an unbound `pred_idx`.
                    raise ValueError("unsupported mode: %r" % (self.mode,))

                self.train_data = train_data[pred_idx]
                self.noise_label = noise_label[pred_idx]
                print("%s data has a size of %d" %
                      (self.mode, len(self.noise_label)))