def act(self, observation: Any, reward: Any, done: bool,
        info: Optional[Dict[Any, Any]] = None) -> Any:
    obs = torch.from_numpy(observation.astype(np.float32))
    forward = self.module.forward(obs)  # type: ignore
    probas = F.softmax(forward, dim=0)
    if self.deterministic:
        # Greedy action: index of the highest-probability entry
        return probas.max(0)[1].view(1, 1).item()
    else:
        # Stochastic action: draw one index with probability proportional to probas
        return next(iter(WeightedRandomSampler(probas, 1)))
def build_balanced_sampler(labels, dataset_size=None):
    if dataset_size is None:
        dataset_size = len(labels)
    # Weight each example by the inverse frequency of its class so that every
    # class is drawn with roughly equal probability.
    class_counts = Counter(labels)
    weights_per_example = [1 / class_counts[c] for c in labels]
    return WeightedRandomSampler(weights_per_example, dataset_size, replacement=True)
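def _demo_build_balanced_sampler():
    # Usage sketch (added for illustration; not part of the original sources).
    # Shows how a sampler built as above plugs into a DataLoader on imbalanced toy
    # data. Note that DataLoader's `sampler` and `shuffle=True` are mutually
    # exclusive, so shuffle is left unset when a sampler is supplied.
    import torch
    from torch.utils.data import DataLoader, TensorDataset

    features = torch.randn(100, 4)
    labels = torch.tensor([0] * 90 + [1] * 10)  # 90/10 class imbalance

    sampler = build_balanced_sampler(labels.tolist())
    loader = DataLoader(TensorDataset(features, labels), batch_size=20, sampler=sampler)

    for _, yb in loader:
        # Batches are now roughly class-balanced (sampling with replacement).
        print(yb.bincount(minlength=2))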
def set_weights(self):
    """Sets the weights from the weighted dataset."""
    # Make train/val weights
    self.train_weights = self.data_weighter.weighting_function(self.prop_train)
    self.val_weights = self.data_weighter.weighting_function(self.prop_val)

    # Create samplers
    self.train_sampler = WeightedRandomSampler(
        self.train_weights, num_samples=len(self.train_weights), replacement=True)
    self.val_sampler = WeightedRandomSampler(
        self.val_weights, num_samples=len(self.val_weights), replacement=True)
def get_dataloaders(data_dir,
                    patch_size: int,
                    box_coder,
                    train_batch_size=1,
                    valid_batch_size=1,
                    workers=4,
                    fold=0,
                    fast=False):
    train_ids, valid_ids = get_train_test_split_for_fold(fold, ships_only=True)

    if fast:
        train_ids = train_ids[:train_batch_size * 64]
        valid_ids = valid_ids[:valid_batch_size * 64]

    groundtruth = pd.read_csv(os.path.join(data_dir, 'train_ship_segmentations_v2.csv'))

    trainset = D.RSSDDataset(sample_ids=train_ids,
                             data_dir=data_dir,
                             transform=get_transform(training=True, width=patch_size, height=patch_size),
                             groundtruth=groundtruth,
                             box_coder=box_coder)

    validset = D.RSSDDataset(sample_ids=valid_ids,
                             data_dir=data_dir,
                             transform=get_transform(training=False, width=patch_size, height=patch_size),
                             groundtruth=groundtruth,
                             box_coder=box_coder)

    shuffle = True
    sampler = None
    if fast:
        shuffle = False
        sampler = WeightedRandomSampler(np.ones(len(trainset)), 1024)

    trainloader = DataLoader(trainset,
                             batch_size=train_batch_size,
                             num_workers=workers,
                             pin_memory=True,
                             drop_last=True,
                             shuffle=shuffle,
                             sampler=sampler)

    validloader = DataLoader(validset,
                             batch_size=valid_batch_size,
                             num_workers=workers,
                             pin_memory=True,
                             drop_last=False,
                             shuffle=False)

    print('Train set', len(trainset), len(trainloader),
          'Valid set', len(validset), len(validloader))
    return trainloader, validloader
def load_data():
    print("initializing dataloader")
    transforms_video = [
        CenterCrop(opt.image_size),
        RGB2Lab(),
        ToTensor(),
        Normalize(),
    ]
    transforms_imagenet = [
        CenterPad_threshold(opt.image_size),
        RGB2Lab(),
        ToTensor(),
        Normalize(),
    ]
    extra_reference_transform = [
        transform_lib.RandomHorizontalFlip(0.5),
        transform_lib.RandomResizedCrop(480, (0.98, 1.0), ratio=(0.8, 1.2)),
    ]
    train_dataset_video = VideosDataset(
        data_root=opt.data_root,
        epoch=opt.epoch,
        image_size=opt.image_size,
        image_transform=transforms.Compose(transforms_video),
        real_reference_probability=opt.real_reference_probability,
        nonzero_placeholder_probability=opt.nonzero_placeholder_probability,
    )
    train_dataset_imagenet = VideosDataset_ImageNet(
        data_root=opt.data_root_imagenet,
        image_size=opt.image_size,
        epoch=opt.epoch,
        with_bad=opt.with_bad,
        with_mid=opt.with_mid,
        transforms_imagenet=transforms_imagenet,
        distortion_level=4,
        brightnessjitter=5,
        nonzero_placeholder_probability=opt.nonzero_placeholder_probability,
        extra_reference_transform=extra_reference_transform,
        real_reference_probability=opt.real_reference_probability,
    )

    video_training_length = len(train_dataset_video)
    imagenet_training_length = len(train_dataset_imagenet)
    dataset_training_length = train_dataset_video.real_len + train_dataset_imagenet.real_len
    dataset_combined = ConcatDataset([train_dataset_video, train_dataset_imagenet])

    sampler = WeightedRandomSampler(
        [1] * video_training_length + [1] * imagenet_training_length,
        dataset_training_length * opt.epoch)

    data_loader = DataLoader(
        dataset_combined,
        batch_size=opt.batch_size,
        shuffle=False,
        num_workers=opt.workers,
        pin_memory=True,
        drop_last=True,
        sampler=sampler,
    )
    return dataset_training_length, train_dataset_video, train_dataset_imagenet, data_loader
def get_class_balanced_sampler(dataset):
    assert isinstance(dataset, ClassDataset), 'dataset must be an instance of ClassDataset'
    indices = list(range(len(dataset)))
    num_samples = len(dataset)
    # Each example is weighted by the inverse count of its label
    # (the label tensor is the fourth element of every dataset item).
    weights = [1.0 / dataset.per_label_records_num[dataset[index][3].item()]
               for index in indices]
    weights = torch.tensor(weights)
    return WeightedRandomSampler(weights, num_samples)
def my_sampler(target):
    # Number of examples per class (assumes classes are labelled 0..K-1)
    class_sample_count = np.array(
        [len(np.where(target == t)[0]) for t in np.unique(target)])
    weight = 1. / class_sample_count
    samples_weight = np.array([weight[t] for t in target])
    samples_weight = torch.from_numpy(samples_weight)
    samples_weight = samples_weight.double()
    sampler = WeightedRandomSampler(samples_weight, len(samples_weight))
    return sampler
def create_weighted_sampler(labels):
    labels_unique, counts = np.unique(labels, return_counts=True)
    class_weights = [sum(counts) / c for c in counts]
    # class_weights[1] = class_weights[1] / 2
    example_weights = [class_weights[int(e)] for e in labels]
    # print("Example Weights:")
    # print(example_weights)
    sampler = WeightedRandomSampler(example_weights, len(labels))
    return sampler
def prepare_random_sampler(classes_list):
    class_sample_count = np.array(
        [len(np.where(classes_list == t)[0]) for t in np.unique(classes_list)])
    weight = 1. / class_sample_count
    samples_weight = np.array([weight[t] for t in classes_list])
    samples_weight = torch.from_numpy(samples_weight)
    samples_weight = samples_weight.double()
    sampler = WeightedRandomSampler(samples_weight, len(samples_weight))
    return sampler
def get_iwildcam_loader(cfg, mode='train'):
    print("Mode: {}".format(mode))
    if mode == 'train' or mode == 'train_val' or mode == 'train_dev':
        train_data = iWildCam(cfg, mode=mode)  # iterator over the training samples
        if cfg.TRAIN.WEIGHT_SAMPLER:
            train_sampler = WeightedRandomSampler(train_data.samples_weight, len(train_data))
            train_loader = torch.utils.data.DataLoader(
                train_data, batch_size=cfg.TRAIN.BATCH_SIZE,
                num_workers=cfg.TRAIN.NUM_WORKER, drop_last=True,
                pin_memory=True, sampler=train_sampler)
        else:
            train_loader = torch.utils.data.DataLoader(
                train_data, batch_size=cfg.TRAIN.BATCH_SIZE, shuffle=True,
                num_workers=cfg.TRAIN.NUM_WORKER, drop_last=True, pin_memory=True)

        dev_data = iWildCam(cfg, mode='dev')  # evaluation data
        dev_loader = torch.utils.data.DataLoader(
            dev_data, batch_size=cfg.TRAIN.EVAL_BATCH_SIZE, shuffle=False,
            num_workers=cfg.TRAIN.NUM_WORKER, drop_last=False, pin_memory=True)
        return train_loader, dev_loader
    elif mode in ['infer', 'infer_by_seq', 'infer_by_seqv2']:
        test_data = iWildCam(cfg, mode=mode)
        test_loader = torch.utils.data.DataLoader(
            test_data, batch_size=cfg.TRAIN.BATCH_SIZE, shuffle=False,
            num_workers=cfg.TRAIN.NUM_WORKER, drop_last=False, pin_memory=True)
        return test_loader
    elif mode == 'val':  # only used to evaluate model performance
        val_data = iWildCam(cfg, mode='dev')  # evaluation data
        val_loader = torch.utils.data.DataLoader(
            val_data, batch_size=cfg.TRAIN.EVAL_BATCH_SIZE, shuffle=False,
            num_workers=cfg.TRAIN.NUM_WORKER, drop_last=False, pin_memory=True)
        return val_loader
    else:
        return None
def create_dataloaders(X_train, y_train, X_valid, y_valid, weights,
                       batch_size, num_workers, drop_last):
    """
    Create dataloaders with or without subsampling depending on weights and balanced.

    Parameters
    ----------
    X_train : np.ndarray
        Training data
    y_train : np.array
        Mapped training targets
    X_valid : np.ndarray
        Validation data
    y_valid : np.array
        Mapped validation targets
    weights : dict or int
        Weight for each mapped target class;
        0 for no sampling, 1 for balanced sampling

    Returns
    -------
    train_dataloader, valid_dataloader : torch.DataLoader, torch.DataLoader
        Training and validation dataloaders
    """
    if weights == 0:
        train_dataloader = DataLoader(TorchDataset(X_train, y_train),
                                      batch_size=batch_size, shuffle=True,
                                      num_workers=num_workers, drop_last=drop_last)
    else:
        if weights == 1:
            # Balanced sampling: weight each example by the inverse of its class count
            class_sample_count = np.array(
                [len(np.where(y_train == t)[0]) for t in np.unique(y_train)])
            weights = 1. / class_sample_count
            samples_weight = np.array([weights[t] for t in y_train])
            samples_weight = torch.from_numpy(samples_weight)
            samples_weight = samples_weight.double()
        else:
            # Custom per-class weights
            samples_weight = np.array([weights[t] for t in y_train])
        sampler = WeightedRandomSampler(samples_weight, len(samples_weight))
        train_dataloader = DataLoader(TorchDataset(X_train, y_train),
                                      batch_size=batch_size, sampler=sampler,
                                      num_workers=num_workers, drop_last=drop_last)

    valid_dataloader = DataLoader(TorchDataset(X_valid, y_valid),
                                  batch_size=batch_size, shuffle=False,
                                  num_workers=num_workers)

    return train_dataloader, valid_dataloader
def train(self, net, samples, optimizer, e):
    alpha = 2 * max(0, ((100 - e) / 100))
    criterion = losses.ELULovaszFocalWithLogitsLoss(alpha, 2 - alpha)

    transforms = generator.TransformationsGenerator([
        random.RandomFlipLr(),
        random.RandomAffine(
            image_size=101,
            translation=lambda rs: (rs.randint(-20, 20), rs.randint(-20, 20)),
            scale=lambda rs: (rs.uniform(0.85, 1.15), 1),
            **utils.transformations_options),
        transformations.Padding(((13, 14), (13, 14), (0, 0)))
    ])

    pseudo_dataset = datasets.SemiSupervisedImageDataset(
        samples_test,
        settings.test,
        transforms,
        size=len(samples_test),
        test_predictions=self.test_predictions,
        momentum=0.0)

    dataset = datasets.ImageDataset(samples, settings.train, transforms)
    weights = [len(pseudo_dataset) / len(dataset) * 2] * len(dataset) + [1] * len(pseudo_dataset)
    dataloader = DataLoader(
        ConcatDataset([dataset, pseudo_dataset]),
        num_workers=10,
        batch_size=16,
        sampler=WeightedRandomSampler(weights=weights, num_samples=3200))

    average_meter_train = meters.AverageMeter()

    with tqdm(total=len(dataloader), leave=False) as pbar, torch.enable_grad():
        net.train()

        for images, masks_targets in dataloader:
            masks_targets = masks_targets.to(gpu)
            masks_predictions = net(images)

            loss = criterion(masks_predictions, masks_targets)
            loss.backward()
            optimizer.step()
            optimizer.zero_grad()

            average_meter_train.add('loss', loss.item())
            self.update_pbar(
                torch.sigmoid(masks_predictions),
                masks_targets,
                pbar,
                average_meter_train,
                'Training epoch {}'.format(e))

    train_stats = {'train_' + k: v for k, v in average_meter_train.get_all().items()}
    return train_stats
def __init__(self):
    ## The top config
    # self.data_root = '/media/hhy/data/USdata/MergePhase1/test_0.3'
    # self.log_dir = '/media/hhy/data/code_results/MILs/MIL_H_Attention'
    self.root = '/remote-home/my/Ultrasound_CV/data/Ruijin/clean'
    self.log_dir = '/remote-home/my/hhy/Ultrasound_MIL/experiments/weighted_sampler/'
    if not os.path.exists(self.log_dir):
        os.makedirs(self.log_dir)

    ## training config
    self.lr = 1e-4
    self.epoch = 50
    self.resume = -1
    self.batch_size = 1
    self.net = Attention()
    self.net.cuda()
    self.optimizer = Adam(self.net.parameters(), lr=self.lr)
    self.lrsch = torch.optim.lr_scheduler.MultiStepLR(
        self.optimizer, milestones=[10, 30, 50, 70], gamma=0.5)
    self.logger = Logger(self.log_dir)

    self.train_transform = transforms.Compose([
        transforms.Resize((224, 224)),
        transforms.RandomResizedCrop((224, 224)),
        transforms.RandomHorizontalFlip(0.5),
        transforms.RandomVerticalFlip(0.5),
        transforms.ColorJitter(0.25, 0.25, 0.25, 0.25),
        transforms.ToTensor()
    ])
    self.test_transform = transforms.Compose([
        transforms.Resize((224, 224)),
        transforms.ToTensor()
    ])

    self.trainbag = RuijinBags(self.root, [0, 1, 2, 3], self.train_transform)
    self.testbag = RuijinBags(self.root, [4], self.test_transform)

    train_label_list = list(map(lambda x: int(x['label']), self.trainbag.patient_info))
    pos_ratio = sum(train_label_list) / len(train_label_list)
    print(pos_ratio)
    # Weight each bag by the prevalence of the opposite class to balance positives and negatives
    train_weight = [(1 - pos_ratio) if x > 0 else pos_ratio for x in train_label_list]

    self.train_sampler = WeightedRandomSampler(weights=train_weight, num_samples=len(self.trainbag))
    self.train_loader = DataLoader(self.trainbag, batch_size=self.batch_size,
                                   num_workers=8, sampler=self.train_sampler)
    self.val_loader = DataLoader(self.testbag, batch_size=self.batch_size,
                                 shuffle=False, num_workers=8)

    if self.resume > 0:
        self.net, self.optimizer, self.lrsch, self.loss, self.global_step = self.logger.load(
            self.net, self.optimizer, self.lrsch, self.loss, self.resume)
    else:
        self.global_step = 0

    # self.trainer = MTTrainer(self.net, self.optimizer, self.lrsch, self.loss, self.train_loader, self.val_loader, self.logger, self.global_step, mode=2)
    self.trainer = MILTrainer_batch1(self.net, self.optimizer, self.lrsch, None,
                                     self.train_loader, self.val_loader, self.logger,
                                     self.global_step)
def prepare_data():
    # 2.1 get files and split for K-fold dataset
    # 2.1.1 read files
    train_ = get_files(config.train_data, "train")
    # val_data_list = get_files(config.val_data, "val")
    test_files = get_files(config.test_data, "test")

    """
    #2.1.2 split
    split_fold = StratifiedKFold(n_splits=3)
    folds_indexes = split_fold.split(X=origin_files["filename"], y=origin_files["label"])
    folds_indexes = np.array(list(folds_indexes))
    fold_index = folds_indexes[fold]

    #2.1.3 using fold index to split for train data and val data
    train_data_list = pd.concat([origin_files["filename"][fold_index[0]], origin_files["label"][fold_index[0]]], axis=1)
    val_data_list = pd.concat([origin_files["filename"][fold_index[1]], origin_files["label"][fold_index[1]]], axis=1)
    """

    train_data_list, val_data_list = train_test_split(
        train_, test_size=0.15, stratify=train_["label"])

    # 2.1.4 load dataset
    # 2.1.4.1 sampling
    train_dataset = AgriDataset(train_data_list)
    val_dataset = AgriDataset(val_data_list, train=False)
    test_dataset = AgriDataset(test_files, test=True)

    if config.ifWeightedRandomShuffle is True:
        # Weight each example by the inverse of its class size (scaled by the
        # smallest class) so classes are drawn with roughly equal probability.
        distribution = train_data_list.groupby(by=['label']).size()
        balance = min(distribution)
        class_sizes = {}
        for i in range(len(distribution)):
            class_sizes[i] = distribution[i]
        weights = []
        for i in range(len(train_dataset)):
            weights.append(balance / class_sizes[train_dataset[i][1]])
        sampler = WeightedRandomSampler(weights,
                                        num_samples=len(weights),
                                        replacement=True)
        train_dataloader = DataLoader(train_dataset,
                                      batch_size=config.batch_size,
                                      collate_fn=collate_fn,
                                      pin_memory=True,
                                      sampler=sampler)
    else:
        train_dataloader = DataLoader(train_dataset,
                                      batch_size=config.batch_size,
                                      shuffle=True,
                                      collate_fn=collate_fn,
                                      pin_memory=True)

    val_dataloader = DataLoader(val_dataset,
                                batch_size=config.batch_size,
                                shuffle=True,
                                collate_fn=collate_fn,
                                pin_memory=False)
    test_dataloader = DataLoader(test_dataset,
                                 batch_size=1,
                                 shuffle=False,
                                 pin_memory=False)
    return train_dataloader, val_dataloader, test_dataloader
def sampler(self, mode: str, dataset: Dataset) -> Sampler:
    if "train" in mode:
        if self.config.DEBUG:
            # Draw a fixed-size random subset for quick debug runs
            return WeightedRandomSampler(
                weights=np.ones(len(dataset)),
                num_samples=self.config.DEBUG_TRAIN_SIZE)
        else:
            return RandomSampler(dataset, replacement=False)
    else:
        return SequentialSampler(dataset)
def create_data_loader(dataset, counters, parameters, init_sampler):
    labels = [label for _, label in dataset.the_list]
    class_weights = [len(dataset) / counters[label] for label in label_names().values()]
    weights = [class_weights[labels[i]] for i in range(len(dataset))]

    if init_sampler:
        sampler = WeightedRandomSampler(weights=weights, num_samples=len(dataset))
        data_loader = DataLoader(dataset=dataset,
                                 batch_size=parameters['batch_size'],
                                 sampler=sampler)
    else:
        data_loader = DataLoader(dataset=dataset,
                                 batch_size=parameters['batch_size'])
    return data_loader
def resampling_balance(data):
    targets = data.target
    class_count = np.unique(targets, return_counts=True)[1]
    print("Class counts before resampling: ", class_count)
    weight = 1. / class_count
    samples_weight = weight[targets]
    samples_weight = torch.from_numpy(samples_weight)
    sampler = WeightedRandomSampler(samples_weight, len(samples_weight))
    return sampler
def forward(self, bsz, seq_len, cuda=True):
    # returns bsz*seq_len*nsamples samples in shape nsamples x (bsz x seq_len)
    # sample based on frequencies
    wrs = WeightedRandomSampler(self.frequencies, self.nsamples * bsz * seq_len)
    samples = torch.LongTensor(list(wrs)).cuda() if cuda else torch.LongTensor(list(wrs))
    return samples.view(-1, bsz)
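def _demo_frequency_sampling():
    # Illustration (added; not from the original sources): WeightedRandomSampler
    # yields indices 0..len(weights)-1 with probability proportional to the weights,
    # which is why it can double as a fast categorical sampler over a vocabulary,
    # as in the negative-sampling forward() above. The frequency vector is a toy example.
    import torch
    from collections import Counter
    from torch.utils.data import WeightedRandomSampler

    frequencies = torch.tensor([0.7, 0.2, 0.1])  # unnormalised is fine; only ratios matter
    draws = list(WeightedRandomSampler(frequencies, 10000, replacement=True))
    print(Counter(draws))  # counts should come out roughly 7000 / 2000 / 1000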
def __init__(self, danspeech_multi_dataset, num_replicas=None, rank=None):
    super(DistributedWeightedSamplerCustom, self).__init__(
        danspeech_multi_dataset, num_replicas=num_replicas, rank=rank, shuffle=False)
    self.sampler = WeightedRandomSampler(
        danspeech_multi_dataset.final_weights, len(danspeech_multi_dataset))
    self.epoch = 0
def get(self):
    """Obtains an instance of the sampler."""
    gen = torch.Generator()
    gen.manual_seed(self.random_seed)

    # Samples without replacement using the sample weights
    return WeightedRandomSampler(weights=self.sample_weights,
                                 num_samples=self.partition_size,
                                 replacement=False,
                                 generator=gen)
def _create_sampler(self, target_np):
    self.labels = np.unique(target_np)
    class_sample_count = np.array(
        [len(np.where(target_np == t)[0]) for t in self.labels])
    weight = 1. / class_sample_count
    samples_weight = torch.from_numpy(
        np.array([weight[t] for t in target_np])).double()
    return WeightedRandomSampler(samples_weight, len(samples_weight), replacement=True)
def _construct_dataloaders(self):
    dataloaders = []
    for dataset, weights in zip(self._datasets, self._weights):
        sampler = WeightedRandomSampler(weights, len(weights))
        dataloaders.append(
            DataLoader(dataset=dataset,
                       sampler=sampler,
                       num_workers=self._num_workers,
                       batch_size=self._batch_size))
    return [iter(loader) for loader in dataloaders]
def _get_balanced_dev_dataloader(self, dataset, drop_last=False):
    return DataLoader(
        dataset,
        sampler=WeightedRandomSampler(dataset.sample_weights, len(dataset.sample_weights)),
        batch_size=self.datarc["batch_size"],
        drop_last=drop_last,
        num_workers=self.datarc["num_workers"],
        collate_fn=dataset.collate_fn,
    )
def balance_sources_sampler(dataset, strength):
    srcs = dataset.sources
    unique_sources = set(srcs.values())
    sources_count_dict = {source: list(srcs.values()).count(source) for source in unique_sources}
    # 1/strength acts as a smoothing term: a large strength gives near
    # inverse-frequency balancing across sources, a small strength keeps sampling
    # close to the natural source distribution.
    weights = [1 / (sources_count_dict[srcs[image_id]] + 1.0 / strength)
               for image_id in srcs.keys()]
    sampler = WeightedRandomSampler(weights, len(dataset))
    return sampler
def _get_sampler(df: pd.DataFrame, alpha: float = 0.5) -> Sampler:
    y = np.array(
        [HumanProteinDataset.parse_target(target) for target in df.Target])

    # Per-class weight: log of (alpha * total positives / per-class positives), clipped below at 1.0
    class_weights = np.round(np.log(alpha * y.sum() / y.sum(axis=0)), 2)
    class_weights[class_weights < 1.0] = 1.0

    # Each (multi-label) example is weighted by its rarest positive class
    weights = np.zeros(len(df))
    for i, target in enumerate(y):
        weights[i] = class_weights[target == 1].max()

    return WeightedRandomSampler(weights, len(df))
def k_fold(logger, k, root, val_root, epoch, args, critertion, optimizer, k_model,
           device, loss_meter, confusion_matrix, train_acc, loss_list, val_acc):
    trained_time = 0
    best_accuracy = 0
    best_model = None
    avg_accuracy = 0
    avg_loss = 0
    avg_train_acc = 0
    end = time.time()

    for i in range(k):
        get_k_fold_data(k, i, 'dataset/all_shuffle_datas.txt')

        train_transform = train_augment(cfg.IMAGE_SIZE)
        train_data = Eye(img_root=root, tag_root='dataset/train_k.txt', transform=train_transform)
        data_len = len(train_data)
        # Hand-tuned per-class sampling weights for the 7 classes
        weight_prob = [data_len / w for w in [1, 6, 1, 1, 0.4, 0.8, 2.5]]
        weight_list = [weight_prob[label] for data, label in train_data]
        train_sampler = WeightedRandomSampler(weights=weight_list, num_samples=7 * 2000, replacement=True)
        train_dataloader = DataLoader(train_data, batch_size=cfg.BATCHSIZE,
                                      shuffle=(train_sampler is None), drop_last=True,
                                      sampler=train_sampler, num_workers=8)

        val_transform = val_augment(cfg.IMAGE_SIZE)
        val_data = Eye(img_root=root, tag_root='dataset/val_k.txt', transform=val_transform)
        val_dataloader = DataLoader(val_data, batch_size=cfg.BATCHSIZE, shuffle=False,
                                    drop_last=True, num_workers=8)

        k_model[i], train_loss, train_accuracy = train(train_dataloader, critertion, optimizer,
                                                       k_model[i], device, loss_meter, confusion_matrix)
        val_cm, val_accuracy = val(k_model[i], val_dataloader, device)

        avg_accuracy += val_accuracy
        avg_train_acc += train_accuracy
        avg_loss += train_loss
        trained_time = time.time() - end
        end = time.time()

        log_str = [
            "Epoch:{:02d}, Fold:{:02d}, Lr:{:.8f}, Cost:{:.2f}s".format(
                epoch, i, optimizer.param_groups[0]['lr'], trained_time),
            "Loss:{:.2f}".format(train_loss),
            "train_acc:{:.2f}".format(train_accuracy),
            "val_acc:{:.2f}".format(val_accuracy)
        ]
        logger.info(log_str)

        if val_accuracy > best_accuracy:
            best_accuracy = val_accuracy
            best_model = k_model[i]

    avg_accuracy = avg_accuracy / k
    avg_train_acc = avg_train_acc / k
    avg_loss = avg_loss / k
    val_acc.append(avg_accuracy)
    train_acc.append(avg_train_acc)
    loss_list.append(avg_loss)

    log_str = ("Epoch:{:2d}".format(epoch) + "--"
               + "avg_loss:{:2f}".format(avg_loss) + "--"
               + "avg_train_accuracy:{:2f}".format(avg_train_acc) + "--"
               + "avg_val_accuracy:{:2f}".format(avg_accuracy))
    logger.info(log_str)

    t.save(best_model.state_dict(),
           os.path.join(cfg.OUTPUT_MODEL_DIR, args.model + 'test{:2d}.pth'.format(epoch)))
    return k_model
def get_class_balanced_sampler(dataset):
    if not hasattr(dataset, 'per_label_records_num'):
        return RandomSampler(dataset)

    indices = list(range(len(dataset)))
    num_samples = len(dataset)
    weights = [
        1.0 / dataset.per_label_records_num[dataset[index][3].item()]
        for index in indices
    ]
    weights = torch.tensor(weights)
    return WeightedRandomSampler(weights, num_samples)
def train_dataloader(self) -> DataLoader:
    sourceSet = self.dataSets['source']
    targetSet = self.dataSets['targetTrain']
    STSet = ConcatDataset([sourceSet, targetSet])

    # Weight each example by the inverse size of its dataset so that source and
    # target examples are drawn with equal overall probability, regardless of size.
    source_weights = [1.0 / len(sourceSet) for _ in range(len(sourceSet))]
    target_weights = [1.0 / len(targetSet) for _ in range(len(targetSet))]
    weights = [*source_weights, *target_weights]

    sampler = WeightedRandomSampler(weights=weights, num_samples=len(STSet), replacement=True)
    return DataLoader(STSet, sampler=sampler, batch_size=self.batch_size,
                      num_workers=self.num_workers)
def class_imbalance_sampler(targets, segmentation_threshold):
    if len(targets.shape) > 1:
        # If posed as a segmentation task, reduce each mask to a binary label:
        # positive when the fraction of foreground pixels exceeds the threshold.
        targets = targets.sum(axis=1) / targets.shape[1]
        targets = targets > segmentation_threshold
    targets = tensor(targets).long().squeeze()
    class_count = torch.bincount(targets)
    weighting = tensor(1.) / class_count.float()
    weights = weighting[targets]
    sampler = WeightedRandomSampler(weights, len(targets))
    return sampler
def create_sampler(self):
    # Inverse-frequency weight for each class
    class_weights = self.train_data.train['class'].value_counts().to_dict()
    for k, v in class_weights.items():
        class_weights[k] = 1. / torch.tensor(v, dtype=torch.float)

    # Per-example weight looked up from the example's class
    sample_weights = [0] * len(self.train_data.train)
    for idx, label in enumerate(self.train_data.train['class']):
        sample_weights[idx] = class_weights[label]

    return WeightedRandomSampler(sample_weights,
                                 num_samples=len(sample_weights),
                                 replacement=True)