Example #1
def generate_sampler(dataset, sampler_option='random'):
    """
    Returns sampler according to the wanted options

    :param dataset: (MRIDataset) the dataset to sample from
    :param sampler_option: (str) choice of sampler
    :return: (Sampler)
    """
    df = dataset.df
    # To be changed for non-binary classification
    count = np.zeros(2)

    for idx in df.index:
        label = df.loc[idx, "diagnosis"]
        key = dataset.diagnosis_code[label]
        count[key] += 1

    weight_per_class = 1 / np.array(count)
    weights = []

    for idx, label in enumerate(df["diagnosis"].values):
        key = dataset.diagnosis_code[label]
        weights += [weight_per_class[key]] * dataset.elem_per_image

    if sampler_option == 'random':
        return sampler.RandomSampler(weights)
    elif sampler_option == 'weighted':
        return sampler.WeightedRandomSampler(weights, len(weights))
    else:
        raise NotImplementedError(
            f"The option {sampler_option} for sampler is not implemented")
Example #2
 def reset(self):
     """
     Two cases:
     1. not hasattr(self, 'split_loader'): Resume from previous training. Create the dataset given the saved split_ix and iterator
     2. wrapped: a new epoch, the split_ix and iterator have been updated in the get_minibatch_inds already.
     """
     # batch_size is 0, the merge is done in DataLoader class
     #print('cpu count: %d'%(multiprocessing.cpu_count()))
     if self.split == 'train':
         split_sample_weights = self.dataloader.sample_weights[np.array(
             self.dataloader.split_ix[self.split])]
         mysample_ = sampler.WeightedRandomSampler(
             split_sample_weights,
             len(self.dataloader.split_ix[self.split]))
         mysample = np.array(self.dataloader.split_ix[self.split])[np.array(
             list(mysample_))]
         self.split_ix = mysample.tolist()
     else:
         self.split_ix = self.dataloader.split_ix[self.split]
     #np.save('data/tmp/%s_ix.npy'%(self.split), np.array(self.split_ix))
     self.split_loader = iter(
         data.DataLoader(
             dataset=self.dataloader,
             batch_size=self.dataloader.batch_size,
             sampler=self.split_ix[self.dataloader.iterators[self.split]:],
             shuffle=False,
             pin_memory=False,
             num_workers=8,
             collate_fn=lambda x: x))
Example #3
    def generate_sampler(dataset, sampler_option="random", n_bins=5):
        df = dataset.df

        count = np.zeros(n_bins)
        values = df[dataset.label].values.astype(float)

        thresholds = [
            min(values) + i * (max(values) - min(values)) / n_bins
            for i in range(n_bins)
        ]
        for idx in df.index:
            label = df.loc[idx, dataset.label]
            key = max(np.where((label >= thresholds))[0])
            count[key] += 1

        weight_per_class = 1 / np.array(count)
        weights = []

        for idx, label in enumerate(df[dataset.label].values):
            key = max(np.where((label >= thresholds))[0])
            weights += [weight_per_class[key]] * dataset.elem_per_image

        if sampler_option == "random":
            return sampler.RandomSampler(weights)
        elif sampler_option == "weighted":
            return sampler.WeightedRandomSampler(weights, len(weights))
        else:
            raise NotImplementedError(
                f"The option {sampler_option} for sampler on regression task is not implemented"
            )
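The equal-width binning used above can also be written with np.digitize; a rough standalone sketch, assuming plain float targets instead of a dataset object:

import numpy as np

values = np.array([1.2, 3.4, 3.5, 7.8, 9.9, 10.0])  # hypothetical regression targets
n_bins = 5

# Equal-width bin edges; clip so the maximum value falls into the last bin.
edges = np.linspace(values.min(), values.max(), n_bins + 1)
keys = np.clip(np.digitize(values, edges) - 1, 0, n_bins - 1)

count = np.bincount(keys, minlength=n_bins).astype(float)
weight_per_bin = np.divide(1.0, count, out=np.zeros_like(count), where=count > 0)
sample_weights = weight_per_bin[keys]
print(keys, sample_weights)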
Example #4
def get_dataloader_train(args, root, image_list):
    kwargs = {'num_workers': args.num_workers, 'pin_memory': args.pin_memory}

    dataset = ds.ClassificationDataset(root,
                                    image_list,
                                    transform=transforms.Compose([
                                    transforms.RandomRotation(3),
                                    transforms.RandomResizedCrop(224, scale=(0.74, 0.78), ratio=(1.0, 1.0)),
                                    transforms.RandomHorizontalFlip(),
                                    transforms.ToTensor(),
                                    transforms.Normalize([0.485, 0.456, 0.406], [0.229, 0.224, 0.225])]),
                                )

    prob = np.zeros(args.num_classes)
    for i in range(len(dataset)):
        cur_class = dataset.labels[i]
        prob[cur_class]+=1
    prob = 1.0 / prob

    reciprocal_weights = np.zeros(len(dataset))
    epoch_length = args.epoch_length
    for i in range(len(dataset)):
        label = dataset.labels[i]
        reciprocal_weights[i] = prob[label]
    weights  = torch.from_numpy(reciprocal_weights)

    weighted_sampler = sampler.WeightedRandomSampler(weights , epoch_length)
    loader = DataLoader(dataset, batch_size=args.batch_size,
                        sampler=weighted_sampler, **kwargs)

    return loader
Example #5
    def generate_sampler(dataset, sampler_option="random", n_bins=5):
        df = dataset.df
        n_labels = df[dataset.label].nunique()
        count = np.zeros(n_labels)

        for idx in df.index:
            label = df.loc[idx, dataset.label]
            key = dataset.label_fn(label)
            count[key] += 1

        weight_per_class = 1 / np.array(count)
        weights = []

        for idx, label in enumerate(df[dataset.label].values):
            key = dataset.label_fn(label)
            weights += [weight_per_class[key]] * dataset.elem_per_image

        if sampler_option == "random":
            return sampler.RandomSampler(weights)
        elif sampler_option == "weighted":
            return sampler.WeightedRandomSampler(weights, len(weights))
        else:
            raise NotImplementedError(
                f"The option {sampler_option} for sampler on classification task is not implemented"
            )
Example #6
def get_train_loaders(path, device, batch_size, workers, class_count):
    def to_device(x, y):
        return x.to(device), y.to(device, dtype=torch.int64)

    train_dataset = extend_dataset(
        PickledDataset(path + '/train_gray.p',
                       transform=get_train_transforms()))
    valid_dataset = PickledDataset(path + '/valid_gray.p',
                                   transform=get_test_transforms())

    # Use weighted sampler
    class_sample_count = np.bincount(train_dataset.labels)
    weights = 1 / np.array(
        [class_sample_count[y] for y in train_dataset.labels])
    samp = sampler.WeightedRandomSampler(weights, 43 * class_count)
    train_loader = WrappedDataLoader(
        DataLoader(train_dataset,
                   batch_size=batch_size,
                   sampler=samp,
                   num_workers=workers), to_device)
    valid_loader = WrappedDataLoader(
        DataLoader(valid_dataset,
                   batch_size=batch_size,
                   shuffle=False,
                   num_workers=workers), to_device)

    return train_loader, valid_loader
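One way to sanity-check a weighted sampler like the one above is to draw its indices and count the resulting label distribution; a sketch assuming a plain label array rather than the PickledDataset used here:

import numpy as np
from torch.utils.data import WeightedRandomSampler

labels = np.array([0] * 90 + [1] * 10)        # hypothetical, heavily imbalanced labels
weights = 1.0 / np.bincount(labels)[labels]   # inverse class frequency per sample

samp = WeightedRandomSampler(weights.tolist(), num_samples=1000, replacement=True)
drawn = np.array(list(samp))
print(np.bincount(labels[drawn]))             # both classes drawn roughly 500 times each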
Example #7
    def make_loader(self, batch_size=16, num_workers=0, shuffle=False,
                    pin_memory=False):
        """
        Example:
            >>> torch.random.manual_seed(0)
            >>> dset = coco_api.CocoDataset(coco_api.demo_coco_data())
            >>> self = YoloCocoDataset(dset, train=1)
            >>> loader = self.make_loader(batch_size=1)
            >>> train_iter = iter(loader)
            >>> # training batches should have multiple shapes
            >>> shapes = set()
            >>> for batch in train_iter:
            >>>     shapes.add(batch[0].shape[-1])
            >>>     if len(shapes) > 1:
            >>>         break
            >>> #assert len(shapes) > 1

            >>> vali_loader = iter(loaders['vali'])
            >>> vali_iter = iter(loaders['vali'])
            >>> # vali batches should have one shape
            >>> shapes = set()
            >>> for batch, _ in zip(vali_iter, [1, 2, 3, 4]):
            >>>     shapes.add(batch[0].shape[-1])
            >>> assert len(shapes) == 1
        """
        assert len(self) > 0, 'must have some data'
        if shuffle:
            if True:
                # If the data is not balanced we need to balance it
                index_to_weight = self._training_sample_weights()
                num_samples = len(self)
                index_to_weight = index_to_weight[:num_samples]
                sampler = torch_sampler.WeightedRandomSampler(index_to_weight,
                                                              num_samples,
                                                              replacement=True)
                sampler.data_source = self  # hack for use with multiscale
            else:
                sampler = torch_sampler.RandomSampler(self)
            resample_freq = 10
        else:
            sampler = torch_sampler.SequentialSampler(self)
            resample_freq = None

        # use custom sampler that does multiscale training
        batch_sampler = multiscale_batch_sampler.MultiScaleBatchSampler(
            sampler, batch_size=batch_size, resample_freq=resample_freq,
        )
        # torch.utils.data.sampler.WeightedRandomSampler
        loader = torch_data.DataLoader(self, batch_sampler=batch_sampler,
                                       collate_fn=nh.data.collate.padded_collate,
                                       num_workers=num_workers,
                                       pin_memory=pin_memory)
        if loader.batch_size != batch_size:
            try:
                loader.batch_size = batch_size
            except Exception:
                pass
        return loader
Example #8
def sampler_c(dataset):
    from torch.utils.data import sampler
    train_targets = dataset.classes_for_all_imgs()
    # Configure per-class sampling weights
    weights1 = torch.tensor([100, 0.01], dtype=torch.float)
    weights2 = torch.tensor([0.01, 100], dtype=torch.float)
    print(weights2)
    # Look up a sampling weight for each sample
    samples_weights1 = weights1[train_targets]
    samples_weights2 = weights2[train_targets]
    # Sampler; replacement=True means sampling with replacement
    sampler1 = sampler.WeightedRandomSampler(weights=samples_weights1,
                                             num_samples=3 * len(dataset),
                                             replacement=True)
    sampler2 = sampler.WeightedRandomSampler(weights=samples_weights2,
                                             num_samples=len(dataset),
                                             replacement=False)
    return sampler1, sampler2
Example #9
def balanced_sampler(train_labels):
    '''create a training dataloader for unbalanced class counts'''

    # For an unbalanced dataset create a weighted sampler
    class_counts, train_samples_weights = make_weights_for_balanced_classes(
        train_labels)

    # Make a sampler to undersample classes with the highest counts
    return sampler.WeightedRandomSampler(train_samples_weights,
                                         len(train_samples_weights),
                                         replacement=True)
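make_weights_for_balanced_classes is not shown in this example; a plausible minimal stand-in (an assumption, not the original helper) would return per-class counts and inverse-frequency per-sample weights:

import numpy as np

def make_weights_for_balanced_classes(train_labels):
    # Hypothetical stand-in: class counts plus an inverse-frequency weight per sample.
    train_labels = np.asarray(train_labels)
    class_counts = np.bincount(train_labels)
    sample_weights = 1.0 / class_counts[train_labels]
    return class_counts, sample_weights

counts, w = make_weights_for_balanced_classes([0, 0, 0, 1])
print(counts, w)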
Example #10
def sampler_c(dataset):
    from torch.utils.data import sampler
    train_targets = dataset.classes_for_all_imgs()
    # Configure the class sampling ratio at 4:1
    class_sample_counts = [4, 1]
    weights = torch.tensor(class_sample_counts, dtype=torch.float)
    # Look up a sampling weight for each sample
    samples_weights = weights[train_targets]
    # Sampler; replacement=True means sampling with replacement
    sampler = sampler.WeightedRandomSampler(weights=samples_weights,
                                            num_samples=len(dataset),
                                            replacement=True)
    return sampler
Example #11
def uniform_sampler(dataset):
    # dataset.imgs is an ImageFolder-style list of (path, class) pairs
    num_images = len(dataset.imgs)

    imgs_per_class = [0] * len(dataset.classes)
    for _, cl in dataset.imgs:
        imgs_per_class[cl] += 1

    # Weight each image inversely to the size of its class
    weight_per_img = [0] * num_images
    for idx, (_, cl) in enumerate(dataset.imgs):
        weight_per_img[idx] = num_images / imgs_per_class[cl]

    return sampler.WeightedRandomSampler(torch.DoubleTensor(weight_per_img),
                                         num_images)
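The same per-image weighting over an ImageFolder-style imgs list can be computed with collections.Counter; a sketch using a hypothetical imgs list in place of a real dataset:

from collections import Counter

import torch
from torch.utils.data import WeightedRandomSampler

imgs = [("a.jpg", 0), ("b.jpg", 0), ("c.jpg", 0), ("d.jpg", 1)]  # hypothetical (path, class) pairs

counts = Counter(cl for _, cl in imgs)
weights = torch.DoubleTensor([1.0 / counts[cl] for _, cl in imgs])
samp = WeightedRandomSampler(weights, num_samples=len(imgs), replacement=True)
print(list(samp))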
Example #12
    def sample_inference(self, token_seqs):
        softmax = nn.Softmax()
        batch_size = len(token_seqs)
        seq_len = len(token_seqs[0])
        encoder_hidden = Variable(self.encoder.initHidden(batch_size)).double()
        encoder_output = Variable(self.encoder.initHidden(batch_size)).double()
        if use_cuda:
            encoder_hidden = encoder_hidden.cuda()
            encoder_output = encoder_output.cuda()

        hidden = (encoder_output, encoder_hidden)
        for i in np.arange(seq_len - 1, 0, -1):
            token_batch = np.array(self.embeddings[token_seqs[:, i]])
            encoder_input = Variable(torch.from_numpy(token_batch)).view(
                1, batch_size, -1).double()
            encoder_input = encoder_input.cuda() if use_cuda else encoder_input
            hidden = self.encoder(encoder_input, hidden)

        encoder_output, encoder_hidden = hidden

        token_batch = np.array(self.embeddings[[SOS_TOKEN] * batch_size])
        decoder_output = Variable(self.decoder.initHidden(batch_size)).double()
        if use_cuda:
            decoder_output = decoder_output.cuda()

        hidden = (decoder_output, encoder_hidden)

        pred_seqs = None
        for i in range(250):
            decoder_input = Variable(torch.from_numpy(token_batch)).double()
            decoder_input = decoder_input.cuda() if use_cuda else decoder_input
            decoder_input = decoder_input.squeeze().view(1, batch_size, -1)
            decoder_output, hidden = self.decoder(decoder_input, hidden)

            output = softmax(decoder_output).data.cpu()
            output = output.numpy()
            output = output.tolist()

            out_iter = sampler.WeightedRandomSampler(output, len(output))

            for idx in out_iter:
                ni = np.array([[idx]])
                break

            if pred_seqs is None:
                pred_seqs = ni
            else:
                pred_seqs = np.concatenate((pred_seqs, ni), axis=1)
            token_batch = np.array(self.embeddings[ni])

        return pred_seqs.tolist()
Example #13
    def train(self, path, verbose=True):
        self.data = SplitData(path, self.index)
        self.index = self.data.index_dict  # new index from non-pretrained version

        train_set = self.data('train', random_crop_size=self.random_crop_size)
        if self.sample_amount:
            self.sampler = sampler.WeightedRandomSampler(torch.ones(len(train_set)), self.sample_amount)
        self.train_loader = DataLoader(train_set, batch_size=self.batchsize[0], sampler=self.sampler,
                                       num_workers=self.numloader)
        self.val_loader = DataLoader(self.data('val'), batch_size=self.batchsize[1], num_workers=self.numloader)

        start_time = time.time()
        for epoch in range(self.epochs):
            self.current_epoch = epoch
            epoch_time = time.time()

            train_loss = self.model_iter(self.train_loader)
            self.output_list = torch.empty([0]).to(device)  # reset
            val_loss = self.model_iter(self.val_loader)

            epoch_time = time.time() - epoch_time

            # codes below are for printing
            if (epoch + 1) % 1 == 0 and verbose:
                print("Epoch {} finished, current validation loss is {:1.5f}, current train loss is {:1.5f}".format(
                    epoch + 1,
                    val_loss, train_loss, timeformat(epoch_time)))
                print('It takes {} from beginning'.format(timeformat(time.time() - start_time)))

            # codes below are for sustainability
            if val_loss < self.minimum_val_loss:
                self.minimum_val_loss = val_loss
                save_model(self.model, epoch + 1, train_loss, val_loss, self.index,
                           self.optimizer.param_groups[0]['lr'], time.strftime('%dd-%Hh-%Mm', time.localtime(start_time)))

            # codes below are for visualization
            loss_dict = {'{}_loss'.format(self.model_name): {'train': train_loss, 'validation': val_loss}}
            visualize(loss_dict, epoch, mode='scalar_dict')

            img_dict = {'{} results'.format(self.model_name): self.output_list}

            if self.model_name == 'all':
                nrow = 10
            elif self.model_name == 'spn':
                nrow = 8
            else:
                nrow = 3
            visualize(img_dict, epoch, mode='image', nrow=nrow)

        self.test_unknown('./test')
Example #14
File: data.py Project: SEU-DongHan/BDNet
def test_weight_sampler():
    from torch.utils.data import sampler
    weight = [1] * 30
    weight[:10] = [3] * 10
    weight_sampler = sampler.WeightedRandomSampler(weight,
                                                   num_samples=len(weight))
    batch_sampler = sampler.BatchSampler(weight_sampler,
                                         batch_size=4,
                                         drop_last=False)
    for indices in batch_sampler:
        print(indices)
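A BatchSampler wrapped around a WeightedRandomSampler, as in the test above, can also be handed straight to a DataLoader via the batch_sampler argument; a small sketch over a TensorDataset (the data below is made up):

import torch
from torch.utils.data import DataLoader, TensorDataset
from torch.utils.data import sampler

data = TensorDataset(torch.arange(30).float().unsqueeze(1))
weight = [3.0] * 10 + [1.0] * 20  # favour the first ten samples

weight_sampler = sampler.WeightedRandomSampler(weight, num_samples=len(weight))
batch_sampler = sampler.BatchSampler(weight_sampler, batch_size=4, drop_last=False)

# When batch_sampler is given, batch_size, shuffle and sampler must keep their defaults.
loader = DataLoader(data, batch_sampler=batch_sampler)
for (batch,) in loader:
    print(batch.squeeze(1))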
Example #15
 def init_dataloader(self, dset):
     class_sample_count = [len(c) for c in dset.classes]
     weights = 1 / torch.Tensor(class_sample_count)
     # Zero out non-finite weights produced by empty classes (inf + 1 == inf)
     weights[~((weights + 1) != weights)] = 0
     weight_per_sample = [0] * len(dset)
     for i in range(len(dset)):
         c, cind = dset.index_to_sample_d[i]
         weight_per_sample[i] = weights[c]
     self.trainsampler = sampler.WeightedRandomSampler(
         weight_per_sample, len(dset))
     self.trainloader = DataLoader(dset,
                                   batch_size=self.opt.bSz,
                                   pin_memory=True,
                                   num_workers=self.opt.n_workers,
                                   sampler=self.trainsampler,
                                   drop_last=True)
Example #16
def generate_sampler(dataset, sampler_option='random', step=1):
    """
    Returns sampler according to the wanted options

    :param dataset: (MRIDataset) the dataset to sample from
    :param sampler_option: (str) choice of sampler
    :param step: (int) step to discretize ages and give a weight per class
    :return: (Sampler)
    """

    df = dataset.df
    min_age = np.min(df.age)
    max_age = np.max(df.age)

    if (max_age - min_age) % step == 0:
        max_age += step

    bins = np.arange(min_age, max_age, step)
    count = np.zeros(len(bins))
    for idx in df.index:
        age = df.loc[idx, "age"]
        key = np.argmax(np.logical_and(age - step < bins,
                                       age >= bins)).astype(int)
        count[key] += 1

    # weight_per_class = (1 / np.array(count)) if count.any() != 0 else 0.
    weight_per_class = np.zeros_like(count).astype(float)
    np.divide(1., count, out=weight_per_class, where=count != 0)
    weights = [0] * len(df)

    for idx, age in enumerate(df.age.values):
        key = np.argmax(np.logical_and(age - step < bins,
                                       age >= bins)).astype(int)
        weights[idx] = weight_per_class[key]

    weights = torch.FloatTensor(weights)

    if sampler_option == 'random':
        s = sampler.RandomSampler(dataset, replacement=False)
    elif sampler_option == 'weighted':
        s = sampler.WeightedRandomSampler(weights, len(weights))
    else:
        raise NotImplementedError(
            "The option %s for sampler is not implemented" % sampler_option)

    return s
Example #17
def get_dataloader(dataset,
                   balance_data,
                   batch_size,
                   num_workers,
                   shuffle=True):
    if balance_data:
        weights = dataset.get_data_weights(balance_data)
        sampler_ = sampler.WeightedRandomSampler(weights, len(weights))
    elif shuffle:
        sampler_ = sampler.RandomSampler(dataset)
    else:
        sampler_ = None
    dataloader = torch.utils.data.DataLoader(dataset,
                                             batch_size=batch_size,
                                             sampler=sampler_,
                                             num_workers=num_workers)

    return dataloader
Example #18
def ClassificationDatasetTest():
    root = '/media/shehabk/E_DRIVE/processed_db/expw/cropped_images/cropped_alligned_orig_256'
    image_list = '/media/shehabk/E_DRIVE/processed_db/expw/partition/seven_neutral_alligned_orig/train.txt'
    dataset = ClassificationDataset(
        root,
        image_list,
        transform=transforms.Compose([
            transforms.RandomRotation(3),
            transforms.Resize((118, 100)),
            transforms.RandomCrop((112, 96)),
            # transforms.RandomResizedCrop(224, scale=(0.74, 0.78),
            #                              ratio=(1.0, 1.0)),
            transforms.RandomHorizontalFlip(),
            transforms.ToTensor(),
            transforms.Normalize([0.485, 0.456, 0.406], [0.229, 0.224, 0.225])
        ]))

    # Balancing the classes !!!!!!
    prob = np.zeros(3)
    for i in range(len(dataset)):
        cur_class = dataset[i][1]
        prob[cur_class] += 1
    prob = 1.0 / prob

    reciprocal_weights = np.zeros(len(dataset))
    epoch_length = 2000
    for i in range(len(dataset)):
        _, label = dataset[i]
        reciprocal_weights[i] = prob[label]
    weights = torch.from_numpy(reciprocal_weights)

    weighted_sampler = sampler.WeightedRandomSampler(weights, epoch_length)
    loader = DataLoader(dataset, batch_size=10, sampler=weighted_sampler)

    # loader = DataLoader(dataset, batch_size=5, shuffle=True)
    for i_batch, data in enumerate(loader):
        images, labels = data
        print(labels)
        out = utils.make_grid(images, nrow=1)
        imshow(out)

        # vs.imshow(out , mean = [0.524462] , std = [0.285962])
        # vs.imshow(out, mean=[0.524462, 0.524462, 0.524462], std=[0.285962, 0.285962, 0.285962])
        break
Example #19
 def train_dataloader(self):
     if self.hparams.weight_sample:
         spl = sampler.WeightedRandomSampler(
             self.train_ds.target_weights, int(len(self.train_ds) / self.hparams.epoch_cut), replacement=True
         )
         return DataLoader(
             self.train_ds,
             batch_size=self.hparams.batch_size,
             sampler=spl,
             num_workers=self.hparams.num_workers,
             pin_memory=True,
         )
     else:
         return DataLoader(
             self.train_ds,
             batch_size=self.hparams.batch_size,
             shuffle=True,
             num_workers=self.hparams.num_workers,
             pin_memory=True,
         )
Example #20
def load_RM_data(path,
                 batch_size,
                 length,
                 use_embedding,
                 balanced_sampler=False):

    train = RMdata(path,
                   use_embedding=use_embedding,
                   length=length,
                   mode='train')

    valid = RMdata(path,
                   use_embedding=use_embedding,
                   length=length,
                   mode='valid')

    if not balanced_sampler:
        train_loader = DataLoader(dataset=train,
                                  batch_size=batch_size,
                                  shuffle=True)
    else:
        weights_train = make_weights_for_balanced_classes(train)
        # weights_valid = make_weights_for_balanced_classes(valid)

        weights_train = torch.cuda.DoubleTensor(weights_train)
        # weights_valid = torch.cuda.DoubleTensor(weights_valid)

        sampler_train = sampler.WeightedRandomSampler(weights_train,
                                                      len(weights_train))
        # sampler_valid = sampler.WeightedRandomSampler(weights_valid, len(weights_valid))

        train_loader = DataLoader(dataset=train,
                                  batch_size=batch_size,
                                  sampler=sampler_train)
        # valid_loader = DataLoader(dataset=valid,batch_size=batch_size,sampler=sampler_valid)

    valid_loader = DataLoader(dataset=valid,
                              batch_size=batch_size,
                              shuffle=True)

    return train_loader, valid_loader
Example #21
def dataloader(x_mal, x_ben):
    y_mal = np.ones(x_mal.shape[0])
    y_ben = np.zeros(x_ben.shape[0])
    x = np.concatenate([x_mal, x_ben])
    y = np.concatenate([y_mal, y_ben])
    class_sample_count = np.array(
        [len(np.where(y == t)[0]) for t in np.unique(y)])
    weight = 1. / class_sample_count
    samples_weight = []
    for t in range(len(y)):
        samples_weight.append(weight[int(y[t])])
    Sampler = sampler.WeightedRandomSampler(samples_weight,
                                            len(samples_weight))
    data = TensorDataset(torch.from_numpy(x), torch.from_numpy(y))
    data_loader = DataLoader(data,
                             batch_size=batch_size,
                             sampler=Sampler,
                             drop_last=True)
    return data_loader
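The per-sample weight loop above can be vectorised with np.unique(..., return_counts=True); a sketch assuming the same concatenated label vector y:

import numpy as np

y = np.array([1, 1, 1, 0, 0, 0, 0, 0, 0, 0])  # hypothetical malware/benign labels

_, class_sample_count = np.unique(y, return_counts=True)
weight = 1.0 / class_sample_count
samples_weight = weight[y.astype(int)]        # one weight per sample, no Python loop
print(samples_weight)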


Example #22
    def make_train_valid_loaders(self,
                                 distributed=False
                                 ) -> Tuple[DataLoader, DataLoader]:
        train_dataset, valid_dataset = self.make_train_valid_datasets()

        train_weights = torch.DoubleTensor(
            [1.0] * len(train_dataset))  # uniform sampling
        train_sampler = sampler.WeightedRandomSampler(
            weights=train_weights,
            num_samples=self._data_params['batch_size'] *
            self._data_params['steps_per_epoch'],
        )
        train_loader = self._make_loader(train_dataset,
                                         train_sampler,
                                         mode='train',
                                         distributed=distributed)
        valid_loader = self._make_loader(
            valid_dataset,
            sampler.SequentialSampler(valid_dataset),
            mode='valid',
            distributed=distributed,
        )
        return train_loader, valid_loader
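The pattern above fixes the number of optimisation steps per epoch by setting num_samples = batch_size * steps_per_epoch; a stripped-down sketch of just that calculation (the parameter values are illustrative):

import torch
from torch.utils.data import WeightedRandomSampler

batch_size, steps_per_epoch, dataset_size = 32, 100, 50000

# Uniform weights: the sampler only controls epoch length here, not class balance.
weights = torch.ones(dataset_size, dtype=torch.double)
train_sampler = WeightedRandomSampler(weights, num_samples=batch_size * steps_per_epoch)
print(len(list(train_sampler)))  # 3200 indices per epoch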
Example #23
def get_dataloader_obj(data_dir,
                       data_transforms,
                       weights,
                       num_samples,
                       is_slr=False,
                       data_types=['train', 'test', 'val'],
                       bs=4):
    try:
        slr = sampler.WeightedRandomSampler(weights, num_samples)
        image_datasets = {
            x: datasets.ImageFolder(os.path.join(data_dir, x),
                                    data_transforms[x])
            for x in data_types
        }
        dataloaders = {
            x: DataLoader(image_datasets[x],
                          batch_size=bs,
                          shuffle=True,
                          num_workers=bs)
            for x in data_types
        }
        if is_slr:
            dataloaders = {
                x: DataLoader(image_datasets[x],
                              batch_size=bs,
                              sampler=slr,
                              num_workers=bs)
                for x in data_types
            }
        dsizes = {x: len(image_datasets[x]) for x in data_types}
        class_names = image_datasets['train'].classes
        dev = device("cuda:0" if cuda.is_available() else "cpu")
        return dataloaders, dsizes, class_names, dev
    except Exception as e:
        print(traceback.format_exc())
        raise e
Example #24
def main(train_root, train_csv, train_split, val_root, val_csv, val_split,
         epochs, aug, model_name, batch_size, num_workers, val_samples,
         early_stopping_patience, n_classes, weighted_loss, balanced_loader,
         _run):
    assert (model_name
            in ('inceptionv4', 'resnet152', 'densenet161', 'senet154'))

    AUGMENTED_IMAGES_DIR = os.path.join(fs_observer.dir, 'images')
    CHECKPOINTS_DIR = os.path.join(fs_observer.dir, 'checkpoints')
    BEST_MODEL_PATH = os.path.join(CHECKPOINTS_DIR, 'model_best.pth')
    LAST_MODEL_PATH = os.path.join(CHECKPOINTS_DIR, 'model_last.pth')
    for directory in (AUGMENTED_IMAGES_DIR, CHECKPOINTS_DIR):
        os.makedirs(directory)

    device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

    if model_name == 'inceptionv4':
        model = ptm.inceptionv4(num_classes=1000, pretrained='imagenet')
        model.last_linear = nn.Linear(model.last_linear.in_features, n_classes)
        aug['size'] = 299
        aug['mean'] = model.mean
        aug['std'] = model.std
    elif model_name == 'resnet152':
        model = models.resnet152(pretrained=True)
        model.fc = nn.Linear(model.fc.in_features, n_classes)
        aug['size'] = 224
        aug['mean'] = [0.485, 0.456, 0.406]
        aug['std'] = [0.229, 0.224, 0.225]
    elif model_name == 'densenet161':
        model = models.densenet161(pretrained=True)
        model.classifier = nn.Linear(model.classifier.in_features, n_classes)
        aug['size'] = 224
        aug['mean'] = [0.485, 0.456, 0.406]
        aug['std'] = [0.229, 0.224, 0.225]
    elif model_name == 'senet154':
        model = ptm.senet154(num_classes=1000, pretrained='imagenet')
        model.last_linear = nn.Linear(model.last_linear.in_features, n_classes)
        aug['size'] = model.input_size[1]
        aug['mean'] = model.mean
        aug['std'] = model.std
    model.to(device)

    augs = Augmentations(**aug)
    model.aug_params = aug

    train_ds = CSVDatasetWithName(train_root,
                                  train_csv,
                                  'image',
                                  'label',
                                  transform=augs.tf_transform,
                                  add_extension='.jpg',
                                  split=train_split)
    val_ds = CSVDatasetWithName(val_root,
                                val_csv,
                                'image',
                                'label',
                                transform=augs.tf_transform,
                                add_extension='.jpg',
                                split=val_split)

    datasets = {'train': train_ds, 'val': val_ds}

    if balanced_loader:
        data_sampler = sampler.WeightedRandomSampler(train_ds.sampler_weights,
                                                     len(train_ds))
        shuffle = False
    else:
        data_sampler = None
        shuffle = True

    dataloaders = {
        'train':
        DataLoader(datasets['train'],
                   batch_size=batch_size,
                   shuffle=shuffle,
                   num_workers=num_workers,
                   sampler=data_sampler,
                   worker_init_fn=set_seeds),
        'val':
        DataLoader(datasets['val'],
                   batch_size=batch_size,
                   shuffle=False,
                   num_workers=num_workers,
                   worker_init_fn=set_seeds),
    }

    if weighted_loss:
        criterion = nn.CrossEntropyLoss(
            weight=torch.Tensor(datasets['train'].class_weights_list).cuda())
    else:
        criterion = nn.CrossEntropyLoss()

    optimizer = optim.SGD(model.parameters(),
                          lr=0.001,
                          momentum=0.9,
                          weight_decay=0.001)

    scheduler = optim.lr_scheduler.ReduceLROnPlateau(optimizer,
                                                     factor=0.1,
                                                     min_lr=1e-5,
                                                     patience=10)

    metrics = {
        'train': pd.DataFrame(columns=['epoch', 'loss', 'acc']),
        'val': pd.DataFrame(columns=['epoch', 'loss', 'acc'])
    }

    best_val_loss = 1000.0
    epochs_without_improvement = 0
    batches_per_epoch = None

    for epoch in range(epochs):
        print('train epoch {}/{}'.format(epoch + 1, epochs))
        epoch_train_result = train_epoch(device, model, dataloaders, criterion,
                                         optimizer, 'train', batches_per_epoch)

        metrics['train'] = metrics['train'].append(
            {
                **epoch_train_result, 'epoch': epoch
            }, ignore_index=True)
        print('train', epoch_train_result)

        epoch_val_result = train_epoch(device, model, dataloaders, criterion,
                                       optimizer, 'val', batches_per_epoch)

        metrics['val'] = metrics['val'].append(
            {
                **epoch_val_result, 'epoch': epoch
            }, ignore_index=True)
        print('val', epoch_val_result)

        scheduler.step(epoch_val_result['loss'])

        if epoch_val_result['loss'] < best_val_loss:
            best_val_loss = epoch_val_result['loss']
            epochs_without_improvement = 0
            torch.save(model, BEST_MODEL_PATH)
            print('Best loss at epoch {}'.format(epoch))
        else:
            epochs_without_improvement += 1

        print('-' * 40)

        if epochs_without_improvement > early_stopping_patience:
            torch.save(model, LAST_MODEL_PATH)
            break

        if epoch == (epochs - 1):
            torch.save(model, LAST_MODEL_PATH)

    for phase in ['train', 'val']:
        metrics[phase].epoch = metrics[phase].epoch.astype(int)
        metrics[phase].to_csv(os.path.join(fs_observer.dir, phase + '.csv'),
                              index=False)

    print('Best validation loss: {}'.format(best_val_loss))

    # TODO: return more metrics
    return {'max_val_acc': metrics['val']['acc'].max()}
Example #25
def get_dataloader_train(args, root, image_list):
    # kwargs = {'num_workers': 4, 'pin_memory': True} if args.cuda else {}
    kwargs = {'num_workers': args.num_workers, 'pin_memory': args.pin_memory}

    if (args.architecture in ['resnet_i48_18']):

        dataset = ds.ClassificationDataset(
            root,
            image_list,
            transform=transforms.Compose([
                transforms.Resize(60),
                transforms.RandomRotation(3),
                transforms.RandomCrop(48),
                # transforms.RandomResizedCrop(48, scale = (0.72,0.76) , ratio = (1.0,1.0)),
                transforms.RandomHorizontalFlip(),
                transforms.ToTensor(),
                transforms.Normalize((0.5, ), (0.28, ))
            ]),
            is_grey=False)

    elif args.architecture in ['resnet_i24_34']:
        dataset = ds.ClassificationDataset(
            root,
            image_list,
            transform=transforms.Compose([
                transforms.Resize(28),
                transforms.RandomRotation(3),
                transforms.RandomCrop(24),
                # transforms.RandomResizedCrop(48, scale = (0.72,0.76) , ratio = (1.0,1.0)),
                transforms.RandomHorizontalFlip(),
                transforms.ToTensor(),
                transforms.Normalize([0.485, 0.456, 0.406],
                                     [0.229, 0.224, 0.225])
            ]),
            is_grey=False)

    else:
        dataset = ds.ClassificationDataset(
            root,
            image_list,
            transform=transforms.Compose([
                transforms.Resize(256),
                transforms.RandomRotation(3),
                # transforms.RandomCrop(224),
                transforms.RandomResizedCrop(224,
                                             scale=(0.74, 0.85),
                                             ratio=(1.0, 1.0)),
                transforms.RandomHorizontalFlip(),
                transforms.ToTensor(),
                transforms.Normalize([0.485, 0.456, 0.406],
                                     [0.229, 0.224, 0.225])
            ]),
        )

    # Balancing the classes !!!!!!
    prob = np.zeros(args.num_classes)
    for i in range(len(dataset)):
        cur_class = dataset.labels[i]
        prob[cur_class] += 1
    prob = 1.0 / prob

    reciprocal_weights = np.zeros(len(dataset))
    epoch_length = args.epoch_length
    for i in range(len(dataset)):
        label = dataset.labels[i]
        reciprocal_weights[i] = prob[label]
    weights = torch.from_numpy(reciprocal_weights)

    weighted_sampler = sampler.WeightedRandomSampler(weights, epoch_length)
    loader = DataLoader(dataset,
                        batch_size=args.batch_size,
                        sampler=weighted_sampler,
                        **kwargs)
    # loader = DataLoader(dataset, batch_size=args.batch_size, shuffle=True)

    return loader
Example #26
    }
else:
    dsets = {
        x: MyImageFolder(root=args.data_root,
                         idx_file=idx_files[x],
                         transform=data_transforms[x])
        for x in data_splits
    }

shuffle_options = {'train': True, 'val': False, 'test': False}

if args.weighted_sample:
    sample_weight = dsets['train'].get_sample_weights()
    samplers = {
        'train':
        sampler.WeightedRandomSampler(weights=sample_weight,
                                      num_samples=len(dsets['train'])),
        'val':
        None,
        'test':
        None
    }
else:  # RandomSampler in train phase, SequentialSampler in val and test phase.
    samplers = {'train': None, 'val': None, 'test': None}

dset_loaders = {
    x: torch.utils.data.DataLoader(
        dsets[x],
        batch_size=args.batch_size,
        shuffle=shuffle_options[x] if samplers[x] is None else False,
        sampler=samplers[
            x],  # If a sampler is specified, 'shuffle' must be False
Example #27
def train(arguments):

    # Parse input arguments
    json_filename = arguments.config
    network_debug = arguments.debug

    # Load options
    json_opts = json_file_to_pyobj(json_filename)
    train_opts = json_opts.training

    # Architecture type
    arch_type = train_opts.arch_type

    # Setup Dataset and Augmentation
    ds_class = get_dataset(arch_type)
    ds_path = get_dataset_path(arch_type, json_opts.data_path)
    ds_transform = get_dataset_transformation(arch_type,
                                              opts=json_opts.augmentation)

    # Setup the NN Model
    model = get_model(json_opts.model)
    if network_debug:
        print('# of pars: ', model.get_number_parameters())
        print('fp time: {0:.3f} sec\tbp time: {1:.3f} sec per sample'.format(
            *model.get_fp_bp_time()))
        exit()

    # Setup Data Loader
    num_workers = train_opts.num_workers if hasattr(train_opts,
                                                    'num_workers') else 16
    train_dataset = ds_class(ds_path,
                             split='train',
                             transform=ds_transform['train'],
                             preload_data=train_opts.preloadData)
    valid_dataset = ds_class(ds_path,
                             split='val',
                             transform=ds_transform['valid'],
                             preload_data=train_opts.preloadData)
    test_dataset = ds_class(ds_path,
                            split='test',
                            transform=ds_transform['valid'],
                            preload_data=train_opts.preloadData)

    # create sampler
    if train_opts.sampler == 'stratified':
        print('stratified sampler')
        train_sampler = StratifiedSampler(train_dataset.labels,
                                          train_opts.batchSize)
        batch_size = 52
    elif train_opts.sampler == 'weighted2':
        print('weighted sampler with background weight={}x'.format(
            train_opts.bgd_weight_multiplier))
        # modify and increase background weight
        weight = train_dataset.weight
        bgd_weight = np.min(weight)
        weight[abs(weight - bgd_weight) <
               1e-8] = bgd_weight * train_opts.bgd_weight_multiplier
        train_sampler = sampler.WeightedRandomSampler(
            weight, len(train_dataset.weight))
        batch_size = train_opts.batchSize
    else:
        print('weighted sampler')
        train_sampler = sampler.WeightedRandomSampler(
            train_dataset.weight, len(train_dataset.weight))
        batch_size = train_opts.batchSize

    # loader
    train_loader = DataLoader(dataset=train_dataset,
                              num_workers=num_workers,
                              batch_size=batch_size,
                              sampler=train_sampler)
    valid_loader = DataLoader(dataset=valid_dataset,
                              num_workers=num_workers,
                              batch_size=train_opts.batchSize,
                              shuffle=True)
    test_loader = DataLoader(dataset=test_dataset,
                             num_workers=num_workers,
                             batch_size=train_opts.batchSize,
                             shuffle=True)

    # Visualisation Parameters
    visualizer = Visualiser(json_opts.visualisation, save_dir=model.save_dir)
    error_logger = ErrorLogger()

    # Training Function
    track_labels = np.arange(len(train_dataset.label_names))
    model.set_labels(track_labels)
    model.set_scheduler(train_opts)

    if hasattr(model, 'update_state'):
        model.update_state(0)

    for epoch in range(model.which_epoch, train_opts.n_epochs):
        print('(epoch: %d, total # iters: %d)' % (epoch, len(train_loader)))

        # # # --- Start ---
        # import matplotlib.pyplot as plt
        # plt.ion()
        # plt.figure()
        # target_arr = np.zeros(14)
        # # # --- End ---

        # Training Iterations
        for epoch_iter, (images, labels) in tqdm(enumerate(train_loader, 1),
                                                 total=len(train_loader)):
            # Make a training update
            model.set_input(images, labels)
            model.optimize_parameters()

            if epoch == (train_opts.n_epochs - 1):
                import time
                time.sleep(36000)

            if train_opts.max_it == epoch_iter:
                break

            # # # --- visualise distribution ---
            # for lab in labels.numpy():
            #     target_arr[lab] += 1
            # plt.clf(); plt.bar(train_dataset.label_names, target_arr); plt.pause(0.01)
            # # # --- End ---

            # Visualise predictions
            if epoch_iter <= 100:
                visuals = model.get_current_visuals()
                visualizer.display_current_results(visuals,
                                                   epoch=epoch,
                                                   save_result=False)

            # Error visualisation
            errors = model.get_current_errors()
            error_logger.update(errors, split='train')

        # Validation and Testing Iterations
        pr_lbls = []
        gt_lbls = []
        for loader, split in zip([valid_loader, test_loader],
                                 ['validation', 'test']):
            model.reset_results()

            for epoch_iter, (images, labels) in tqdm(enumerate(loader, 1),
                                                     total=len(loader)):

                # Make a forward pass with the model
                model.set_input(images, labels)
                model.validate()

                # Visualise predictions
                visuals = model.get_current_visuals()
                visualizer.display_current_results(visuals,
                                                   epoch=epoch,
                                                   save_result=False)

                if train_opts.max_it == epoch_iter:
                    break

            # Error visualisation
            errors = model.get_accumulated_errors()
            stats = model.get_classification_stats()
            error_logger.update({**errors, **stats}, split=split)

            # HACK save validation error
            if split == 'validation':
                valid_err = errors['CE']

        # Update the plots
        for split in ['train', 'validation', 'test']:
            # exclude bckground
            #track_labels = np.delete(track_labels, 3)
            #show_labels = train_dataset.label_names[:3] + train_dataset.label_names[4:]
            show_labels = train_dataset.label_names
            visualizer.plot_current_errors(epoch,
                                           error_logger.get_errors(split),
                                           split_name=split,
                                           labels=show_labels)
            visualizer.print_current_errors(epoch,
                                            error_logger.get_errors(split),
                                            split_name=split)
        error_logger.reset()

        # Save the model parameters
        if epoch % train_opts.save_epoch_freq == 0:
            model.save(epoch)

        if hasattr(model, 'update_state'):
            model.update_state(epoch)

        # Update the model learning rate
        model.update_learning_rate(metric=valid_err, epoch=epoch)
Example #28
print('#vehicles found: ', data_splitter.num_of_vehicles)

# Setup training and validation data loaders.
flag_data_augmentation = False
flag_with_intensities = False
hdf5_dataset = HDF5Dataset(data_splitter,
                           'training',
                           flag_data_augmentation=flag_data_augmentation,
                           flag_with_intensities=flag_with_intensities)
hdf5_dataset_val = HDF5Dataset(data_splitter,
                               'validation',
                               flag_data_augmentation=False,
                               flag_with_intensities=flag_with_intensities)

# Weighted sampling: sample pedestrians more than vehicles.
sampler = sampler.WeightedRandomSampler(sample_weights, len(sample_weights))
train_loader = DataLoader(hdf5_dataset,
                          batch_size=opt.batchSize,
                          sampler=sampler,
                          shuffle=False)  #, num_workers=2)
val_loader = DataLoader(hdf5_dataset_val,
                        batch_size=opt.batchSize,
                        shuffle=False)

# print(len(dataset), len(test_dataset))
num_classes = 2
# print('classes', num_classes)

try:
    os.makedirs(opt.outf)
except OSError:
    pass
Example #29
    print("Training set distribution:")
    print_distribution(ids_train)

    print("Validation set distribution:")
    print_distribution(ids_val)

    classes_train = [get_class(idx.split('/')[-2]) for idx in ids_train]
    class_weight = class_weight.compute_class_weight('balanced',
                                                     np.unique(classes_train),
                                                     classes_train)
    classes_val = [get_class(idx.split('/')[-2]) for idx in ids_val]

    weights = [class_weight[i_class] for i_class in classes_train]
    weights = torch.DoubleTensor(weights)
    train_sampler = sampler.WeightedRandomSampler(weights, len(weights))

    weights = [class_weight[i_class] for i_class in classes_val]
    weights = torch.DoubleTensor(weights)
    val_sampler = sampler.WeightedRandomSampler(weights, len(weights))

    train_dataset = IEEECameraDataset(ids_train,
                                      crop_size=CROP_SIZE,
                                      training=True,
                                      model=m_names)
    val_dataset = IEEECameraDataset(ids_val,
                                    crop_size=CROP_SIZE,
                                    training=False,
                                    model=m_names)

    train_loader = DataLoader(train_dataset,
Example #30
    def build(task, config, task_subset_name):
        """
        Static method returning a particular sampler, depending on the name \
        provided in the list of parameters and the specified task class.

        :param task: Instance of an object derived from the Task class.
        :type task: ``tasks.Task``

        :param config: Parameters used to instantiate the sampler.
        :type config: :py:class:`ptp.configuration.ConfigInterface`

        :param task_subset_name: Name of task subset (and associated TaskManager object)

        ..note::

            ``config`` should contain the exact (case-sensitive) class name of the sampler to instantiate.


        .. warning::

            ``torch.utils.data.sampler.BatchSampler``, \
            ``torch.utils.data.sampler.DistributedSampler`` are not supported yet.

        .. note::

            ``torch.utils.data.sampler.SubsetRandomSampler`` expects 'indices' to index a subset of the dataset. \
             Currently, the user can specify these indices using one of the following options:

            - Option 1: range.
                >>> indices = range(20)

            - Option 2: range as str.
                >>> range_str = '0, 20'

            - Option 3: list of indices.
                >>> yaml_list = yaml.load('[0, 2, 5, 10]')

            - Option 4: name of the file containing indices.
                >>> filename = "~/data/mnist/training_indices.txt"

        .. note::

            ``torch.utils.data.sampler.WeightedRandomSampler`` expects an additional parameter 'weights'.

        :return: Instance of a given sampler or ``None`` if the section not present or couldn't build the sampler.

        """
        # Initialize logger.
        logger = logging.initialize_logger('SamplerFactory')

        try:
            # Check presence of the typename attribute.
            if 'type' not in config:
                raise ConfigurationError(
                    "The sampler configuration section does not contain the key 'type'"
                )

            # Get the class typename.
            typename = config['type']
            logger.info(
                'Trying to instantiate the {} sampler object'.format(typename))

            ###########################################################################
            # Handle first special case: SubsetRandomSampler.
            if typename == 'SubsetRandomSampler':

                # Check presence of the 'indices' attribute.
                if 'indices' not in config:
                    raise ConfigurationError(
                        "The sampler configuration section does not contain the key 'indices' "
                        "required by SubsetRandomSampler")

                # Get and process the indices.
                indices = config['indices']

                # Analyze the type.
                if type(indices) == str:
                    # Try to open the file.
                    try:
                        # from expanduser()'s doc: If the expansion fails or if the path does not begin
                        # with a tilde, the path is returned unchanged. -> So operation below should be safe.
                        file = open(os.path.expanduser(indices), "r")
                        # Read the file.
                        indices = file.readline()
                        file.close()

                    except Exception:
                        # Ok, this is not a file.
                        pass
                    finally:
                        # Try to process it as a string.
                        # Get the digits.
                        digits = indices.split(',')
                        indices = [int(x) for x in digits]
                else:
                    # Assume that type(indices) is a list of ints.
                    digits = indices

                # Finally, we got the list of digits.
                if len(digits) == 2:
                    # Create a range.
                    indices = range(int(digits[0]), int(digits[1]))
                # Else: use them as they are, including single index.

                # Check if indices are within range.
                if max(indices) >= len(task):
                    raise ConfigurationError(
                        "SubsetRandomSampler cannot work properly when indices are out of range ({}) "
                        "considering that there are {} samples in the task".
                        format(max(indices), len(task)))

                # Create the sampler object.
                sampler = pt_samplers.SubsetRandomSampler(indices)

            ###########################################################################
            # Handle second special case: WeightedRandomSampler.
            elif typename == 'WeightedRandomSampler':

                # Check presence of the attribute.
                if 'weights' not in config:
                    raise ConfigurationError(
                        "The sampler configuration section does not contain the key 'weights' "
                        "required by WeightedRandomSampler")

                # Load weights from file.
                weights = np.fromfile(os.path.expanduser(config['weights']),
                                      dtype=float,
                                      count=-1,
                                      sep=',')

                # Create sampler class.
                sampler = pt_samplers.WeightedRandomSampler(weights,
                                                            len(task),
                                                            replacement=True)

            ###########################################################################
            # Handle third special case: kFoldRandomSampler.
            elif typename == 'kFoldRandomSampler':

                # Check presence of the attribute.
                if 'folds' not in config:
                    raise ConfigurationError(
                        "The sampler configuration section does not contain the key 'folds' "
                        "required by kFoldRandomSampler")

                # Create indices, depending on the fold.
                folds = config["folds"]
                if folds < 2:
                    raise ConfigurationError(
                        "kFoldRandomSampler requires  at least two 'folds'")
                # Get epochs per fold (default: 1).
                epochs_per_fold = config.get("epochs_per_fold", 1)

                # Create the sampler object.
                sampler = ptp_samplers.kFoldRandomSampler(
                    len(task), folds, epochs_per_fold,
                    task_subset_name == 'training')

            ###########################################################################
            # Handle fourth special case: kFoldWeightedRandomSampler.
            elif typename == 'kFoldWeightedRandomSampler':

                # Check presence of the attribute.
                if 'weights' not in config:
                    raise ConfigurationError(
                        "The sampler configuration section does not contain the key 'weights' "
                        "required by kFoldWeightedRandomSampler")

                # Load weights from file.
                weights = np.fromfile(os.path.expanduser(config['weights']),
                                      dtype=float,
                                      count=-1,
                                      sep=',')

                # Check presence of the attribute.
                if 'folds' not in config:
                    raise ConfigurationError(
                        "The sampler configuration section does not contain the key 'folds' "
                        "required by kFoldWeightedRandomSampler")

                # Create indices, depending on the fold.
                folds = config["folds"]
                if folds < 2:
                    raise ConfigurationError(
                        "kFoldRandomSampler requires  at least two 'folds'")
                # Get epochs per fold (default: 1).
                epochs_per_fold = config.get("epochs_per_fold", 1)

                # Create the sampler object.
                sampler = ptp_samplers.kFoldWeightedRandomSampler(
                    weights, len(task), folds, epochs_per_fold,
                    task_subset_name == 'training')

            elif typename in ['BatchSampler', 'DistributedSampler']:
                # Sorry, don't support those. Yet;)
                raise ConfigurationError(
                    "Sampler Factory currently does not support the '{}' sampler. Please pick one of the others "
                    "or use defaults random sampling".format(typename))
            else:
                # Verify that the specified class is in the samplers package.
                if typename not in dir(pt_samplers):
                    raise ConfigurationError(
                        "Could not find the specified class '{}' in the samplers package"
                        .format(typename))

                # Get the sampler class.
                sampler_class = getattr(pt_samplers, typename)
                # Create "regular" sampler.
                sampler = sampler_class(task)

            # Return sampler.
            return sampler

        except ConfigurationError as e:
            logger.error(e)
            # Do not continue with invalid sampler.
            exit(-1)
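As a usage sketch for the WeightedRandomSampler branch of this factory, weights stored as comma-separated text (the layout np.fromfile(..., sep=',') expects) could be produced and consumed like this; the file name and values are made up for illustration:

import os
import tempfile

import numpy as np
from torch.utils.data import WeightedRandomSampler

# Write hypothetical per-sample weights in the comma-separated layout the factory reads.
weights = np.array([0.1, 0.1, 0.1, 0.7])
path = os.path.join(tempfile.gettempdir(), "weights.csv")
weights.tofile(path, sep=",")

loaded = np.fromfile(path, dtype=float, count=-1, sep=",")
samp = WeightedRandomSampler(loaded, num_samples=len(loaded), replacement=True)
print(list(samp))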