Example #1
def kfold_generator(args, splits, dataset):
    import torch.utils.data as data
    from sklearn.model_selection import KFold
    splitter = KFold(n_splits=splits, shuffle=True)
    for fold_index, (train_subset,
                     test_subset) in enumerate(splitter.split(dataset)):
        train_sampler = data.SubsetRandomSampler(train_subset)
        test_sampler = data.SubsetRandomSampler(test_subset)

        train_loader = data.DataLoader(dataset,
                                       sampler=train_sampler,
                                       batch_size=args.batch_size,
                                       shuffle=False,
                                       num_workers=args.num_workers,
                                       collate_fn=None)
        test_loader = data.DataLoader(dataset,
                                      sampler=test_sampler,
                                      batch_size=args.batch_size,
                                      shuffle=False,
                                      num_workers=args.num_workers,
                                      collate_fn=None)

        yield (fold_index, train_loader, len(train_subset),
               test_loader, len(test_subset))
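A minimal usage sketch for the generator above. The TensorDataset and the argparse-style
args namespace are illustrative assumptions; kfold_generator only reads batch_size and
num_workers from args.

import argparse

import torch
from torch.utils.data import TensorDataset

args = argparse.Namespace(batch_size=16, num_workers=0)
# Synthetic data stands in for a real dataset.
dataset = TensorDataset(torch.randn(100, 8), torch.randint(0, 2, (100,)))

for fold, train_loader, n_train, test_loader, n_test in kfold_generator(args, 5, dataset):
    print("fold %d: %d train / %d test samples" % (fold, n_train, n_test))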
Example #2
    def __init__(self,
                 data_file,
                 batch_size,
                 test_split,
                 shuffle_dataset,
                 random_seed,
                 validation_split=0):

        # Load tensor data
        data = torch.load(data_file)
        dataset = IndexTensorDataset(data['X'], data['y'])

        # Test / train split
        dataset_size = len(dataset)
        indices = list(range(dataset_size))
        split = int(np.floor(test_split * dataset_size))
        if shuffle_dataset:
            np.random.seed(random_seed)
            np.random.shuffle(indices)
        train_indices, test_indices = indices[split:], indices[:split]

        # Initialize Dataloaders
        train_sampler = data_utils.SubsetRandomSampler(train_indices)
        test_sampler = data_utils.SubsetRandomSampler(test_indices)

        self.train_loader = data_utils.DataLoader(dataset,
                                                  batch_size=batch_size,
                                                  sampler=train_sampler)
        self.test_loader = data_utils.DataLoader(dataset,
                                                 batch_size=batch_size,
                                                 sampler=test_sampler)
        self.isolates = data['isolates']
Example #3
def execute_model(network_function, criterion, device, print_details=False):

    acc_list = []
    auc_list = []

    for _ in range(METRIC_COMPUTATION_ITER):
        network = network_function().to(device)
        optimiser = torch.optim.Adam(network.parameters(), lr=LEARNING_RATE)

        df = Dataset(PATH)

        end = len(df)
        indices = list(range(end))
        set_split = end - round(end * SET_RATIO)
        train_indices = indices[0:set_split]
        test_indices = indices[set_split:end]

        training_data = data.DataLoader(
            df,
            batch_size=TRAIN_BATCH_SIZE,
            sampler=data.SubsetRandomSampler(train_indices))
        test_data = data.DataLoader(
            df,
            batch_size=TEST_BATCH_SIZE,
            sampler=data.SubsetRandomSampler(test_indices))

        training_data_batches = len(training_data)
        test_data_batches = len(test_data)

        for epoch in range(EPOCH):
            running_loss = 0

            for i, batch in enumerate(training_data):
                inputs, labels = batch

                inputs, labels = inputs.to(device), labels.to(device)

                optimiser.zero_grad()

                outputs = network(inputs)

                loss = criterion(outputs, labels.type_as(outputs))
                loss.backward()

                optimiser.step()

                running_loss += loss.item()

                if print_details:
                    if i % training_data_batches == training_data_batches - 1:
                        print("Epoch : %2d, Loss : %.3f" %
                              (epoch + 1, running_loss))

        evaluate_model(network, training_data, 'training data', device)
        acc_tmp, auc_tmp = evaluate_model(network, test_data, 'test data',
                                          device)
        acc_list.append(acc_tmp)
        auc_list.append(auc_tmp)

    write_metrics(acc_list, auc_list)
Example #4
def load_torch_data(dataset: tdata.Dataset, ratio: float, bs: int):
    """Prepare data from torch dataset for training and validation.
    Args:
        dataset (torch.utils.data.Dataset): loaded dataset
        ratio (float): split ratio
        bs (int): batch size

    Returns:
        A tuple of training data loader, validation data loader and
            a tuple of size containing training dataset size and validation
            dataset size respectively
    """
    dataset_size = len(dataset)

    # prepare for shuffle
    indices = np.arange(dataset_size)
    np.random.shuffle(indices)
    split_idx = int(np.floor(ratio * dataset_size))
    train_indices, val_indices = indices[split_idx:], indices[:split_idx]

    # split dataset
    train_sampler = tdata.SubsetRandomSampler(train_indices)
    val_sampler = tdata.SubsetRandomSampler(val_indices)
    train_loader = tdata.DataLoader(dataset, batch_size=bs, sampler=train_sampler)
    val_loader = tdata.DataLoader(dataset, batch_size=bs, sampler=val_sampler)

    return train_loader, val_loader, (len(train_indices), len(val_indices))
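A usage sketch for load_torch_data, assuming tdata is torch.utils.data (as the type hints
above suggest) and using a synthetic TensorDataset; with ratio=0.2 the function keeps 80%
of the samples for training.

import torch
import torch.utils.data as tdata

dataset = tdata.TensorDataset(torch.randn(50, 4), torch.randint(0, 2, (50,)))
train_loader, val_loader, (n_train, n_val) = load_torch_data(dataset, ratio=0.2, bs=8)
print(n_train, n_val)  # 40 10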
Example #5
def split_dataset(dataset, batch_size):
    data_size = len(dataset)
    validation_split = .2
    shuffle = True
    random_seed = 42

    indices = list(range(data_size))
    #print(validation_split * data_size)
    split = int(np.floor(validation_split * data_size))
    if shuffle:
        np.random.seed(random_seed)
        np.random.shuffle(indices)
    train_indices, val_indices = indices[split:], indices[:split]

    train_sample = data.SubsetRandomSampler(train_indices)
    validation_sample = data.SubsetRandomSampler(val_indices)

    train_loader = data.DataLoader(dataset,
                                   batch_size=batch_size,
                                   sampler=train_sample,
                                   num_workers=8)
    val_loader = data.DataLoader(dataset,
                                 batch_size=batch_size,
                                 sampler=validation_sample,
                                 num_workers=8)

    return train_loader, val_loader
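The index-plus-SubsetRandomSampler pattern used above can also be expressed with
torch.utils.data.random_split, which returns Subset objects instead of samplers. A sketch
of that alternative (not part of the original snippet; the generator argument of
random_split requires a reasonably recent PyTorch):

import torch
from torch.utils import data

def split_dataset_alt(dataset, batch_size, validation_split=0.2, seed=42):
    # random_split shuffles internally via the supplied generator, so no manual
    # index shuffling or SubsetRandomSampler is needed.
    n_val = int(len(dataset) * validation_split)
    n_train = len(dataset) - n_val
    train_set, val_set = data.random_split(
        dataset, [n_train, n_val], generator=torch.Generator().manual_seed(seed))
    train_loader = data.DataLoader(train_set, batch_size=batch_size, shuffle=True)
    val_loader = data.DataLoader(val_set, batch_size=batch_size, shuffle=False)
    return train_loader, val_loader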
Example #6
 def test_split(self):
     t0 = time()
     train_ids, test_ids = self.dataset.get_train_test_split()
     self.assertEqual(
         train_ids.size(0) + test_ids.size(0), len(self.dataset))
     self.dataset.transform = None
     train_loader = thd.DataLoader(
         self.dataset,
         batch_size=1,
         sampler=thd.SubsetRandomSampler(train_ids))
     test_loader = thd.DataLoader(self.dataset,
                                  batch_size=1,
                                  sampler=thd.SubsetRandomSampler(test_ids))
     loader_train_ids, _ = torch.tensor(
         [batch[2][0] for batch in train_loader]).sort()
     loader_test_ids, _ = torch.tensor(
         [batch[2][0] for batch in test_loader]).sort()
     self.assertEqual(
         loader_train_ids.eq(train_ids.sort()[0]).sum(), train_ids.size(0))
     self.assertEqual(
         loader_test_ids.eq(test_ids.sort()[0]).sum(), test_ids.size(0))
     # Check that test IDs do not leak with training
     self.assertEqual(
         np.intersect1d(loader_train_ids.numpy(),
                        test_ids.numpy()).shape[0], 0)
     self.assertEqual(
         np.intersect1d(loader_train_ids.numpy(),
                        loader_test_ids.numpy()).shape[0], 0)
     self.assertEqual(
         np.intersect1d(loader_test_ids.numpy(),
                        train_ids.numpy()).shape[0], 0)
     print("Split: %.2fs" % (time() - t0))
Example #7
 def train_test_split(self, test_size=0.3, shuffle=True, random_state=None):
     """returns indices to split train/test"""
     d_i = np.arange(self.n)
     train_i, test_i = train_test_split(d_i, test_size=test_size, shuffle=shuffle,
                                        stratify=self.target, random_state=random_state)
     train_s = tud.SubsetRandomSampler(train_i)
     test_s = tud.SubsetRandomSampler(test_i)
     return train_s, test_s
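The method above relies on self.n and self.target, which are not shown in this snippet. A
self-contained sketch of the same idea, using sklearn's train_test_split for a stratified
split and feeding the resulting indices to SubsetRandomSampler (synthetic data, hypothetical
names):

import numpy as np
import torch
import torch.utils.data as tud
from sklearn.model_selection import train_test_split

# `targets` plays the role of self.target above.
features = torch.randn(60, 3)
targets = torch.randint(0, 2, (60,))
dataset = tud.TensorDataset(features, targets)

train_i, test_i = train_test_split(np.arange(len(dataset)), test_size=0.3, shuffle=True,
                                   stratify=targets.numpy(), random_state=0)
train_loader = tud.DataLoader(dataset, batch_size=8, sampler=tud.SubsetRandomSampler(train_i))
test_loader = tud.DataLoader(dataset, batch_size=8, sampler=tud.SubsetRandomSampler(test_i))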
Example #8
def get_loader(root_folder,
               batch_size=16,
               shuffle=False,
               num_workers=0,
               pin_memory=False):
    """
    Returns a data loader for the caltech 101 dataset
    """
    cal101_dset = get_dataset(root_folder)

    # train test split
    split_ratio = 0.2
    dataset_size = len(cal101_dset)
    indices = np.arange(dataset_size)
    np.random.shuffle(indices)
    split = int(np.floor(split_ratio * dataset_size))
    train_indices, val_indices = indices[split:], indices[:split]

    train_sampler = data.SubsetRandomSampler(train_indices)
    valid_sampler = data.SubsetRandomSampler(val_indices)

    train_loader = data.DataLoader(cal101_dset,
                                   batch_size=batch_size,
                                   shuffle=shuffle,
                                   num_workers=num_workers,
                                   sampler=train_sampler,
                                   pin_memory=pin_memory)
    validation_loader = data.DataLoader(cal101_dset,
                                        batch_size=batch_size,
                                        shuffle=shuffle,
                                        num_workers=num_workers,
                                        sampler=valid_sampler,
                                        pin_memory=pin_memory)

    return train_loader, validation_loader
Example #9
def mean_teacher(
        dataset_root,
        supervised_ratio: float = 0.1,
        batch_size: int = 64,
        train_folds: tuple = (1, 2, 3, 4, 5, 6, 7, 8, 9),
        val_folds: tuple = (10, ),
        verbose=1,
        **kwargs,
):
    """
    Load the UrbanSound dataset for the student teacher framework.
    """
    assert supervised_ratio <= 1.0
    audio_root = os.path.join(dataset_root, "UrbanSound8K", "audio")
    metadata_root = os.path.join(dataset_root, "UrbanSound8K", "metadata")

    all_folds = train_folds + val_folds

    # Create the dataset manager
    manager = DatasetManager(metadata_root,
                             audio_root,
                             folds=all_folds,
                             verbose=verbose)

    # validation subset
    val_dataset = Dataset(manager, folds=val_folds, cached=True)
    val_loader = torch_data.DataLoader(val_dataset,
                                       batch_size=batch_size,
                                       shuffle=True)

    # training subset
    train_dataset = Dataset(manager, folds=train_folds, cached=True)

    # Calc the size of the Supervised and Unsupervised batch
    s_idx, u_idx = train_dataset.split_s_u(supervised_ratio)
    nb_s_file = len(s_idx)
    nb_u_file = len(u_idx)

    s_batch_size = int(np.floor(batch_size * supervised_ratio))
    u_batch_size = int(np.ceil(batch_size * (1 - supervised_ratio)))

    print("s_batch_size: ", s_batch_size)
    print("u_batch_size: ", u_batch_size)

    sampler_s = torch_data.SubsetRandomSampler(s_idx)
    sampler_u = torch_data.SubsetRandomSampler(u_idx)

    train_s_loader = torch_data.DataLoader(train_dataset,
                                           batch_size=s_batch_size,
                                           sampler=sampler_s)
    train_u_loader = torch_data.DataLoader(train_dataset,
                                           batch_size=u_batch_size,
                                           sampler=sampler_u)

    train_loader = ZipCycle([train_s_loader, train_u_loader])

    return manager, train_loader, val_loader
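ZipCycle and DatasetManager are project-specific helpers that are not defined in these
examples. As a rough sketch of the behaviour assumed here (a hypothetical stand-in, not the
original implementation), a ZipCycle-like wrapper yields one batch from each wrapped loader
per step, cycling the shorter loaders so the supervised and unsupervised streams stay paired:

import itertools

class ZipCycleSketch:
    def __init__(self, loaders):
        self.loaders = loaders

    def __len__(self):
        # One "epoch" is driven by the longest of the wrapped loaders.
        return max(len(loader) for loader in self.loaders)

    def __iter__(self):
        cycled = [itertools.cycle(loader) for loader in self.loaders]
        for _ in range(len(self)):
            yield [next(it) for it in cycled]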
Example #10
def dct(dataset_root,
        supervised_ratio: float = 0.1,
        batch_size: int = 100,
        train_folds: tuple = (1, 2, 3, 4, 5, 6, 7, 8, 9),
        val_folds: tuple = (10, ),
        verbose=1,
        **kwargs):
    """
    Load the urbansound dataset for Deep Co Training system.
    """
    audio_root = os.path.join(dataset_root, "UrbanSound8K", "audio")
    metadata_root = os.path.join(dataset_root, "UrbanSound8K", "metadata")

    all_folds = train_folds + val_folds

    # Create the dataset manager
    manager = DatasetManager(metadata_root,
                             audio_root,
                             folds=all_folds,
                             verbose=verbose)

    # prepare the default dataset
    train_dataset = Dataset(manager, folds=train_folds, cached=True)
    val_dataset = Dataset(manager, folds=val_folds, cached=True)

    # split the training set into a supervised and unsupervised sets
    s_idx, u_idx = train_dataset.split_s_u(supervised_ratio)

    # Calc the size of the Supervised and Unsupervised batch
    nb_s_file = len(s_idx)
    nb_u_file = len(u_idx)

    s_batch_size = int(np.floor(batch_size * supervised_ratio))
    u_batch_size = int(np.ceil(batch_size * (1 - supervised_ratio)))

    # create the sampler, the loader and "zip" them
    sampler_s1 = torch_data.SubsetRandomSampler(s_idx)
    sampler_s2 = torch_data.SubsetRandomSampler(s_idx)
    sampler_u = torch_data.SubsetRandomSampler(u_idx)

    train_loader_s1 = torch_data.DataLoader(train_dataset,
                                            batch_size=s_batch_size,
                                            sampler=sampler_s1)
    train_loader_s2 = torch_data.DataLoader(train_dataset,
                                            batch_size=s_batch_size,
                                            sampler=sampler_s2)
    train_loader_u = torch_data.DataLoader(train_dataset,
                                           batch_size=u_batch_size,
                                           sampler=sampler_u)

    train_loader = ZipCycle([train_loader_s1, train_loader_s2, train_loader_u])
    val_loader = torch_data.DataLoader(val_dataset,
                                       batch_size=batch_size,
                                       shuffle=True)

    return manager, train_loader, val_loader
Example #11
    def mean_teacher(dataset_root,
                     supervised_ratio: float = 0.1,
                     batch_size: int = 128,
                     train_folds: tuple = (1, 2, 3, 4),
                     val_folds: tuple = (5, ),
                     train_transform: Module = None,
                     val_transform: Module = None,
                     **kwargs) -> Tuple[None, DataLoader, DataLoader]:
        """
        Load the cifar10 dataset for the mean teacher framework.
        """
        # Recover extra common arguments
        num_workers = kwargs.get("num_workers", 0)
        pin_memory = kwargs.get("pin_memory", False)
        loader_args = dict(
            num_workers=num_workers,
            pin_memory=pin_memory,
        )

        dataset_path = os.path.join(dataset_root)

        # validation subset
        val_dataset = cls(root=dataset_path,
                          folds=val_folds,
                          download=True,
                          transform=val_transform)
        val_loader = torch_data.DataLoader(val_dataset,
                                           batch_size=batch_size,
                                           shuffle=True,
                                           **loader_args)

        # Training subset
        train_dataset = cls(root=dataset_path,
                            folds=train_folds,
                            download=True,
                            transform=train_transform)
        s_idx, u_idx = _split_s_u(train_dataset,
                                  supervised_ratio,
                                  nb_class=train_dataset.nb_class)

        s_batch_size = int(np.floor(batch_size * supervised_ratio))
        u_batch_size = int(np.ceil(batch_size * (1 - supervised_ratio)))

        sampler_s = torch_data.SubsetRandomSampler(s_idx)
        sampler_u = torch_data.SubsetRandomSampler(u_idx)

        train_s_loader = torch_data.DataLoader(train_dataset,
                                               batch_size=s_batch_size,
                                               sampler=sampler_s)
        train_u_loader = torch_data.DataLoader(train_dataset,
                                               batch_size=u_batch_size,
                                               sampler=sampler_u)

        train_loader = ZipCycle([train_s_loader, train_u_loader])

        return None, train_loader, val_loader
Example #12
def load_cifar10(root_dir=None,
                 batch_size=20,
                 shuffle=True,
                 transform=None,
                 download=True):
    dataset_type = "continuous"

    if root_dir is None:
        root_dir = pathlib.Path(sys.argv[0]).parents[0] / 'datasets'
        root_dir = str(root_dir)

    if transform is None:
        transform = transforms.ToTensor()

    train_dataset = datasets.CIFAR10(root_dir,
                                     transform=transform,
                                     download=download)

    size_train = len(train_dataset)
    indices = list(range(size_train))
    split = int(np.floor(0.2 * size_train))

    if split % batch_size != 0:
        raise ValueError(
            f'The batch size: {batch_size} does not divide the size of '
            f'the train_dataset: {size_train - split} or the size of the validation_dataset: {split}'
        )

    if shuffle:
        np.random.shuffle(indices)

    train_idx, valid_idx = indices[split:], indices[:split]
    train_sampler = data_utils.SubsetRandomSampler(train_idx)
    valid_sampler = data_utils.SubsetRandomSampler(valid_idx)

    # shuffle is not passed here: a DataLoader sampler is mutually exclusive with
    # shuffle=True, and the SubsetRandomSamplers already randomise the order.
    train_loader = torch.utils.data.DataLoader(train_dataset,
                                               batch_size=batch_size,
                                               sampler=train_sampler)

    valid_loader = torch.utils.data.DataLoader(train_dataset,
                                               batch_size=batch_size,
                                               sampler=valid_sampler)

    test_loader = torch.utils.data.DataLoader(datasets.MNIST(
        root_dir,
        train=False,
        transform=transforms.Compose([transforms.ToTensor(), transform])),
                                              batch_size=batch_size,
                                              shuffle=shuffle)

    return train_loader, test_loader, valid_loader, dataset_type
Example #13
    def get_preprocessed_data(self):
        dataset = self.get_dataset()
        train_indices, val_indices = self.get_split(dataset)
        train_sampler = data.SubsetRandomSampler(train_indices)
        valid_sampler = data.SubsetRandomSampler(val_indices)

        train_loader = data.DataLoader(dataset,
                                       batch_size=batch_size,
                                       sampler=train_sampler)
        validation_loader = data.DataLoader(dataset,
                                            batch_size=batch_size,
                                            sampler=valid_sampler)
        return train_loader, validation_loader
Example #14
def loadDataForLocal(want_to_test):

    training_dataset = NSynth(
        "./nsynth-test",
        transform=toFloat,
        blacklist_pattern=["synth_lead"],  # blacklist string instrument
        categorical_field_list=["instrument_family", "instrument_source"])

    # Splitting training dataset into training and validation and testing
    num_train = len(training_dataset)
    indices = list(range(num_train))
    splitVal = int(np.floor(VALIDATION_SPLIT * num_train))
    splitTest = int(np.floor(TESTING_SPLIT * num_train)) + splitVal

    # Make sure you get same numbers every time when rand_seed = 0
    np.random.seed(seed=RAND_SEED)

    # Shuffle the indices
    np.random.shuffle(indices)

    # Get training set index and validation set index
    validation_idx, test_idx, train_idx = indices[:splitVal], \
                                          indices[splitVal:splitTest], \
                                          indices[splitTest:]

    # create samplers
    train_sampler = data_utils.SubsetRandomSampler(train_idx)
    test_sampler = data_utils.SubsetRandomSampler(test_idx)
    validation_sampler = data_utils.SubsetRandomSampler(validation_idx)

    # create dataLoaders
    train_loader = torch.utils.data.DataLoader(dataset=training_dataset,
                                               batch_size=BATCH_SIZE,
                                               sampler=train_sampler)

    validation_loader = torch.utils.data.DataLoader(dataset=training_dataset,
                                                    batch_size=1,
                                                    sampler=validation_sampler)

    test_loader = torch.utils.data.DataLoader(dataset=training_dataset,
                                              batch_size=1,
                                              sampler=test_sampler)

    if want_to_test == '1':
        test_loader = torch.utils.data.DataLoader(dataset=training_dataset,
                                                  batch_size=1)

    print('Finished preparing data loaders for local testing')

    return train_loader, validation_loader, test_loader
Example #15
def split_data(data, validation_split=0.1, batch_size=100):
    dataset_size = len(data)
    indices = list(range(dataset_size))
    split = int(np.floor(validation_split * dataset_size))
    random_seed = 37
    np.random.seed(random_seed)
    np.random.shuffle(indices)
    train_indices, val_indices = indices[split:], indices[:split]

    train_loader = utils.DataLoader(data, batch_size=batch_size,
                                    sampler=utils.SubsetRandomSampler(train_indices))
    validation_loader = utils.DataLoader(data, batch_size=1,
                                         sampler=utils.SubsetRandomSampler(val_indices))
    return train_loader, validation_loader
Example #16
def get_loaders(dataset, config, logger, n=1):
    if n == 2:
        t0 = time.time()
        train_indices = dataset.get_train_indices(p=config.PAIR_SPLIT_P)
        logger.info("Indices created at %.2fs" % (time.time() - t0))
        loaders = {
            "train":
            thd.DataLoader(d.PairExtension(dataset),
                           batch_size=config.BATCH_SIZE,
                           sampler=thd.SubsetRandomSampler(train_indices),
                           collate_fn=d.pair_collate,
                           num_workers=config.NW),
        }
    elif n == 3:
        train_indices, test_indices = dataset.get_train_test_split()
        if dataset.transform is None:
            collate_fn = d.triple_collate
        else:
            collate_fn = d.triple_collate_pil
        loaders = {
            "train":
            thd.DataLoader(d.TripletExtension(dataset),
                           batch_size=config.BATCH_SIZE,
                           sampler=thd.SubsetRandomSampler(train_indices),
                           collate_fn=collate_fn,
                           num_workers=config.NW),
        }
    else:
        train_indices, test_indices = dataset.get_train_test_split()
        if dataset.transform is None:
            collate_fn = d.fast_collate
        else:
            collate_fn = d.fast_collate_pil
        loaders = {
            "train":
            thd.DataLoader(dataset,
                           batch_size=config.EVAL_BATCH_SIZE,
                           sampler=thd.SubsetRandomSampler(train_indices),
                           collate_fn=collate_fn,
                           num_workers=config.NW),
            "test":
            thd.DataLoader(dataset,
                           batch_size=config.EVAL_BATCH_SIZE,
                           sampler=thd.SubsetRandomSampler(test_indices),
                           collate_fn=collate_fn,
                           num_workers=config.NW)
        }
    return loaders
Example #17
def create_dataloader_train_cv(
    kaldi_string,
    caption_json_path,
    vocab_path,
    transform=None,
    shuffle=True,
    batch_size: int = 16,
    num_workers=1,
    percent=90,
):
    dataset = SJTUDataLoader(kaldi_string=kaldi_string,
                             caption_json_path=caption_json_path,
                             vocab_path=vocab_path,
                             transform=transform)
    all_indices = torch.arange(len(dataset))
    num_train_indices = int(len(all_indices) * percent / 100)
    train_indices = all_indices[:num_train_indices]
    cv_indices = all_indices[num_train_indices:]
    trainsampler = data.SubsetRandomSampler(train_indices)
    # Do not shuffle
    cvsampler = SubsetSampler(cv_indices)

    train_loader = data.DataLoader(dataset,
                                   batch_size=batch_size,
                                   num_workers=num_workers,
                                   collate_fn=collate_fn,
                                   sampler=trainsampler)
    cv_loader = data.DataLoader(dataset,
                                batch_size=batch_size,
                                num_workers=num_workers,
                                collate_fn=collate_fn,
                                sampler=cvsampler)
    return train_loader, cv_loader
Example #18
def get_data_and_train_model(file_path, network, criterion, optimiser, device,
                             print_details):
    df = Dataset(file_path)

    end = len(df)
    indices = list(range(end))
    set_split = end - round(end * SET_RATIO)
    train_indices = indices[0:set_split]
    test_indices = indices[set_split:end]

    training_data = data.DataLoader(
        df,
        batch_size=TRAIN_BATCH_SIZE,
        sampler=data.SubsetRandomSampler(train_indices))
    test_data = data.DataLoader(df,
                                batch_size=TEST_BATCH_SIZE,
                                sampler=data.SubsetRandomSampler(test_indices))

    training_data_batches = len(training_data)
    test_data_batches = len(test_data)

    for epoch in range(EPOCH):
        running_loss = 0

        for i, batch in enumerate(training_data):
            inputs, labels = batch

            inputs, labels = inputs.to(device), labels.to(device)

            optimiser.zero_grad()

            outputs = network(inputs)

            loss = criterion(outputs, labels.type_as(outputs))
            loss.backward()

            optimiser.step()

            running_loss += loss.item()

            if print_details:
                if i % training_data_batches == training_data_batches - 1:
                    print("Epoch : %2d, Loss : %.3f" %
                          (epoch + 1, running_loss))

    return training_data, test_data
Example #19
    def set_training_data(self, job, train_inds, test_inds, labels, data):
        """Construct generators out of the dataset for training, validation,
        and expectation maximization.

        Parameters
        ----------
        job : dict
            See training_dict.tx for all keys.
        train_inds : np.ndarray
            Indices in data that are to be trained on
        test_inds : np.ndarray
            Indices in data that are to be validated on
        labels : np.ndarray,
            classification labels used for training
        data : np.ndarray, shape=(n_frames,3*n_atoms) OR str to path
            All data
        """

        batch_size = job['batch_size']
        cpu_cores = job['em_n_cores']
        test_batch_size = job['test_batch_size']
        em_batch_size = job['em_batch_size']
        subsample = job['subsample']
        data_dir = job["data_dir"]

        n_train_inds = len(train_inds)
        random_inds = np.random.choice(np.arange(n_train_inds),
                                       int(n_train_inds / subsample),
                                       replace=False)
        sampler = torch_data.SubsetRandomSampler(random_inds)

        params_t = {'batch_size': batch_size,
                    'shuffle': False,
                    'num_workers': cpu_cores,
                    'sampler': sampler}

        params_v = {'batch_size': test_batch_size,
                    'shuffle': True,
                    'num_workers': cpu_cores}

        params_e = {'batch_size': em_batch_size,
                    'shuffle': True,
                    'num_workers': cpu_cores}

        n_snapshots = len(train_inds) + len(test_inds)

        training_set = Dataset(train_inds, labels, data)
        training_generator = torch_data.DataLoader(training_set, **params_t)

        validation_set = Dataset(test_inds, labels, data)
        validation_generator = torch_data.DataLoader(validation_set, **params_v)

        em_set = Dataset(train_inds, labels, data)
        em_generator = torch_data.DataLoader(em_set, **params_e)

        return training_generator, validation_generator, em_generator
Example #20
def load_bedrooms(root_dir=None, batch_size=20, shuffle=True, transform=None):
    if root_dir is None:
        root_dir = pathlib.Path(sys.argv[0]).parents[0] / 'datasets'
        root_dir = str(root_dir)

    if transform is None:
        transform = transforms.ToTensor()

    train_dataset = datasets.LSUN(root_dir,
                                  classes=['classroom_train'],
                                  transform=transform)

    size_train = len(train_dataset)
    indices = list(range(size_train))
    split = int(np.floor(0.2 * size_train))

    if shuffle:
        np.random.shuffle(indices)

    train_idx, valid_idx = indices[split:], indices[:split]
    train_sampler = data_utils.SubsetRandomSampler(train_idx)
    valid_sampler = data_utils.SubsetRandomSampler(valid_idx)

    # shuffle is not passed here; it is mutually exclusive with a sampler, and the
    # SubsetRandomSamplers already randomise the order.
    train_loader = torch.utils.data.DataLoader(train_dataset,
                                               batch_size=batch_size,
                                               sampler=train_sampler)

    valid_loader = torch.utils.data.DataLoader(train_dataset,
                                               batch_size=batch_size,
                                               sampler=valid_sampler)

    test_loader = torch.utils.data.DataLoader(datasets.MNIST(
        root_dir,
        train=False,
        transform=transforms.Compose([transforms.ToTensor(), transform])),
                                              batch_size=batch_size,
                                              shuffle=shuffle)

    return train_loader, test_loader, valid_loader
Example #21
def load_CelebA(root_dir=None, batch_size=50, shuffle=True):
    dataset_type = "continuous"
    drop_to_make_batch_size_work = 99

    if root_dir is None:
        root_dir = pathlib.Path(sys.argv[0]).parents[0] / 'datasets'

    dataset = np.load(root_dir / 'CelebA' / 'celebA.npy')

    idx = np.arange(len(dataset))
    if shuffle:
        np.random.shuffle(idx)
    idx = idx[:-drop_to_make_batch_size_work]

    train_size = int(len(idx) * 0.1)
    idx_train, idx_test = idx[:train_size], idx[train_size:]

    val_size = int(len(idx_train) * 0.1)
    idx_train, idx_valid = idx_train[:-val_size], idx_train[-val_size:]

    sampler_train = data_utils.SubsetRandomSampler(idx_train)
    sampler_test = data_utils.SubsetRandomSampler(idx_test)
    sampler_valid = data_utils.SubsetRandomSampler(idx_valid)

    # shuffle is not passed to these loaders; it is mutually exclusive with a sampler.
    loader_train = torch.utils.data.DataLoader(dataset,
                                               batch_size=batch_size,
                                               sampler=sampler_train)

    loader_valid = torch.utils.data.DataLoader(dataset,
                                               batch_size=batch_size,
                                               sampler=sampler_valid)

    loader_test = torch.utils.data.DataLoader(dataset,
                                              batch_size=batch_size,
                                              sampler=sampler_test)

    return loader_train, loader_valid, loader_test, dataset_type
Example #22
def mean_teacher(
        dataset_root,
        supervised_ratio: float = 0.1,
        batch_size: int = 128,

        train_transform: Module = None,
        val_transform: Module = None,

        **kwargs) -> Tuple[None, DataLoader, DataLoader]:
    """
    Load the SpeechCommand for a student teacher learning
    """
    loader_args = dict(
        num_workers=kwargs.get("num_workers", 0),
        pin_memory=kwargs.get("pin_memory", False),
    )
    dataset_path = os.path.join(dataset_root)

    # validation subset
    val_dataset = SpeechCommands(root=dataset_path, subset="validation", transform=val_transform, download=True)
    val_loader = DataLoader(val_dataset, batch_size=batch_size, shuffle=True, **loader_args)

    # Training subset
    train_dataset = SpeechCommands(root=dataset_path, subset="train", transform=train_transform, download=True)
    s_idx, u_idx = _split_s_u(train_dataset, supervised_ratio)
    nb_s_file = len(s_idx)
    nb_u_file = len(u_idx)

    s_batch_size = int(np.floor(batch_size * supervised_ratio))
    u_batch_size = int(np.ceil(batch_size * (1 - supervised_ratio)))

    sampler_s = torch_data.SubsetRandomSampler(s_idx)
    sampler_u = torch_data.SubsetRandomSampler(u_idx)

    train_s_loader = torch_data.DataLoader(train_dataset, batch_size=s_batch_size, sampler=sampler_s)
    train_u_loader = torch_data.DataLoader(train_dataset, batch_size=u_batch_size, sampler=sampler_u)

    train_loader = ZipCycle([train_s_loader, train_u_loader])

    return None, train_loader, val_loader
Example #23
def create_data_loader(config, idxs, shuffle=True):
    if shuffle:
        sampler = TD.SubsetRandomSampler(idxs)
        dataset = config.dataset
    else:
        sampler = TD.SequentialSampler(idxs)
        dataset = TD.Subset(config.dataset, idxs)
    return TD.DataLoader(
        dataset,
        batch_size=config.batch_size,
        sampler=sampler,
        pin_memory=True, num_workers=config.data_loader_num_workers
    )
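A usage sketch for create_data_loader; the SimpleNamespace config and the toy dataset are
assumptions standing in for the project's real config object:

from types import SimpleNamespace

import torch
import torch.utils.data as TD

config = SimpleNamespace(
    dataset=TD.TensorDataset(torch.randn(30, 5), torch.arange(30)),
    batch_size=4,
    data_loader_num_workers=0,
)
train_loader = create_data_loader(config, idxs=list(range(20)), shuffle=True)
eval_loader = create_data_loader(config, idxs=list(range(20, 30)), shuffle=False)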
Example #24
def student_teacher(
        dataset_root,
        supervised_ratio: float = 1.0,
        batch_size: int = 128,
        train_transform: list = [],
        val_transform: list = [],
        **kwargs
):
    """
    Load the cifar10 dataset for student teacher framework.
    """
    # Prepare the default dataset
    train_dataset = torchvision.datasets.CIFAR10(root=os.path.join(dataset_root, "CIFAR10"), train=True, download=True, transform=train_transform)
    val_dataset = torchvision.datasets.CIFAR10(root=os.path.join(dataset_root, "CIFAR10"), train=False, download=True, transform=val_transform)

    # Split the training dataset into a supervised and unsupervised sets
    s_idx, u_idx = _split_s_u(train_dataset, supervised_ratio)

    # Calc the size of the supervised and unsupervised batch
    nb_s_file = len(s_idx)
    nb_u_file = len(u_idx)

    ratio = nb_s_file / nb_u_file
    s_batch_size = int(np.floor(batch_size * ratio))
    u_batch_size = int(np.ceil(batch_size * (1 - ratio)))

    # Create the sample, the loader and zip them
    sampler_s = torch_data.SubsetRandomSampler(s_idx)
    sampler_u = torch_data.SubsetRandomSampler(u_idx)

    train_loader_s = torch_data.DataLoader(train_dataset, batch_size=s_batch_size, sampler=sampler_s)
    train_loader_u = torch_data.DataLoader(train_dataset, batch_size=u_batch_size, sampler=sampler_u)

    train_loader = ZipCycle([train_loader_s, train_loader_u])
    val_loader = torch_data.DataLoader(val_dataset, batch_size=batch_size, shuffle=True)

    return None, train_loader, val_loader
Example #25
def cross_validation_split(data_set, sample_size, val_split, batch_size, shuffle=True):
    random_seed = 42

    indices = list(range(sample_size))
    split = int(np.floor(val_split * sample_size))

    if shuffle:
        np.random.seed(random_seed)
        np.random.shuffle(indices)

    train_indices, valid_indices = indices[split:], indices[:split]

    train_sampler = data.SubsetRandomSampler(train_indices)
    valid_sampler = data.SubsetRandomSampler(valid_indices)

    t_loader = data.DataLoader(data_set,
                               batch_size=batch_size,
                               sampler=train_sampler)
    v_loader = data.DataLoader(data_set,
                               batch_size=batch_size,
                               sampler=valid_sampler)

    return t_loader, v_loader
Example #26
def load_data(cifar=True):
    """
    load data and dataloader for pytorch
    SVHN or CIFAR
    """
    transform_test = transforms.Compose([transforms.ToTensor()])
    if cifar:
        testset = dset.CIFAR10(root='./data/cifar-10-batches-py/', train=False,
                               download=True, transform=transform_test)
    else:
        testset = dset.SVHN(root='../data/', split='test',
                            download=True, transform=transform_test)
    
    testloader = data.DataLoader(testset, batch_size=64, shuffle=False, num_workers=2,
                            sampler=data.SubsetRandomSampler(range(8500, 10000)))
    return testloader, testset
Example #27
def supervised(
        dataset_root,
        supervised_ratio: float = 1.0,
        batch_size: int = 64,
        train_folds: tuple = (1, 2, 3, 4, 5, 6, 7, 8, 9),
        val_folds: tuple = (10, ),
        verbose=1,
        **kwargs,
):
    """
    Load the UrbanSound dataset for supervised systems.
    """
    audio_root = os.path.join(dataset_root, "UrbanSound8K", "audio")
    metadata_root = os.path.join(dataset_root, "UrbanSound8K", "metadata")

    all_folds = train_folds + val_folds

    # Create the dataset manager
    manager = DatasetManager(metadata_root,
                             audio_root,
                             folds=all_folds,
                             verbose=verbose)

    # validation subset
    val_dataset = Dataset(manager, folds=val_folds, cached=True)
    val_loader = torch_data.DataLoader(val_dataset,
                                       batch_size=batch_size,
                                       shuffle=True)

    # training subset
    train_dataset = Dataset(manager, folds=train_folds, cached=True)

    if supervised_ratio == 1.0:
        train_loader = torch_data.DataLoader(train_dataset,
                                             batch_size=batch_size,
                                             shuffle=True)

    else:
        s_idx, u_idx = train_dataset.split_s_u(supervised_ratio)

        # Train loader only use the s_idx
        sampler_s = torch_data.SubsetRandomSampler(s_idx)
        train_loader = torch_data.DataLoader(train_dataset,
                                             batch_size=batch_size,
                                             sampler=sampler_s)

    return manager, train_loader, val_loader
Example #28
    def _split_sampler(self, split):
        if split == 0.0:
            return None, None

        idx_full = np.arange(self.n_samples)

        np.random.seed(self.seed)
        np.random.shuffle(idx_full)

        if isinstance(split, int):
            assert split > 0
            assert split < self.n_samples, "validation set size is configured to be larger than entire dataset."
            len_valid = split
        else:
            len_valid = int(self.n_samples * split)

        ##################
        # If a test file (valtest) has been specified, use it as the validation set;
        # otherwise split the validation set off from the training set.
        if self.val_file:
            valid_idx = self.valid_idx
            train_idx = np.array([idx for idx in idx_full if idx not in valid_idx])
        else:
            valid_idx = idx_full[0:len_valid]
            train_idx = np.delete(idx_full, np.arange(0, len_valid))

        #######################
        weights_per_class = 1. / torch.tensor(self.emotion_nums, dtype=torch.float)
        weights = [0] * self.n_samples
        for idx in range(self.n_samples):
            if idx in valid_idx:
                weights[idx] = 0.
            else:
                label = self.dataset[idx][0]
                weights[idx] = weights_per_class[label]
        weights = torch.tensor(weights)
        train_sampler = data.WeightedRandomSampler(weights=weights, num_samples=len(weights), replacement=True)
    
        valid_sampler = data.SubsetRandomSampler(valid_idx)

        # turn off shuffle option which is mutually exclusive with sampler
        self.shuffle = False
        self.n_samples = len(train_idx)

        return train_sampler, valid_sampler
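The method above combines a WeightedRandomSampler (class-balanced sampling on the training
side, with validation indices weighted to zero) with a SubsetRandomSampler for the
validation split. A self-contained sketch of that combination on synthetic labels, with
hypothetical names:

import numpy as np
import torch
import torch.utils.data as data

labels = torch.tensor([0] * 70 + [1] * 20 + [2] * 10)  # deliberately imbalanced classes
dataset = data.TensorDataset(torch.randn(100, 4), labels)

idx_full = np.random.permutation(len(dataset))
valid_idx, train_idx = idx_full[:20], idx_full[20:]

# Inverse-frequency weight per sample; validation samples get weight 0 so the
# weighted sampler never draws them.
class_counts = torch.bincount(labels, minlength=3).float()
weights = (1.0 / class_counts)[labels]
weights[torch.as_tensor(valid_idx)] = 0.0

train_sampler = data.WeightedRandomSampler(weights, num_samples=len(train_idx), replacement=True)
valid_sampler = data.SubsetRandomSampler(valid_idx)

train_loader = data.DataLoader(dataset, batch_size=16, sampler=train_sampler)
valid_loader = data.DataLoader(dataset, batch_size=16, sampler=valid_sampler)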
Example #29
    def setup_data(self):
        # Initialize trainset
        dataroot = self.opts.dataroot
        _trainset = dset.Dataset(dataroot, 'train', pairs='annotated')
        self.trainloader = dat.DataLoader(_trainset,
                                          batch_size=self.opts.batch_size,
                                          shuffle=True,
                                          num_workers=4)

        # Use subset of train data
        if self.opts.train_size:  # if --N: override the __len__ method of the dataset so that only the first N items will be used

            def train_size(unused):
                return self.opts.train_size

            _trainset.__class__.__len__ = train_size

        # Initialize testset
        if self.opts.do_validation:  # Default: True
            _testset = dset.Dataset(dataroot, 'test', pairs='annotated')
            if self.opts.split_zeroshot:  # Split testset into seen and zeroshot sets
                test_sets = zeroshot.Splitter(_trainset, _testset).split()
                self.testloaders = [
                    dat.DataLoader(data,
                                   batch_size=len(data),
                                   num_workers=NUM_WORKERS)
                    for data in test_sets
                ]
            else:  # Use a single (unified) testset
                testdata = dat.DataLoader(_testset,
                                          batch_size=len(_testset),
                                          num_workers=NUM_WORKERS)
                self.testloaders = [testdata]
            if self.opts.val:  # Use only x percent of the primary testset as validation (and don't use the rest at this time)
                dataset = self.testloaders[0].dataset
                n = int(len(dataset) * self.opts.val)
                sampler = dat.SubsetRandomSampler(torch.arange(n))
                self.testloaders[0] = dat.DataLoader(dataset,
                                                     batch_size=n,
                                                     sampler=sampler,
                                                     num_workers=NUM_WORKERS)
        else:  # if --noval
            self.testloaders = []
Example #30
def load_supervised(
        dataset_root,
        supervised_ratio: float = 1.0,
        batch_size: int = 128,
        train_transform: list = [],
        val_transform: list = [],
        **kwargs
):
    """
    Load the cifar10 dataset for supervised training (only the supervised subset is used).
    """
    train_dataset = torchvision.datasets.CIFAR10(root=os.path.join(dataset_root, "CIFAR10"), train=True, download=True, transform=train_transform)
    val_dataset = torchvision.datasets.CIFAR10(root=os.path.join(dataset_root, "CIFAR10"), train=False, download=True, transform=val_transform)

    # Split the training dataset into a supervised and unsupervised sets
    s_idx, u_idx = _split_s_u(train_dataset, supervised_ratio)

    sampler_s1 = torch_data.SubsetRandomSampler(s_idx)
    train_loader = torch_data.DataLoader(train_dataset, batch_size=batch_size, sampler=sampler_s1, num_workers=4, pin_memory=True, )
    val_loader = torch_data.DataLoader(val_dataset, batch_size=batch_size, shuffle=True, num_workers=4, pin_memory=True, )

    return None, train_loader, val_loader
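Several of the semi-supervised examples above (#9, #10, #11, #22) split one global batch
size into supervised and unsupervised parts with floor/ceil, so the two parts add back up
to batch_size (up to floating-point rounding of the ratio). A tiny check of that arithmetic:

import numpy as np

batch_size, supervised_ratio = 64, 0.1
s_batch_size = int(np.floor(batch_size * supervised_ratio))       # 6
u_batch_size = int(np.ceil(batch_size * (1 - supervised_ratio)))  # 58
assert s_batch_size + u_batch_size == batch_size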