예제 #1
0
 def __init__(self,
              in_dir,
              healpixes,
              raytracing_out_dirs,
              aperture_size,
              n_data,
              features,
              stop_mean_std_early=False,
              n_cores=20):
     """Build one graph dataset per healpix and expose their concatenation.

     The per-healpix label tables are stacked into a single ``self.Y``
     DataFrame with a fresh contiguous index.
     """
     self.stop_mean_std_early = stop_mean_std_early
     self.n_datasets = len(healpixes)
     self.n_cores = n_cores
     per_hp_datasets = []
     label_frames = []
     for hp_idx, healpix in enumerate(healpixes):
         single = CosmoDC2GraphHealpix(
             healpix,
             in_dir,
             raytracing_out_dirs[hp_idx],
             aperture_size,
             n_data[hp_idx],
             features,
             n_cores=self.n_cores,
         )
         per_hp_datasets.append(single)
         label_frames.append(single.Y)
     self.Y = pd.concat(label_frames, ignore_index=True).reset_index(drop=True)
     ConcatDataset.__init__(self, per_hp_datasets)
     # transforms are attached later by the training pipeline
     self.transform_X = None
     self.transform_Y = None
     self.transform_Y_local = None
예제 #2
0
def build_dataset(cfg, default_args=None):
    """Build a dataset from config dict.

    Args:
        cfg (dict): Config dict. It should at least contain the key "type".
            A list/tuple of config dicts yields a ``ConcatDataset``.
        default_args (dict, optional): Default initialization arguments.
            Default: None.

    Returns:
        Dataset: The constructed dataset.
    """
    from .dataset_wrappers import RepeatDataset

    # guard clauses: each special cfg shape returns immediately
    if isinstance(cfg, (list, tuple)):
        return ConcatDataset([build_dataset(c, default_args) for c in cfg])
    if cfg['type'] == 'ConcatDataset':
        return ConcatDataset(
            [build_dataset(c, default_args) for c in cfg['datasets']])
    if cfg['type'] == 'RepeatDataset':
        return RepeatDataset(
            build_dataset(cfg['dataset'], default_args), cfg['times'])
    if isinstance(cfg.get('ann_file'), (list, tuple)):
        # multiple annotation files -> one dataset per file, concatenated
        return _concat_dataset(cfg, default_args)
    return build_from_cfg(cfg, DATASETS, default_args)
예제 #3
0
 def __init__(self, corpora: List[Corpus], name: str = 'multicorpus'):
     """Combine several corpora by concatenating their train/dev/test splits."""
     self.corpora = corpora
     train = ConcatDataset([corpus.train for corpus in self.corpora])
     dev = ConcatDataset([corpus.dev for corpus in self.corpora])
     test = ConcatDataset([corpus.test for corpus in self.corpora])
     super(MultiCorpus, self).__init__(train, dev, test, name=name)
예제 #4
0
def get_face_swap_iterators(bs):
    """DepthNet + GT <-> frontal GT faces.

    Builds four DataLoaders: domain A is the face-swapped CelebA images,
    domain B is the union of VGG and CelebA source/target ground-truth faces.

    :param bs: batch size for every loader
    :return: (loader_train_a, loader_train_b, loader_valid_a, loader_valid_b)
    """
    filename_vgg = "data/vgg/vgg.h5"
    filename_celeba = "data/celeba/celebA.h5"
    filename_celeba_swap = "data/celeba_faceswap/celeba_faceswap.h5"
    # The original wrapped the paths in a no-op "'%s' % path" format; the
    # plain variables are identical strings.
    a_train = H5Dataset(filename_celeba_swap, 'imgs', train=True)
    vgg_side_train = H5Dataset(filename_vgg, 'src_GT', train=True)
    vgg_frontal_train = H5Dataset(filename_vgg, 'tg_GT', train=True)
    celeba_side_train = H5Dataset(filename_celeba, 'src_GT', train=True)
    celeba_frontal_train = H5Dataset(filename_celeba, 'tg_GT', train=True)
    b_train = ConcatDataset((vgg_side_train, vgg_frontal_train,
                             celeba_side_train, celeba_frontal_train))
    a_valid = H5Dataset(filename_celeba_swap, 'imgs', train=False)
    vgg_side_valid = H5Dataset(filename_vgg, 'src_GT', train=False)
    vgg_frontal_valid = H5Dataset(filename_vgg, 'tg_GT', train=False)
    celeba_side_valid = H5Dataset(filename_celeba, 'src_GT', train=False)
    celeba_frontal_valid = H5Dataset(filename_celeba, 'tg_GT', train=False)
    b_valid = ConcatDataset((vgg_side_valid, vgg_frontal_valid,
                             celeba_side_valid, celeba_frontal_valid))
    loader_train_a = DataLoader(a_train, batch_size=bs, shuffle=True)
    loader_train_b = DataLoader(b_train, batch_size=bs, shuffle=True)
    # NOTE(review): validation loaders also shuffle — looks intentional for
    # pairing random A/B faces, but confirm against the training loop.
    loader_valid_a = DataLoader(a_valid, batch_size=bs, shuffle=True)
    loader_valid_b = DataLoader(b_valid, batch_size=bs, shuffle=True)
    return loader_train_a, loader_train_b, loader_valid_a, loader_valid_b
    def get(cls, args, splits=('train', 'val', 'val_video')):
        """Build the combined CharadesEgo + Charades datasets.

        The ';'-separated fields of ``args`` (``train_file``, ``val_file``,
        ``data``) pack two configurations: slot 0 feeds ``CharadesEgoMeta``,
        slot 1 feeds the parent class.

        :param args: argparse-style namespace with ';'-separated paths
        :param splits: which of 'train'/'val'/'val_video' to build
        :return: (train_dataset, val_dataset, valvideo_dataset)
        """
        # Deep-copy so the per-source path overrides don't leak into `args`.
        newargs1 = copy.deepcopy(args)
        newargs2 = copy.deepcopy(args)
        vars(newargs1).update({
            'train_file': args.train_file.split(';')[0],
            'val_file': args.val_file.split(';')[0],
            'data': args.data.split(';')[0]
        })
        vars(newargs2).update({
            'train_file': args.train_file.split(';')[1],
            'val_file': args.val_file.split(';')[1],
            'data': args.data.split(';')[1]
        })

        # The ego datasets are only needed when a train/val split is requested.
        if 'train' in splits or 'val' in splits:
            train_datasetego, val_datasetego, _ = CharadesEgoMeta.get(
                newargs1, splits=splits)
        else:
            train_datasetego, val_datasetego = None, None
        train_dataset, val_dataset, valvideo_dataset = super(
            CharadesEgoPlusCharades, cls).get(newargs2, splits=splits)

        if 'train' in splits:
            # NOTE(review): negating the targets presumably tags the non-ego
            # (Charades) samples so the loss can tell the domains apart —
            # confirm against the training loop.
            train_dataset.target_transform = transforms.Lambda(lambda x: -x)
            train_dataset = ConcatDataset(
                [train_dataset] +
                [train_datasetego] * 3)  # magic number to balance
        if 'val' in splits:
            val_dataset.target_transform = transforms.Lambda(lambda x: -x)
            val_dataset = ConcatDataset([val_dataset] + [val_datasetego] * 3)
        return train_dataset, val_dataset, valvideo_dataset
예제 #6
0
    def setup(self, stage):
        """Set up all child datamodules and merge their datasets.

        Concatenates the train/val/test datasets of every child, borrows the
        tokenizer and collate function from the first child, and builds
        distributed samplers when ``self.dist`` is set.
        """
        for child in self.dms:
            child.setup(stage)

        self.train_dataset = ConcatDataset(
            [child.train_dataset for child in self.dms])
        self.val_dataset = ConcatDataset(
            [child.val_dataset for child in self.dms])
        self.test_dataset = ConcatDataset(
            [child.test_dataset for child in self.dms])

        # the first datamodule supplies the shared tokenizer and collator
        first = self.dms[0]
        self.tokenizer = first.tokenizer
        self.collate = functools.partial(
            first.train_dataset.collate,
            mlm_collator=first.mlm_collator,
        )

        if self.dist:
            self.train_sampler = DistributedSampler(self.train_dataset,
                                                    shuffle=True)
            self.val_sampler = DistributedSampler(self.val_dataset,
                                                  shuffle=True)
            # evaluation order stays fixed
            self.test_sampler = DistributedSampler(self.test_dataset,
                                                   shuffle=False)
        else:
            self.train_sampler = None
            self.val_sampler = None
            self.test_sampler = None
예제 #7
0
 def get_all_sentences(self) -> Dataset:
     """Return the concatenation of every available split.

     Generalized from the original if/elif ladder, which silently dropped
     the test split whenever ``dev`` was ``None`` (and ``dev``/``test``
     whenever ``train`` was ``None``). Behavior is unchanged for the
     combinations the ladder handled.

     :raises RuntimeError: if no split is available at all
     """
     parts = [split for split in (self.train, self.dev, self.test)
              if split is not None]
     if not parts:
         raise RuntimeError("Shouldn't be here")
     return ConcatDataset(parts)
예제 #8
0
    def setup(self, stage: Optional[str] = None) -> None:
        """Build the train/val datasets for the 'fit' stage.

        When no test paths are configured, 5% of the training data is held
        out as the validation set via ``random_split``.

        :raises NotImplementedError: for the 'test' and 'predict' stages
        """
        if stage == "test" or stage == "predict":
            # BUG FIX: `raise NotImplemented` raises the NotImplemented
            # *constant*, which is a TypeError at runtime — the exception
            # class NotImplementedError is what was intended.
            raise NotImplementedError

        train_dataset = ConcatDataset(list(map(self._get_dataset_from_path, self.train_paths)))
        if self.test_paths is None or len(self.test_paths) == 0:
            # hold out 5% of training data when no explicit test set exists
            train_len = int(.95 * len(train_dataset))
            train_dataset, test_dataset = random_split(train_dataset, [train_len, len(train_dataset) - train_len])
        else:
            test_dataset = ConcatDataset(list(map(self._get_dataset_from_path, self.test_paths)))

        self.train_set, self.val_set = train_dataset, test_dataset
예제 #9
0
    def update_dataloaders(self, time):
        """Refresh every mini-dataset's index list for ``time``, then rebuild
        the train/dev/test DataLoaders over the updated datasets."""
        for split_name in self.datasets:
            for mini in self.datasets[split_name].datasets:
                mini.update_idx_list(time)

        def _loader(split_name):
            # one loader over the (re-indexed) concatenation of a split
            return DataLoader(ConcatDataset(self.datasets[split_name].datasets),
                              **self.dataLoader_kwargs)

        self.train = _loader('train')
        self.dev = _loader('dev')
        self.test = _loader('test')
예제 #10
0
def load_data_from_csv(train_csv, val_csv, input_size, transform_config):
    """Build concatenated train/val CSV datasets plus a balanced sampler.

    :param train_csv: iterable of training CSV paths
    :param val_csv: iterable of validation CSV paths
    :param input_size: image size forwarded to the transform factory
    :param transform_config: transform configuration dict
    :return: (train_dataset, val_dataset, train_sampler)
    """
    tsfm = create_transform(input_size, transform_config)

    train_dataset = ConcatDataset(
        [CSVDataset(path, transform=tsfm['train']) for path in train_csv])
    val_dataset = ConcatDataset(
        [CSVDataset(path, transform=tsfm['val']) for path in val_csv])

    # one full pass over the training data to collect labels for weighting
    train_labels = [label for _, label in train_dataset]
    train_sampler = weighted_sampler(train_labels)

    return train_dataset, val_dataset, train_sampler
예제 #11
0
    def test(self, test_data):
        """Evaluate the discriminator on ``test_data``.

        Accumulates a confusion matrix (rows: true class, columns: predicted
        class), saves it as a heatmap, and prints overall accuracy.

        :param test_data: iterable of datasets to concatenate and evaluate
        """
        self.discriminator.eval()
        dataset = ConcatDataset(test_data)
        data_loader = DataLoader(dataset, shuffle=True, batch_size=self.batch_size)
        total = len(dataset)
        correct = 0

        confusion_matrix = torch.zeros(self.trained_class_num, self.trained_class_num).type(torch.long)

        with torch.no_grad():
            for _, (index, x, y) in enumerate(data_loader):
                if self.use_gpu:
                    x, y = x.cuda(self.device_num), y.cuda(self.device_num)
                output = self.discriminator(x)
                label = self.softmax(output).argmax(dim=1)

                # row n = true label, column m = predicted label
                for n, m in zip(y.view(-1, 1), label.view(-1, 1)):
                    confusion_matrix[n, m] += 1

                correct += label.eq(y).long().cpu().sum() if self.use_gpu else label.eq(y).long().sum()
            confusion_matrix = confusion_matrix.numpy()
            df_cm = pd.DataFrame(confusion_matrix, index=[i for i in range(self.trained_class_num)],
                                 columns=[i for i in range(self.trained_class_num)])
            # BUG FIX: the figure must be created before labeling it.
            # Previously xlabel/ylabel were applied to the implicit current
            # figure, then plt.figure() opened a NEW figure for the heatmap,
            # so the saved image carried no axis labels.
            plt.figure(figsize=(7 * self.trained_class_num // 10, 5 * self.trained_class_num // 10))
            sn.heatmap(df_cm, annot=True)
            # NOTE(review): rows hold true labels and columns predictions, so
            # these two axis strings look swapped — confirm before changing.
            plt.xlabel('real label')
            plt.ylabel('classification result')
            plt.savefig('./confusion_matrix/e2e/' + str(self.trained_class_num) + '_heatmap.png', dpi=300)

        print("Accuracy: {}/{} ({:.2f}%)".format(correct, total, 100. * correct / total))
예제 #12
0
def build_dataset(dataset_list, dataset_catalog, is_train=True):
    """
    Arguments:
        dataset_list (list[str]): Contains the names of the datasets, i.e.,
            coco_2014_trian, coco_2014_val, etc
        dataset_catalog (DatasetCatalog): contains the information on how to
            construct a dataset.
        is_train (bool): whether to setup the dataset for training or testing

    Returns:
        list of datasets when testing; a one-element list holding the
        (possibly concatenated) training dataset otherwise.
    """
    if not isinstance(dataset_list, (list, tuple)):
        raise RuntimeError(
            "dataset_list should be a list of strings, got {}".format(
                dataset_list))

    datasets = []
    for dataset_name in dataset_list:
        entry = dataset_catalog.get(dataset_name)
        kwargs = entry["args"]
        kwargs["is_train"] = is_train
        # look the factory class up by name and instantiate it
        factory = globals()[entry['factory']]
        datasets.append(factory(**kwargs))

    # for testing, return a list of datasets
    if not is_train:
        return datasets

    # for training, concatenate all datasets into a single one
    if len(datasets) > 1:
        return [ConcatDataset(datasets)]
    return [datasets[0]]
 def __init__(self, pattern="/global_index_cluster_data.npy", root_dir='../results/VAE_fashion-mnist_64_62', transform=None, list_idx=[0], dsname="fashion-mnist", num_labels=10, num_cluster=5):
     """Select FashionMNIST samples belonging to the chosen VGMM clusters.

     Args:
         pattern (string): file name of the npy file which stores the global
             index of each cluster/subdomain as a dictionary
         root_dir (string): Directory with all the images.
         transform (callable, optional): Optional transform to be applied
             on a sample.
         list_idx (list): indexes of the clusters to use as the train or
             test set, e.g. ``trainset = VGMMDataset(list_idx=[0, 1, 2, 3])``
             and ``testset = VGMMDataset(list_idx=[4])``.
             NOTE(review): mutable default argument — the same list object is
             shared across calls; callers should pass their own list.
         dsname: currently always "fashion-mnist"; not used at all
         num_labels: forwarded to InputDataset.concatenate_data_from_dir
         num_cluster: forwarded to InputDataset.concatenate_data_from_dir
     """
     self.root_dir = root_dir
     self.pattern = pattern
     self.transform = transform
     # Load the cluster -> global-index mapping; regenerate it when the
     # cached .npy file is missing.
     if not tf.gfile.Exists(self.root_dir + self.pattern):
         _, self.global_index = InputDataset.concatenate_data_from_dir(self.root_dir, num_labels=num_labels, num_clusters=num_cluster)
     else:
         self.global_index = np.load(self.root_dir + pattern, allow_pickle=True)
     self.list_idx = list_idx
     all_inds = []
     print('cluster index list:' + str(list_idx))
     for index in self.list_idx:  # iterate all **chosen** clusters/subdomains
         to_append = self.global_index.item().get(str(index))   # self.global_index is a dictionary of {'0': [15352, 2152,21, 25,...], '1':[1121, 1252, 3195,...]}
         print('\n size of cluster:' + str(np.shape(to_append)) + '\n')
         all_inds = np.append(all_inds, to_append)
         print(all_inds.shape)
     self.all_inds = all_inds.tolist()
     # np.append promoted the indices to float; round back to integers.
     self.all_inds = [round(x) for x in self.all_inds]   # make to be integer # self.all_inds = map(round, self.all_inds)
     # Expose the selected global indices over the train+test union.
     trainset_temp = torchvision.datasets.FashionMNIST(root='./data', train=True, download=True, transform=transform)
     testset_temp = torchvision.datasets.FashionMNIST(root='./data', train=False, download=False, transform=transform)
     cd = ConcatDataset((trainset_temp, testset_temp))
     self.subset = torch.utils.data.Subset(cd, self.all_inds)
예제 #14
0
    def make_dataset(data, for_evaluate=False):
        """Build the training dataset for ``data``.

        Pairs a padded and an unpadded spectrogram view of the same inputs;
        when ``for_evaluate`` is set, the dataset is repeated
        ``config.evaluate_times`` times.
        """
        # both views share the same not-silence threshold
        min_not_silence = int(config.min_not_silence_rate *
                              config.sampling_length)

        padded = SpectrogramDataset(
            inputs=data,
            sampling_length=config.sampling_length,
            min_not_silence_length=min_not_silence,
            padding_length=config.padding_length * 2,
        )
        unpadded = SpectrogramDataset(
            inputs=data,
            sampling_length=config.sampling_length,
            min_not_silence_length=min_not_silence,
            padding_length=0,
        )

        dataset = TrainDataset(
            padded_spectrogram_dataset=padded,
            spectrogram_dataset=unpadded,
            latent_size=config.latent_size,
        )

        if for_evaluate:
            dataset = ConcatDataset([dataset] * config.evaluate_times)

        return dataset
예제 #15
0
 def join(self, other, info=None):
     """Concatenate this dataset with one or more others.

     :param other: a single dataset or a list of datasets to append
     :param info: metadata for the result; defaults to this dataset's info
     :return: a new Dataset wrapping the concatenation
     """
     # isinstance (instead of `type(...) is not list`) also accepts list
     # subclasses without double-wrapping them.
     if not isinstance(other, list):
         other = [other]
     datasets = [self] + other
     info = info or datasets[0].info
     # the original used an f-string with no placeholder; plain concat
     name = "concat[" + ",".join(x.name for x in datasets) + "]"
     return Dataset(ConcatDataset(datasets), info, name)
예제 #16
0
def _concat_dataset(cfg, default_args=None):
    """Build one dataset per ``ann_file`` entry and concatenate them.

    Any of ``type``, ``img_prefix``, ``dataset_info``, ``num_joints`` and
    ``dataset_channel`` may be given as a per-entry sequence; scalar values
    are shared by every entry.
    """
    types = cfg['type']
    ann_files = cfg['ann_file']
    img_prefixes = cfg.get('img_prefix', None)
    dataset_infos = cfg.get('dataset_info', None)

    num_joints = cfg['data_cfg'].get('num_joints', None)
    dataset_channel = cfg['data_cfg'].get('dataset_channel', None)

    datasets = []
    for idx, ann_file in enumerate(ann_files):
        # deep copy so per-entry overrides never touch the shared cfg
        sub_cfg = copy.deepcopy(cfg)
        sub_cfg['ann_file'] = ann_file

        if isinstance(types, (list, tuple)):
            sub_cfg['type'] = types[idx]
        if isinstance(img_prefixes, (list, tuple)):
            sub_cfg['img_prefix'] = img_prefixes[idx]
        if isinstance(dataset_infos, (list, tuple)):
            sub_cfg['dataset_info'] = dataset_infos[idx]

        if isinstance(num_joints, (list, tuple)):
            sub_cfg['data_cfg']['num_joints'] = num_joints[idx]

        if is_seq_of(dataset_channel, list):
            sub_cfg['data_cfg']['dataset_channel'] = dataset_channel[idx]

        datasets.append(build_dataset(sub_cfg, default_args))

    return ConcatDataset(datasets)
예제 #17
0
def build_data_loader_dump(manifest_list,
                           batch_size=32,
                           num_workers=16,
                           shuffle=True,
                           drop_last=True,
                           rate_min=0.9,
                           rate_max=1.1,
                           n_mels=80,
                           hop_length=160,
                           win_length=400,
                           n_fft=512,
                           left_frames=0,
                           right_frames=0,
                           skip_frames=0,
                           vocab_path='testing_vocab.model',
                           min_duration=1,
                           max_duration=10,
                           given_rate=None):
    """Build a dumping DataLoader over the concatenation of all manifests.

    Every manifest file becomes one AudioSet sharing the same feature and
    filtering settings; the loader uses CollateFnDump for batching.
    """
    per_manifest = []
    for manifest in manifest_list:
        per_manifest.append(
            AudioSet(manifest, rate_min, rate_max, n_mels, hop_length,
                     win_length, n_fft, left_frames, right_frames,
                     skip_frames, vocab_path, min_duration, max_duration,
                     given_rate))

    combined = ConcatDataset(per_manifest)
    return DataLoaderX(combined,
                       batch_size,
                       shuffle,
                       num_workers=num_workers,
                       collate_fn=CollateFnDump(),
                       drop_last=drop_last)
예제 #18
0
    def __init__(self, root, transforms=None, segments=None):
        """
        :param root: the dataset root directory
        :type root: str
        :param transforms: the transformation to perform after loading the
               frames. A typical choice is
               ``torchvision.transforms.Totensor()`` followed by ``Normalize``.
        :param segments: None to concatenate all segments as if they were a
               single video; otherwise, specify the name of the segment to
               read; or a list of names to concatenate
        :type segments: Optional[Union[str, Sequence[str]]]
        """
        super(VideoSegmentDataset, self).__init__(root)
        self.transforms = transforms
        # normalize a single segment name to a one-element list
        if isinstance(segments, str):
            segments = [segments]

        def in_segments(ds_name):
            # segments=None accepts every dataset in the file
            if segments is None:
                return True
            return ds_name in segments

        # NOTE(review): iterating self.h5file presumably yields dataset names;
        # each kept name is resolved to its h5 dataset, wrapped, and the
        # wrappers concatenated in file order — confirm against the base class.
        self.segment_data = ConcatDataset(list(map(_SegmentWrapper,
                                                   map(self.h5file.get,
                                                       filter(in_segments,
                                                              self.h5file)))))
예제 #19
0
def build_dataset(cfg, stage):
    """
    Build dataset.
    if several datasets are defined
    in the dict cfg.*_data_loader.datasets then create ConcatDataset
    """
    assert stage in ["train", "val", "test"]
    key = LyftDataset.name_2_dataloader_key[stage]
    cfg = cfg.copy()
    dset_cfg = cfg[key]

    def _dataset_class(sub_cfg):
        # prerendered configs use the cached-image dataset variant
        return LyftDatasetPrerendered if sub_cfg.prerendered else LyftDataset

    if "datasets" not in dset_cfg:
        # single-dataset configuration
        return _dataset_class(dset_cfg)(dset_cfg.dset_name, cfg_data=cfg)

    datasets = []
    for dset_name, params in dset_cfg.datasets.items():
        cur_cfg = cfg.copy()
        # we take only the subconfig with the corresponding name!
        OmegaConf.set_struct(cur_cfg, False)
        cur_cfg[key].update(params)
        OmegaConf.set_struct(cur_cfg, True)
        datasets.append(
            _dataset_class(cur_cfg[key])(dset_name, cfg_data=cur_cfg))

    return ConcatDataset(datasets) if len(datasets) > 1 else datasets[0]
예제 #20
0
def load_mnist(dataset_name, shuffle=True, seed=547):
    """Load FashionMNIST (train+test) as numpy arrays with one-hot labels.

    :param dataset_name: accepted for API compatibility but unused —
        FashionMNIST is always loaded
    :param shuffle: shuffle images and labels with the same permutation
    :param seed: RNG seed used for the shuffle
    :return: (X, yy) where X is float images in [0, 1] of shape
        (70000, 28, 28, 1) and yy is one-hot labels of shape (70000, 10)

    NOTE(review): ``transform`` is read from module scope — confirm it is
    defined wherever this function is used.
    """
    trainset_temp = torchvision.datasets.FashionMNIST(root='./data',
                                                      train=True,
                                                      download=True,
                                                      transform=transform)
    trX = trainset_temp.data
    trX = trX.reshape((60000, 28, 28, 1))
    trY = trainset_temp.targets
    testset_temp = torchvision.datasets.FashionMNIST(root='./data',
                                                     train=False,
                                                     download=False,
                                                     transform=transform)
    teX = testset_temp.data
    teX = teX.reshape((10000, 28, 28, 1))
    teY = testset_temp.targets
    X = np.concatenate((trX, teX), axis=0)
    # BUG FIX: np.int was removed in NumPy 1.24; the builtin int is the
    # documented replacement. (Also dropped an unused ConcatDataset.)
    y = np.concatenate((trY, teY), axis=0).astype(int)
    yy = np.zeros((len(y), 10))
    yy[np.arange(len(y)), y] = 1
    if shuffle:
        # reseeding before each shuffle applies the SAME permutation to X
        # and yy, keeping images aligned with their labels
        np.random.seed(seed)
        np.random.shuffle(X)
        np.random.seed(seed)
        np.random.shuffle(yy)
    return X / 255., yy
예제 #21
0
    def prepare_data(self, datasets):
        """Prepare the training continuum and the few-shot evaluation splits.

        Parameters
        ---
        datasets: Dict with keys "train", "test" and "order" (lists of
            dataset names/objects; "order" gives the task ordering).

        Returns:
            tuple: (datas, order, n_samples, eval_train_dataset,
            eval_eval_dataset, eval_dataset)

        Side effect: sets ``self.extra_dataloader``, an infinite-style
        iterator over all training data used for extra logging.
        """
        train_datasets = datasets_dict(datasets["train"], datasets["order"])
        val_datasets = datasets_dict(datasets["test"], datasets["order"])
        # the dataset used for few-shot evaluation
        eval_dataset = val_datasets[self.config.testing.eval_dataset]
        # split into training and testing point, assumes there is no meaningful difference in dataset order
        eval_train_dataset = eval_dataset.new(0, self.config.testing.n_samples)
        eval_eval_dataset = eval_dataset.new(self.config.testing.n_samples, -1)
        # sample a subset so validation doesn't take too long
        eval_eval_dataset = eval_eval_dataset.sample(min(self.config.testing.few_shot_validation_size, len(eval_dataset)))

        # choose the task schedule: alternating tasks or a fixed order
        if self.config.data.alternating_order:
            order, n_samples = alternating_order(train_datasets, tasks=self.config.data.alternating_tasks,
                                                 n_samples_per_switch=self.config.data.alternating_n_samples_per_switch,
                                                 relative_frequencies=self.config.data.alternating_relative_frequencies)
        else:
            n_samples, order = n_samples_order(self.config.learner.samples_per_task, self.config.task_order, datasets["order"])
        datas = get_continuum(train_datasets, order=order, n_samples=n_samples,
                             eval_dataset=self.config.testing.eval_dataset, merge=False)
        # for logging extra things
        self.extra_dataloader = iter(DataLoader(ConcatDataset(train_datasets.values()), batch_size=self.mini_batch_size, shuffle=True))
        return datas, order, n_samples, eval_train_dataset, eval_eval_dataset, eval_dataset
예제 #22
0
파일: data.py 프로젝트: vedangj044/flair
    def __init__(self, corpora: List[Corpus], name: str = "multicorpus", **corpusargs):
        """Merge several corpora into one.

        A split (train/dev/test) is only concatenated from the corpora that
        actually provide it; if no corpus has a given split, that split of
        the merged corpus is None.
        """
        self.corpora: List[Corpus] = corpora

        trains = [corpus.train for corpus in self.corpora if corpus.train]
        devs = [corpus.dev for corpus in self.corpora if corpus.dev]
        tests = [corpus.test for corpus in self.corpora if corpus.test]

        super(MultiCorpus, self).__init__(
            ConcatDataset(trains) if trains else None,
            ConcatDataset(devs) if devs else None,
            ConcatDataset(tests) if tests else None,
            name=name,
            **corpusargs,
        )
예제 #23
0
파일: train.py 프로젝트: Freesail/plankton
def train_model_crossval(data_transforms,
                         kfold_dir,
                         train_cfg,
                         model_cfg,
                         optimizer_cfg,
                         scheduler_cfg,
                         loss_fn=nn.CrossEntropyLoss(reduction='none'),
                         cv=True,
                         pseudo_scheduler=None,
                         pseudo_dir=None,
                         target_transform=None):
    """Run K-fold cross-validation over the folders in ``kfold_dir``.

    Each subfolder of ``kfold_dir`` is one fold (an ImageFolder). For fold i,
    the remaining folds form the training set and fold i the validation set.
    Results are checkpointed to 'ckpoint.pt' after every fold.

    :param cv: when False, only the first fold is trained (early break)
    :param pseudo_scheduler: optional pseudo-labeling schedule; when given,
        ``pseudo_dir`` supplies an extra unlabeled ImageFolder
    :return: the last checkpoint dict {'kfold_result', 'class_names'}

    NOTE(review): the ``loss_fn`` default is a module instance created once
    at definition time and shared across calls — fine for a stateless loss,
    but worth confirming.
    """
    kfold_datasets = []
    for k in safe_listdir(kfold_dir):
        kfold_datasets.append(datasets.ImageFolder(os.path.join(kfold_dir, k)))

    kfold_result = []

    K = len(kfold_datasets)
    for i in range(K):
        print('K_Fold CV {}/{}'.format(i + 1, K))
        print('=' * 10)
        # fold i is held out; the rest train. The transform assignments
        # mutate the shared ImageFolder objects in place each iteration.
        train_sets = kfold_datasets[:i] + kfold_datasets[i + 1:]
        for s in train_sets:
            s.transform = data_transforms['train']
            if target_transform is not None:
                s.target_transform = target_transform
        val_set = kfold_datasets[i]
        val_set.transform = data_transforms['val']
        image_datasets = {'train': ConcatDataset(train_sets), 'val': val_set}
        if pseudo_scheduler is not None:
            pseudo_set = datasets.ImageFolder(pseudo_dir)
            pseudo_set.transform = data_transforms['val']
            image_datasets['pseudo'] = pseudo_set

        class_names, dataloaders, dataset_sizes = helper_dataloaders(
            image_datasets, train_cfg['batch_size'])
        # fresh model/optimizer/scheduler per fold
        model, optimizer, scheduler = \
            helper_train(model_cfg, optimizer_cfg, scheduler_cfg)

        result = \
            train_model(class_names, dataset_sizes, dataloaders,
                        model, loss_fn, optimizer, scheduler,
                        model_cfg['device'],
                        train_cfg['num_epochs'],
                        train_cfg['batch_per_disp'],
                        pseudo_scheduler=pseudo_scheduler)

        kfold_result.append(result)

        # checkpoint after each fold so partial CV results survive a crash
        ckpoint = {'kfold_result': kfold_result, 'class_names': class_names}
        torch.save(ckpoint, 'ckpoint.pt')

        if not cv:
            break

    return ckpoint
예제 #24
0
def get_dataset(replay_folder, cache_folder, limit, name=None):
    """Load (or build and cache) a ConcatDataset of replay tensors.

    :param replay_folder: folder of ``.replay`` files to convert; when None,
        previously converted ``.pickle`` files in ``cache_folder`` are used
    :param cache_folder: where converted pickles (and the named dataset
        cache) live
    :param limit: maximum number of files to load
    :param name: optional cache file name — if it already exists under
        ``cache_folder`` the saved dataset is returned immediately, and the
        freshly built dataset is saved there otherwise
    :return: torch ConcatDataset of per-file TensorDatasets
    """
    # fast path: a fully built dataset was cached under `name`
    if name is not None:
        name = os.path.join(cache_folder, name)
        if os.path.exists(name):
            return torch.load(name)

    if replay_folder is None:
        files = [(dp, f) for dp, dn, fn in os.walk(cache_folder) for f in fn
                 if f.endswith(".pickle")]
    else:
        files = [(dp, f) for dp, dn, fn in os.walk(replay_folder) for f in fn
                 if f.endswith(".replay")]

    file_iter = tqdm.tqdm(enumerate(files[:limit]),
                          desc="Load",
                          total=limit,
                          bar_format="{l_bar}{r_bar}")

    datasets = []
    for i, (dp, f) in file_iter:
        try:
            if replay_folder is None:
                out_path = os.path.join(dp, f)
                # NOTE(review): pickle.load on cached files — only safe when
                # the cache folder is trusted.
                with open(out_path, "rb") as handle:
                    dfs = pickle.load(handle)
            else:
                in_path = os.path.join(dp, f)
                # per-replay pickle cache: convert once, reuse afterwards
                out_path = os.path.join(cache_folder, f[:-7] + ".pickle")
                if os.path.exists(out_path):
                    with open(out_path, "rb") as handle:
                        dfs = pickle.load(handle)
                else:
                    dfs = replay_to_dfs(in_path)
                    with open(out_path, "wb") as handle:
                        pickle.dump(dfs, handle)

            x_n, y_n = convert_dfs(dfs, tensors=True)
            assert x_n[2].shape == x_n[3].shape
            normalize(x_n)

            # alternate which team is "first" to balance the data
            swap_teams(x_n, y_n, slice(i % 2, None, 2))

            datasets.append(TensorDataset(*x_n, *y_n))

            # x_s, y_s = [v.copy() for v in x_n], [v.copy() for v in y_n]
            # swap_teams(x_s, y_s)

            # arrays.append((x_s, y_s))
        except Exception as e:
            # deliberate best-effort: one bad file is reported and skipped
            # instead of aborting the whole load
            print(e)
            pass

    ds = ConcatDataset(datasets)
    if name is not None:
        torch.save(ds, name)
    return ds
예제 #25
0
    def tdt_split(self):
        """Split ``self.df`` into train/dev/test MiniData ConcatDatasets.

        An explicit 'split' column takes precedence; otherwise the frame is
        cut positionally according to ``self.split`` ratios.
        """
        n_rows = self.df.shape[0]
        end_train = int(n_rows * self.split[0])
        start_dev = end_train
        end_dev = int(start_dev + n_rows * self.split[1])
        start_test = end_dev

        if 'split' in self.df.columns:
            frames = {
                'train': self.df[self.df['split'] == 'train'],
                'dev': self.df[self.df['split'] == 'dev'],
                'test': self.df[self.df['split'] == 'test'],
            }
        else:
            frames = {
                'train': self.df[:end_train],
                'dev': self.df[start_dev:end_dev],
                'test': self.df[start_test:],
            }

        # every MiniData shares the same STFT configuration
        mini_kwargs = {
            'stft_window': self.stft_window,
            'stft_hop': self.stft_hop,
            'n_fft': self.n_fft,
            'hop_length': self.hop_length,
            'win_length': self.win_length
        }

        return {
            split_name: ConcatDataset([
                MiniData(row.wav, **mini_kwargs)
                for _, row in tqdm(frame.iterrows())
            ])
            for split_name, frame in frames.items()
        }
예제 #26
0
파일: finetune.py 프로젝트: Manikant92/KILT
    def get_dataloader(self,
                       type_path: str,
                       batch_size: int,
                       shuffle: bool = False) -> DataLoader:
        """Build a DataLoader over the concatenation of all KILT datasets.

        :param type_path: split name ('train', 'dev', ...); for 'dev' the
            gold targets of every dataset are merged into ``self.devsets``
        :param batch_size: loader batch size
        :param shuffle: whether the loader shuffles
        :return: DataLoader over the concatenated datasets
        """
        datasets = []
        for d in self.dataset_list:
            datasets.append(
                KiltDataset(self.tokenizer, self.data_dir, d, type_path,
                            self.source_length, self.target_length,
                            self.output_dir))
        # collect gold targets so validation can score predictions later
        if type_path == 'dev':
            for x in datasets:
                self.devsets.update(x.id_targets)
        concat_dataset = ConcatDataset(datasets)
        dataloader = DataLoader(concat_dataset,
                                batch_size=batch_size,
                                shuffle=shuffle,
                                collate_fn=self.collate_fn)

        # idiom fix: len(obj), not obj.__len__()
        print(type_path, dataloader.batch_size, len(concat_dataset))
        return dataloader
 def __init__(self, indices, transform=None):
     """Expose a Subset of the FashionMNIST train+test union at ``indices``."""
     train_part = torchvision.datasets.FashionMNIST(root='./data',
                                                    train=True,
                                                    download=True,
                                                    transform=transform)
     test_part = torchvision.datasets.FashionMNIST(root='./data',
                                                   train=False,
                                                   download=False,
                                                   transform=transform)
     combined = ConcatDataset((train_part, test_part))
     self.subset = torch.utils.data.Subset(combined, indices)
예제 #28
0
    def _dataset(fns, for_test=False):
        """Build the (optionally speaker-conditioned) feature dataset for fns.

        Masking is a train-time augmentation, so every mask parameter is
        forced to 0 when ``for_test`` is set; test datasets are additionally
        repeated ``config.test_trial_num`` times so metrics average over
        several random trials.
        """
        inputs = [
            LazyInput(
                phoneme_list_path=phoneme_list_paths[fn],
                start_accent_list_path=start_accent_list_paths[fn],
                end_accent_list_path=end_accent_list_paths[fn],
                start_accent_phrase_list_path=start_accent_phrase_list_paths[
                    fn],
                end_accent_phrase_list_path=end_accent_phrase_list_paths[fn],
                f0_path=f0_paths[fn],
                volume_path=volume_paths[fn],
                phoneme_class=phoneme_type_to_class[config.phoneme_type],
            ) for fn in fns
        ]

        # The original duplicated this call in two branches differing only in
        # the mask parameters; a single call with conditional values is
        # equivalent and keeps the shared arguments in one place.
        dataset = FeatureDataset(
            inputs=inputs,
            sampling_length=config.sampling_length,
            f0_process_mode=F0ProcessMode(config.f0_process_mode),
            phoneme_mask_max_length=(
                0 if for_test else config.phoneme_mask_max_length),
            phoneme_mask_num=0 if for_test else config.phoneme_mask_num,
            accent_mask_max_length=(
                0 if for_test else config.accent_mask_max_length),
            accent_mask_num=0 if for_test else config.accent_mask_num,
            f0_mask_max_length=0 if for_test else config.f0_mask_max_length,
            f0_mask_num=0 if for_test else config.f0_mask_num,
        )

        if speaker_ids is not None:
            dataset = SpeakerFeatureDataset(
                dataset=dataset,
                speaker_ids=[speaker_ids[fn] for fn in fns],
            )

        dataset = TensorWrapperDataset(dataset)

        if for_test:
            dataset = ConcatDataset([dataset] * config.test_trial_num)

        return dataset
예제 #29
0
    def update_modules(self, trainloader, task_id):
        """Train the network's module weights on the current task plus all
        stored memory tasks.

        Module weights are unfrozen (structure stays frozen), one combined
        loader is built over every memory task plus the current task's data
        (each sample tagged with its originating task id), and each batch is
        used for two optimizer steps: one with the network as-is and one with
        the temporary module hidden. Freeze state and loss reduction are
        restored before returning.
        """
        self.net.freeze_modules(freeze=False)
        self.net.freeze_structure(freeze=True)
        prev_reduction = self.loss.reduction
        self.loss.reduction = 'sum'  # make sure the loss is summed over instances

        # Shallow-copy the current task's dataset and append a constant
        # task-id tensor to its `tensors` tuple (so it presumably is a
        # TensorDataset — confirm), letting mixed batches below be routed
        # per task.
        tmp_dataset = copy.copy(trainloader.dataset)
        tmp_dataset.tensors = tmp_dataset.tensors + (torch.full(
            (len(tmp_dataset), ), task_id, dtype=int), )
        mega_dataset = ConcatDataset(
            [loader.dataset
             for loader in self.memory_loaders.values()] + [tmp_dataset])
        # Reuse the batch size of an arbitrary existing memory loader.
        tmp_loader = next(iter(self.memory_loaders.values()))
        batch_size = tmp_loader.batch_size
        mega_loader = torch.utils.data.DataLoader(mega_dataset,
                                                  batch_size=batch_size,
                                                  shuffle=True,
                                                  num_workers=0,
                                                  pin_memory=True)
        for X, Y, t in mega_loader:
            X = X.to(self.net.device, non_blocking=True)
            Y = Y.to(self.net.device, non_blocking=True)
            # First pass: summed loss accumulated per task present in the
            # batch, then normalized by n before the optimizer step.
            l = 0.
            n = 0
            all_t = torch.unique(t)
            for task_id_tmp in all_t:
                Y_hat = self.net(X[t == task_id_tmp], task_id=task_id_tmp)
                l += self.loss(Y_hat, Y[t == task_id_tmp])
                # NOTE(review): this adds the FULL batch size once per task;
                # with several tasks in a batch, n over-counts. Possibly
                # X[t == task_id_tmp].shape[0] was intended — confirm.
                n += X.shape[0]
            l /= n
            self.optimizer.zero_grad()
            l.backward()
            self.optimizer.step()

            # Second pass: same batch with the temporary module hidden,
            # then the hidden module is restored after the step.
            l = 0.
            n = 0
            self.net.hide_tmp_module()
            for task_id_tmp in all_t:
                Y_hat = self.net(X[t == task_id_tmp], task_id=task_id_tmp)
                l += self.loss(Y_hat, Y[t == task_id_tmp])
                n += X.shape[0]
            l /= n
            self.optimizer.zero_grad()
            l.backward()
            self.optimizer.step()
            self.net.recover_hidden_module()

        # Restore the caller-visible state mutated above.
        self.loss.reduction = prev_reduction
        self.net.freeze_modules(freeze=True)
        self.net.freeze_structure(
            freeze=False,
            task_id=task_id)  # unfreeze only current task's structure
예제 #30
0
def test_dataset_transform_override():
    """override_dataset_transform should replace transforms (including in
    nested ConcatDatasets) inside the `with` block and restore them after."""
    # given: three datasets, each with a distinct per-dataset transform
    double = Lambda(lambda x: np.array(x)[0, 0] * 2)
    triple = Lambda(lambda x: np.array(x)[0, 0] * 3)
    plus_ten = Lambda(lambda x: np.array(x)[0, 0] + 10)
    first = MemoryDataset({
        'x': [pic(1), pic(2), pic(3)], 'y': ['a', 'b', 'c']
    }, transform=double)
    second = MemoryDataset({
        'x': [pic(4), pic(5), pic(6)], 'y': ['d', 'e', 'f']
    }, transform=triple)
    third = MemoryDataset({
        'x': [pic(7), pic(8), pic(9)], 'y': ['g', 'h', 'i']
    }, transform=plus_ten)
    ds = ConcatDataset([first, ConcatDataset([second, third])])

    def collect(dataset):
        # Materialize every (x, y) pair and split into two tuples.
        return zip(*(dataset[i] for i in range(len(dataset))))

    # when
    x1, y1 = collect(ds)
    identity = Lambda(lambda x: np.array(x)[0, 0])
    with override_dataset_transform(ds, identity) as ds_overriden:
        x2, y2 = collect(ds_overriden)
    x3, y3 = collect(ds)

    # then
    assert np.array_equal(x1, [2, 4, 6, 12, 15, 18, 17, 18, 19])
    assert np.array_equal(x2, [1, 2, 3, 4, 5, 6, 7, 8, 9])
    assert np.array_equal(x3, x1)  # after everything is back to normal