Example No. 1
    def __init__(self,
                 data: DataBase,
                 replacement: bool = False,
                 num_samples: Optional[int] = None):
        super().__init__(data)
        self._sampler = torch_sampler.RandomSampler(data, replacement,
                                                    num_samples)
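Note: torch.utils.data.sampler.RandomSampler itself just yields dataset indices. A minimal, self-contained sketch of the two modes the wrapper above exposes (the toy `data` list is illustrative, not part of the example):

import torch.utils.data.sampler as torch_sampler

data = list(range(100))  # any object with __len__ works as a data_source

# replacement=False (default): one pass over a random permutation of all indices.
perm = list(torch_sampler.RandomSampler(data))

# replacement=True: num_samples indices drawn independently, duplicates possible.
draws = list(torch_sampler.RandomSampler(data, replacement=True, num_samples=10))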
Example No. 2
def visualize_data(dataset,
                   imagedir='runtime-images',
                   filename='sample',
                   num_samples=10):
    """
    Displays random samples from the given dataset in a grid.
    """
    random_sampler = sampler.RandomSampler(dataset,
                                           replacement=True,
                                           num_samples=num_samples)
    my_path = os.path.join(os.path.dirname(__file__), imagedir)
    for index in random_sampler:
        (image, label) = dataset[index]
        plt.figure()
        num_channels = image.shape[0]
        if num_channels == 1:
            image = image.squeeze()
        else:
            # Reorder CHW -> HWC for matplotlib; view() would scramble the pixels.
            image = image.permute(1, 2, 0)
        # cmap only applies to single-channel images and belongs on imshow, not savefig.
        plt.imshow(image.numpy(), cmap='gray' if num_channels == 1 else None)
        plt.savefig(
            os.path.join(my_path, filename + str(index) + '.png'),
            bbox_inches='tight',
        )
        plt.close()
Example No. 3
    def generate_sampler(dataset, sampler_option="random", n_bins=5):
        df = dataset.df
        n_labels = df[dataset.label].nunique()
        count = np.zeros(n_labels)

        for idx in df.index:
            label = df.loc[idx, dataset.label]
            key = dataset.label_fn(label)
            count[key] += 1

        weight_per_class = 1 / np.array(count)
        weights = []

        for idx, label in enumerate(df[dataset.label].values):
            key = dataset.label_fn(label)
            weights += [weight_per_class[key]] * dataset.elem_per_image

        if sampler_option == "random":
            return sampler.RandomSampler(weights)
        elif sampler_option == "weighted":
            return sampler.WeightedRandomSampler(weights, len(weights))
        else:
            raise NotImplementedError(
                f"The option {sampler_option} for sampler on classification task is not implemented"
            )
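As a hedged aside on the two branches above: RandomSampler ignores the weight values entirely (it only uses len(weights) and shuffles those indices uniformly), whereas WeightedRandomSampler draws indices in proportion to the weights. A small sketch with made-up class counts:

import numpy as np
from torch.utils.data import sampler

count = np.array([90, 10])                   # imbalanced class counts
weight_per_class = 1 / count                 # [0.0111..., 0.1]
weights = [weight_per_class[0]] * 90 + [weight_per_class[1]] * 10

# Draws len(weights) indices with replacement, oversampling the rare class.
weighted = sampler.WeightedRandomSampler(weights, len(weights))

# Only sees `weights` as a sized sequence; yields its 100 indices uniformly shuffled.
uniform = sampler.RandomSampler(weights)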
Example No. 4
    def make_loader(self,
                    batch_size=16,
                    num_workers=0,
                    shuffle=False,
                    pin_memory=False,
                    resize_rate=10,
                    drop_last=False):
        """
        CommandLine:
            python ~/code/netharn/examples/yolo_voc.py YoloVOCDataset.make_loader

        Example:
            >>> # DISABLE_DOCTEST
            >>> torch.random.manual_seed(0)
            >>> self = YoloVOCDataset(split='train')
            >>> self.augmenter = None
            >>> loader = self.make_loader(batch_size=1, shuffle=True)
            >>> # training batches should have multiple shapes
            >>> shapes = set()
            >>> for batch in ub.ProgIter(iter(loader), total=len(loader)):
            >>>     inputs, labels = batch
            >>>     # test to see multiscale works
            >>>     shapes.add(inputs.shape[-1])
            >>>     if len(shapes) > 1:
            >>>         break
            >>> assert len(shapes) > 1
        """
        import torch.utils.data.sampler as torch_sampler
        assert len(self) > 0, 'must have some data'
        if shuffle:
            sampler = torch_sampler.RandomSampler(self)
            resample_freq = resize_rate
        else:
            sampler = torch_sampler.SequentialSampler(self)
            resample_freq = None

        # use custom sampler that does multiscale training
        batch_sampler = multiscale_batch_sampler.MultiScaleBatchSampler(
            sampler,
            batch_size=batch_size,
            resample_freq=resample_freq,
            drop_last=drop_last,
        )
        # torch.utils.data.sampler.WeightedRandomSampler
        loader = torch_data.DataLoader(
            self,
            batch_sampler=batch_sampler,
            collate_fn=nh.data.collate.padded_collate,
            num_workers=num_workers,
            pin_memory=pin_memory)
        if loader.batch_size != batch_size:
            try:
                # Hack: ensure dataloader has batch size attr
                loader._DataLoader__initialized = False
                loader.batch_size = batch_size
                loader._DataLoader__initialized = True
            except Exception:
                pass
        return loader
Example No. 5
    def make_loader(self, batch_size=16, num_workers=0, shuffle=False,
                    pin_memory=False):
        """
        Example:
            >>> torch.random.manual_seed(0)
            >>> dset = coco_api.CocoDataset(coco_api.demo_coco_data())
            >>> self = YoloCocoDataset(dset, train=1)
            >>> loader = self.make_loader(batch_size=1)
            >>> train_iter = iter(loader)
            >>> # training batches should have multiple shapes
            >>> shapes = set()
            >>> for batch in train_iter:
            >>>     shapes.add(batch[0].shape[-1])
            >>>     if len(shapes) > 1:
            >>>         break
            >>> #assert len(shapes) > 1

            >>> vali_loader = iter(loaders['vali'])
            >>> vali_iter = iter(loaders['vali'])
            >>> # vali batches should have one shape
            >>> shapes = set()
            >>> for batch, _ in zip(vali_iter, [1, 2, 3, 4]):
            >>>     shapes.add(batch[0].shape[-1])
            >>> assert len(shapes) == 1
        """
        assert len(self) > 0, 'must have some data'
        if shuffle:
            if True:
                # If the data is not balanced we need to balance it
                index_to_weight = self._training_sample_weights()
                num_samples = len(self)
                index_to_weight = index_to_weight[:num_samples]
                sampler = torch_sampler.WeightedRandomSampler(index_to_weight,
                                                              num_samples,
                                                              replacement=True)
                sampler.data_source = self  # hack for use with multiscale
            else:
                sampler = torch_sampler.RandomSampler(self)
            resample_freq = 10
        else:
            sampler = torch_sampler.SequentialSampler(self)
            resample_freq = None

        # use custom sampler that does multiscale training
        batch_sampler = multiscale_batch_sampler.MultiScaleBatchSampler(
            sampler, batch_size=batch_size, resample_freq=resample_freq,
        )
        # torch.utils.data.sampler.WeightedRandomSampler
        loader = torch_data.DataLoader(self, batch_sampler=batch_sampler,
                                       collate_fn=nh.data.collate.padded_collate,
                                       num_workers=num_workers,
                                       pin_memory=pin_memory)
        if loader.batch_size != batch_size:
            try:
                loader.batch_size = batch_size
            except Exception:
                pass
        return loader
Example No. 6
def data_sampler(dataset, shuffle, distributed):
    if distributed:
        return DistributedSampler(dataset, shuffle=shuffle)

    if shuffle:
        return sampler.RandomSampler(dataset)

    else:
        return sampler.SequentialSampler(dataset)
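A sketch of how a helper like this is typically consumed (the surrounding `dataset` and `distributed` flag are assumed to come from the caller); note that `sampler` and `shuffle=True` are mutually exclusive in DataLoader, which is why the shuffling decision lives inside the sampler helper:

from torch.utils.data import DataLoader

loader = DataLoader(
    dataset,
    batch_size=32,
    sampler=data_sampler(dataset, shuffle=True, distributed=distributed),
    drop_last=True,
)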
Example No. 7
def main():

    data_dir = sys.argv[1]
    hero2ix_dir = sys.argv[2]

    # import DataFrame and hero2ix dictionary
    heroes_df = pd.read_csv(data_dir, index_col=0)
    hero2ix_df = pd.read_csv(hero2ix_dir, index_col=0)
    heroes_df = heroes_df.dropna().reset_index(drop=True)
    hero2ix = dict(zip(hero2ix_df.hero, hero2ix_df.ID))
    # heroes = hero2ix_df['hero'].values

    # train test split
    split = int(len(heroes_df)*0.9)
    heroes_train = heroes_df.iloc[:split]
    heroes_test = heroes_df.iloc[split:]

    # build dataset generator
    train_gen = DataFrameIterator(heroes_train, hero2ix)
    test_gen = DataFrameIterator(heroes_test, hero2ix)

    # Use Dataloader class in pytorch to generate batched data
    batch_size = 16
    loader_train = DataLoader(train_gen, batch_size=batch_size,
                              sampler=sampler.RandomSampler(train_gen),
                              num_workers=4)
    loader_test = DataLoader(test_gen, batch_size=batch_size,
                              sampler=sampler.SequentialSampler(test_gen),
                              num_workers=4)

    # define model, totally three models in hetor2vec.py
    model = CBOH(embedding_dim=10, heropool_size=len(hero2ix))

    # define loss function
    loss_function = nn.CrossEntropyLoss()

    # run train
    losses = train(model=model, dataloader=loader_train, loss_function=loss_function,
                   init_lr=0.1, epochs=20, lr_decay_epoch=8, print_epoch=2, gpu=False)

    # check test accuracy
    print('accuracy: ', accuracy(model, dataloader=loader_test,
                                 batch_size=batch_size, gpu=False))

    # save embeddings as numpy arrays
    output_dir = './output/hero/hero_embeddings.npy'
    save_embeddings(model, filename=output_dir)

    # pickle model
    pickle_dir = './output/hero/model.p'
    pickle.dump(obj=model, file=open(pickle_dir, 'wb'))

    # plot loss vs epoch
    plot_loss(losses, './output/hero/loss_hitory.png')

    # project embeddings to 2d plane
    plot_embeddings(model, hero2ix)
Example No. 8
    def __init__(self, opt, shared=None):
        opt['batch_sort'] = False
        super().__init__(opt, shared)
        self.use_batch_act = self.bsz > 1
        self.num_workers = opt['numworkers']
        self.batch_sort = opt.get('pytorch_teacher_batch_sort')
        self.batch_cache_type = opt.get('batch_sort_cache')
        self.batch_sort_field = opt.get('batch_sort_field')
        # One can specify a collate function to use for preparing a batch
        self.opt = opt.copy()
        self.is_shared = shared is not None
        dataset_classes = self.get_dataset_class(opt)
        self.ordered = ('ordered' in self.datatype or
                        ('stream' in self.datatype and not opt.get('shuffle')))

        if not shared:
            if len(dataset_classes) > 1:
                datasets = []
                for class_name, collate_fn, task_name in dataset_classes:
                    opt['pytorch_teacher_task'] = task_name
                    opt['task'] = task_name
                    datasets.append(class_name(opt))
                    self.collate_fn = collate_fn
                self.dataset = ParlAIConcatDataset(datasets)
            else:
                class_name, self.collate_fn, task_name = dataset_classes[0]
                self.dataset = class_name(opt)
            if self.ordered or not self.training:
                data_sampler = sampler.SequentialSampler(self.dataset)
                pin_memory = False
            else:
                data_sampler = sampler.RandomSampler(self.dataset)
                pin_memory = True

            self.pytorch_dataloader = DataLoader(
                self.dataset,
                batch_size=self.bsz,
                sampler=data_sampler,
                num_workers=self.num_workers,
                collate_fn=self.collate_fn,
                pin_memory=pin_memory,
                drop_last=False,
            )

            self.lastYs = [None] * self.bsz
            if self.batch_sort:
                self.loader_process = LoaderProcess(opt)
                self.loader_process.start()
            self.data = enumerate(self.pytorch_dataloader)
        else:
            self.dataset = shared['dataset']
            self.pytorch_dataloader = shared['pytorch_dataloader']
            self.lastYs = shared['lastYs']
            self.data = shared['data']

        self.num_batches = math.ceil(self.dataset.num_episodes() / self.bsz)
        self.reset()
Example No. 9
    def __init__(self, dataset, batch_size, shuffle=True, drop_last=False):
        # Group examples into buckets by context length.
        self.buckets = bucket(dataset)
        # Optionally shuffle bucket order and sample randomly within each bucket.
        if shuffle:
            np.random.shuffle(self.buckets)
            random_samplers = [sampler.RandomSampler(bucket) for bucket in self.buckets]
        else:
            random_samplers = [sampler.SequentialSampler(bucket) for bucket in self.buckets]
        self.sampler = [sampler.BatchSampler(s, batch_size, drop_last) for s in random_samplers]
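The original class presumably also defines __iter__ over these per-bucket batch samplers; a hypothetical sketch (not part of the source) of what that iteration could look like, yielding the bucket index together with each batch of local indices:

def iter_bucket_batches(batch_samplers):
    # batch_samplers: the list built in __init__ above (one BatchSampler per bucket).
    for bucket_idx, batch_sampler in enumerate(batch_samplers):
        for batch in batch_sampler:   # batch is a list of indices into that bucket
            yield bucket_idx, batch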
Example No. 10
def get_dataloaders(train_batchsize, val_batchsize):
  kwargs={
    'num_workers': 20,
    'pin_memory': True
  }
  input_size = INFO['model-info']['input-size']
  base = '{}/{}'.format(os.environ['datadir-base'], INFO['dataset'])
  normalize = T.Normalize(mean=INFO['dataset-info']['normalization']['mean'], std=INFO['dataset-info']['normalization']['std'])
  transform = {
    'train': T.Compose([
      T.Resize(tuple([int(x*(4/3)) for x in input_size])), # upscale before cropping
      T.RandomResizedCrop(input_size), # random crop, then resize back to input_size
      T.RandomHorizontalFlip(0.5), # random horizontal flip
      T.RandomVerticalFlip(0.5), # random vertical flip
      T.RandomApply([T.RandomRotation(90)], 0.5), # random 90/270 degree rotation
      T.RandomApply([T.RandomRotation(180)], 0.25), # random 180 degree rotation
      T.RandomApply([T.ColorJitter(brightness=np.random.random()/5+0.9)], 0.5), # random brightness jitter
      T.RandomApply([T.ColorJitter(contrast=np.random.random()/5+0.9)], 0.5), # random contrast jitter
      T.RandomApply([T.ColorJitter(saturation=np.random.random()/5+0.9)], 0.5), # random saturation jitter
      T.ToTensor(),
      normalize
    ]), 
    'val': T.Compose([
      T.Resize(input_size), # resize to input_size
      T.ToTensor(),
      normalize
    ])
  }
  train_dset = dset.ImageFolder('{}/{}'.format(base, 'Train'), transform=transform['train'])
  train4val_dset = dset.ImageFolder('{}/{}'.format(base, 'Train'), transform=transform['val'])
  val_dset = dset.ImageFolder('{}/{}'.format(base, 'Val'), transform=transform['val'])

  labels = torch.from_numpy(np.array(train_dset.imgs)[:, 1].astype(int))
  num_of_images_by_class = torch.zeros(len(train_dset.classes))
  for i in range(len(train_dset.classes)):
    num_of_images_by_class[i] = torch.where(labels == i, torch.ones_like(labels), torch.zeros_like(labels)).sum().item()

  mapping = {}
  for c in train_dset.classes:
    if c in val_dset.classes:
      mapping[train_dset.class_to_idx[c]] = val_dset.class_to_idx[c]
    else:
      mapping[train_dset.class_to_idx[c]] = val_dset.class_to_idx['UNKNOWN']
  mapping[-1] = val_dset.class_to_idx['UNKNOWN']

  train_len = train_dset.__len__()
  val_len = val_dset.__len__()

  train_loader = DataLoader(train_dset, batch_size=train_batchsize, sampler=sampler.RandomSampler(range(train_len)), **kwargs)
  train4val_loader = DataLoader(train4val_dset, batch_size=val_batchsize, sampler=sampler.SequentialSampler(range(train_len)), **kwargs)
  val_loader = DataLoader(val_dset, batch_size=val_batchsize, sampler=sampler.SequentialSampler(range(val_len)), **kwargs)

  imgs = np.array(val_dset.imgs)

  return train_loader, train4val_loader, val_loader, num_of_images_by_class, mapping, imgs
Example No. 11
    def generate_sampler(dataset, sampler_option="random", n_bins=5):
        df = dataset.df

        weights = [1] * len(df) * dataset.elem_per_image

        if sampler_option == "random":
            return sampler.RandomSampler(weights)
        else:
            raise NotImplementedError(
                f"The option {sampler_option} for sampler on reconstruction task is not implemented"
            )
Example No. 12
    def setup_dataloader(self, mode):
        # Select dataset[mode] (train or test), then build the sampler and dataloader.
        dataset = self.dataset[mode]  # VOCDataset object
        data_loader = dataloader.DataLoader(
            dataset,
            batch_size=self.config.BATCH_SIZE[mode],
            sampler=sampler.RandomSampler(dataset),
            collate_fn=self.collate_fn)
        return data_loader
Example No. 13
def prepare_dataloader():
    """Make data loaders for train and dev"""
    global args
    logger.info('-' * 100)
    logger.info('Loading Datasets...')
    train_ex = utils.load_data(args.train_file)
    logger.info('{} train examples loaded'.format(len(train_ex)))
    test_ex = utils.load_data(args.test_file)
    logger.info('{} test examples loaded'.format(len(test_ex)))

    logger.info('Building feature dictionary...')
    feature_dict = utils.build_feature_dict(train_ex)
    logger.info('Num features = %d' % len(feature_dict))
    logger.info(feature_dict)

    logger.info('Build word dictionary...')
    word_dict = utils.build_word_dict(train_ex + test_ex)
    logger.info('Num words = %d' % len(word_dict))
    args.vocab_size = len(word_dict)

    logger.info('-' * 100)
    logger.info('Creating DataLoaders')
    if args.cuda:
        kwargs = {'num_workers': 0, 'pin_memory': True}
    else:
        kwargs = {'num_workers': args.data_workers}
    train_dataset = data.ImdbDataset(args, train_ex, word_dict, feature_dict)
    train_loader = DataLoader(train_dataset,
                              batch_size=args.batch_size,
                              sampler=sampler.RandomSampler(train_dataset),
                              **kwargs)
    test_dataset = data.ImdbDataset(args, test_ex, word_dict, feature_dict)
    test_loader = DataLoader(test_dataset,
                             batch_size=args.batch_size,
                             sampler=sampler.RandomSampler(test_dataset),
                             **kwargs)
    return train_loader, test_loader, word_dict, feature_dict
Example No. 14
def main():
    if torch.cuda.is_available():
        device = torch.device('cuda')
    else:
        device = torch.device('cpu')
    print(f'Using device {device}')

    model = models.resnet18(pretrained=True)
    model = model.to(device=device)
    num_ftrs = model.fc.in_features
    model.fc = torch.nn.Linear(num_ftrs, NUM_AGE_BUCKETS).to(device=device)
    model.load_state_dict(torch.load(MODEL_PATH))
    model.eval()

    transform = transforms.Compose([
        transforms.ToPILImage(),
        transforms.Resize((224, 224)),
        transforms.ToTensor(),
        transforms.Normalize([0.5797703, 0.43427974, 0.38307136],
                             [0.25409877, 0.22383073, 0.21819368]),
    ])
    dataset = ChaLearnDataset(
        ['ChaLearn/images/test_1', 'ChaLearn/images/test_2'],
        'ChaLearn/gt/test_gt.csv',
        transform,
    )
    loader = DataLoader(
        dataset,
        batch_size=BATCH_SIZE,
        num_workers=DATA_LOADER_NUM_WORKERS,
        sampler=sampler.RandomSampler(dataset),
    )

    # Test and write the results to a file.
    with torch.no_grad():
        with open(OUTPUT_FILE_NAME, 'w') as output:
            for x, file_names in loader:
                x = x.to(device=device)
                scores = model(x)
                num_classes = scores.size(1)
                predicted_ages = (
                    (F.softmax(scores, dim=1) *
                     torch.arange(end=num_classes).to(device=device)).sum(
                         dim=1))
                lines = [
                    f'{file_name},{age}\n'
                    for file_name, age in zip(file_names, predicted_ages)
                ]
                output.writelines(lines)
Example No. 15
    def __init__(self,
                 data_source,
                 shuffle=False,
                 batch_size=16,
                 drop_last=False,
                 resample_frequency=10):
        if shuffle:
            self.sampler = torch_sampler.RandomSampler(data_source)
        else:
            self.sampler = torch_sampler.SequentialSampler(data_source)
        self.shuffle = shuffle
        self.batch_size = batch_size
        self.drop_last = drop_last
        self.num_scales = len(data_source.multi_scale_inp_size)
        self.resample_frequency = resample_frequency
Example No. 16
def evaluate(model, test_dataset):
    model.eval()

    criterion = tnn.MSELoss()
    test_sampler = tsampler.RandomSampler(test_dataset)
    test_loader = tdata.DataLoader(test_dataset,
                                   batch_size=len(test_dataset),
                                   sampler=test_sampler)
    d = next(iter(test_loader))
    data_var = _make_variable(d[0], volatile=True)
    target_var = _make_variable(d[1], volatile=True)
    output_f, output_g = model.forward(data_var)
    loss = criterion(output_g, target_var)

    return loss
Example No. 17
    def __init__(self,
                 input_tensor,
                 input_lengths,
                 labels_tensor,
                 batch_size,
                 sequence_length=2665):
        self.input_tensor = input_tensor
        self.input_lengths = input_lengths
        self.labels_tensor = labels_tensor
        self.batch_size = batch_size
        self.sequence_length = sequence_length

        self.sampler = splr.BatchSampler(
            splr.RandomSampler(self.labels_tensor), self.batch_size, False)
        self.sampler_iter = iter(self.sampler)
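For reference, wrapping a RandomSampler in a BatchSampler as above yields lists of indices rather than single indices, so each next(self.sampler_iter) returns one batch worth of positions into labels_tensor. A minimal sketch with a length-10 tensor:

import torch
import torch.utils.data.sampler as splr

labels = torch.arange(10)
batches = splr.BatchSampler(splr.RandomSampler(labels), batch_size=4, drop_last=False)
print(list(batches))  # e.g. [[7, 2, 9, 0], [4, 1, 5, 3], [8, 6]]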
Example No. 18
def create_dataloader(config, data, mode):
    dataset = create_dataset(config, data, mode)
    if mode == 'train':
        # create Sampler
        if dist.is_available() and dist.is_initialized():
            train_RandomSampler = distributed.DistributedSampler(dataset)
        else:
            train_RandomSampler = sampler.RandomSampler(dataset, replacement=False)

        train_BatchSampler = sampler.BatchSampler(train_RandomSampler,
                                              batch_size=config.train.batch_size,
                                              drop_last=config.train.dataloader.drop_last)

        # Augment
        collator = get_collate_fn(config)

        # DataLoader
        data_loader = DataLoader(dataset=dataset,
                                batch_sampler=train_BatchSampler,
                                collate_fn=collator,
                                pin_memory=config.train.dataloader.pin_memory,
                                num_workers=config.train.dataloader.work_nums)

    elif mode == 'val':
        if dist.is_available() and dist.is_initialized():
            val_SequentialSampler = distributed.DistributedSampler(dataset)
        else:
            val_SequentialSampler = sampler.SequentialSampler(dataset)

        val_BatchSampler = sampler.BatchSampler(val_SequentialSampler,
                                                batch_size=config.val.batch_size,
                                                drop_last=config.val.dataloader.drop_last)
        data_loader = DataLoader(dataset,
                                batch_sampler=val_BatchSampler,
                                pin_memory=config.val.dataloader.pin_memory,
                                num_workers=config.val.dataloader.work_nums)
    else:
        if dist.is_available() and dist.is_initialized():
            test_SequentialSampler = distributed.DistributedSampler(dataset)
        else:
            test_SequentialSampler = None

        data_loader = DataLoader(dataset,
                                 sampler=test_SequentialSampler,
                                 batch_size=config.test.batch_size,
                                 pin_memory=config.val.dataloader.pin_memory,
                                 num_workers=config.val.dataloader.work_nums)
    return data_loader
Example No. 19
def generate_sampler(dataset, sampler_option='random', step=1):
    """
    Returns sampler according to the wanted options

    :param dataset: (MRIDataset) the dataset to sample from
    :param sampler_option: (str) choice of sampler
    :param step: (int) step to discretize ages and give a weight per class
    :return: (Sampler)
    """

    df = dataset.df
    min_age = np.min(df.age)
    max_age = np.max(df.age)

    if (max_age - min_age) % step == 0:
        max_age += step

    bins = np.arange(min_age, max_age, step)
    count = np.zeros(len(bins))
    for idx in df.index:
        age = df.loc[idx, "age"]
        key = np.argmax(np.logical_and(age - step < bins,
                                       age >= bins)).astype(int)
        count[key] += 1

    # weight_per_class = (1 / np.array(count)) if count.any() != 0 else 0.
    weight_per_class = np.zeros_like(count).astype(float)
    np.divide(1., count, out=weight_per_class, where=count != 0)
    weights = [0] * len(df)

    for idx, age in enumerate(df.age.values):
        key = np.argmax(np.logical_and(age - step < bins,
                                       age >= bins)).astype(int)
        weights[idx] = weight_per_class[key]

    weights = torch.FloatTensor(weights)

    if sampler_option == 'random':
        s = sampler.RandomSampler(dataset, replacement=False)
    elif sampler_option == 'weighted':
        s = sampler.WeightedRandomSampler(weights, len(weights))
    else:
        raise NotImplementedError(
            "The option %s for sampler is not implemented" % sampler_option)

    return s
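A small worked sketch of the bin-counting and inverse-frequency weighting used above, with made-up ages and step=5 (it mirrors the count / np.divide logic only, not the whole function):

import numpy as np

ages = np.array([62, 63, 70, 71, 72, 73])
step = 5
bins = np.arange(ages.min(), ages.max(), step)          # [62, 67, 72]
count = np.zeros(len(bins))
for age in ages:
    key = np.argmax(np.logical_and(age - step < bins, age >= bins))
    count[key] += 1                                      # count -> [2., 2., 2.]
weight_per_class = np.zeros_like(count)
np.divide(1., count, out=weight_per_class, where=count != 0)   # empty bins keep weight 0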
Example No. 20
def get_dataloader(dataset,
                   balance_data,
                   batch_size,
                   num_workers,
                   shuffle=True):
    if balance_data:
        weights = dataset.get_data_weights(balance_data)
        sampler_ = sampler.WeightedRandomSampler(weights, len(weights))
    elif shuffle:
        sampler_ = sampler.RandomSampler(dataset)
    else:
        sampler_ = None
    dataloader = torch.utils.data.DataLoader(dataset,
                                             batch_size=batch_size,
                                             sampler=sampler_,
                                             num_workers=num_workers)

    return dataloader
Example No. 21
    def __init__(self, opt, shared=None):
        opt['batch_sort'] = False
        super().__init__(opt, shared)
        self.use_batch_act = self.bsz > 1
        self.num_workers = opt['numworkers']
        self.batch_cache_type = opt.get('batch_sort_cache')
        # One can specify a collate function to use for preparing a batch
        self.opt = copy.deepcopy(opt)
        self.is_shared = shared is not None
        dataset_class, self.collate_fn = self.get_dataset_class(opt)
        opt['dataset_class'] = dataset_class
        opt['collate_fn'] = self.collate_fn

        if not shared:
            self.dataset = dataset_class(opt)
            if self.datatype == 'train' and not isinstance(
                    self.dataset, StreamDataset):
                data_sampler = sampler.RandomSampler(self.dataset)
            else:
                data_sampler = sampler.SequentialSampler(self.dataset)
            pin_memory = not isinstance(self.dataset, StreamDataset)
            self.pytorch_dataloader = DataLoader(
                self.dataset,
                batch_size=self.bsz,
                shuffle=False,
                sampler=data_sampler,
                num_workers=self.num_workers,
                collate_fn=self.collate_fn,
                pin_memory=pin_memory,
                drop_last=False,
            )
            self.lastYs = [None] * self.bsz
            if self.batch_cache_type != 'none':
                self.loader_process = LoaderProcess(opt)
                self.loader_process.start()
            self.data = enumerate(self.pytorch_dataloader)
        else:
            self.dataset = shared['dataset']
            self.pytorch_dataloader = shared['pytorch_dataloader']
            self.lastYs = shared['lastYs']
            self.data = shared['data']

        self.num_batches = math.ceil(self.dataset.num_episodes() / self.bsz)
        self.reset()
Example No. 22
    def create_dataloaders(self, train_labels, balance_weights: bool = True):
        '''create dataloaders based on split from StratifiedKFold'''

        sampler = (data_loaders.balanced_sampler(train_labels)
                   if balance_weights else None)
        train_loader = data_loaders.create_loader(self.train_data,
                                                  batch_size=self.batch_size,
                                                  sampler=sampler)

        if config.VALID_SIZE < 0.01:
            # use all data for training - no val loader
            val_loader = None
        else:
            val_sampler = samp.RandomSampler(self.val_data)
            val_loader = data_loaders.create_loader(self.val_data,
                                                    batch_size=100,
                                                    sampler=val_sampler)
            if config.SAVE_MODEL:
                data_loaders.save_valloader(self.val_data)

        dataloaders_dict = {"train": train_loader, "val": val_loader}
        return dataloaders_dict
Example No. 23
    def __init__(self,
                 storage,
                 sampler=None,
                 num_batches=10,
                 batch_size=None,
                 batch_size_bounds=None,
                 replacement=True,
                 verbose=0):
        """
        Initialize the storage sampler.

        Args:
            storage (Storage): storage sampler.
            sampler (Sampler, None): If None, it will use a sampler that randomly sample batches of the storage. It
                will by default sample :attr:`num_batches`.
            num_batches (int): number of batches.
            batch_size (int, None): size of the batch. If None, it will be computed based on the size of the storage,
                where batch_size = size(storage) // num_batches. Note that the batch size must be smaller than the size
                of the storage itself. The num_batches * batch_size can however be bigger than the storage size if
                :attr:`replacement = True`.
            batch_size_bounds (tuple of int, None): if :attr:`batch_size` is None, we can instead specify the lower
                and upper bounds for the `batch_size`. For instance, we can set it to `(16, 128)` along with
                `batch_size=None`; this will result to compute `batch_size = size(storage) // num_batches` but if this
                one is too small (<16), it will be set to 16, and if this one is too big (>128), it will be set to 128.
            replacement (bool): if we should sample each element only one time, or we can sample the same ones multiple
                times.
            verbose (int, bool): verbose level, select between {0, 1, 2}. If 0=False, it won't print anything. If
                1=True, it will print basic information about the sampler. If verbose=2, it will print detailed
                information.
        """
        # set the storage
        self.storage = storage

        # set variables
        self._num_batches = num_batches
        self._replacement = bool(replacement)
        self._batch_size_bounds = batch_size_bounds
        self._batch_size_given = batch_size is not None
        self._verbose = verbose

        # set the sampler
        if sampler is None:

            # check batch size and compute it if necessary
            if batch_size is None:
                batch_size = self.size // num_batches

            # check batch size bounds
            if isinstance(batch_size_bounds,
                          (tuple, list)) and len(batch_size_bounds) == 2:
                if batch_size < batch_size_bounds[0]:
                    batch_size = batch_size_bounds[0]
                elif batch_size > batch_size_bounds[1]:
                    batch_size = batch_size_bounds[1]

            # check the batch size * number of batches wrt the storage size
            if batch_size * num_batches > self.size and not self.replacement:
                raise ValueError(
                    "Expecting the batch size (={}) * num_batches (={}) to be smaller than the size of "
                    "the storage (={}), if we can not use replacement.".format(
                        batch_size, num_batches, self.size))

            # subsampler
            if replacement:
                subsampler = torch_sampler.RandomSampler(
                    data_source=range(self.size),
                    replacement=replacement,
                    num_samples=self.size)
            else:
                subsampler = torch_sampler.SubsetRandomSampler(
                    indices=range(self.size))

            # create sampler
            sampler = torch_sampler.BatchSampler(sampler=subsampler,
                                                 batch_size=batch_size,
                                                 drop_last=True)

        self.sampler = sampler

        if verbose:
            print(
                "\nCreating sampler with size: {} - num batches: {} - batch size: {}"
                .format(self.size, num_batches, self.batch_size))
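A worked example of the batch-size rule described in the docstring above (numbers are hypothetical): with a storage of size 1000, num_batches=10 and batch_size=None, the computed size is 1000 // 10 = 100; batch_size_bounds=(16, 64) then clamps it to 64, and because replacement=True it is fine that 64 * 10 < 1000 (each pass simply does not visit every element).

storage_size, num_batches = 1000, 10
batch_size = storage_size // num_batches      # 100
lo, hi = (16, 64)                             # batch_size_bounds
batch_size = min(max(batch_size, lo), hi)     # clamped to 64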
Example No. 24
def train_model(args):
    with open("tacotron/config.toml") as file:
        cfg = toml.load(file)

    tensorboard_path = Path("tensorboard") / args.checkpoint_dir
    checkpoint_dir = Path(args.checkpoint_dir)
    writer = SummaryWriter(tensorboard_path)

    tacotron = Tacotron(**cfg["model"]).cuda()
    optimizer = optim.Adam(tacotron.parameters(),
                           lr=cfg["train"]["optimizer"]["lr"])
    scaler = amp.GradScaler()
    scheduler = optim.lr_scheduler.MultiStepLR(
        optimizer=optimizer,
        milestones=cfg["train"]["scheduler"]["milestones"],
        gamma=cfg["train"]["scheduler"]["gamma"],
    )

    if args.resume is not None:
        global_step = load_checkpoint(
            tacotron=tacotron,
            optimizer=optimizer,
            scaler=scaler,
            scheduler=scheduler,
            load_path=args.resume,
        )
    else:
        global_step = 0

    root_path = Path(args.dataset_dir)
    text_path = Path(args.text_path)

    dataset = TTSDataset(root_path, text_path)
    sampler = samplers.RandomSampler(dataset)
    batch_sampler = BucketBatchSampler(
        sampler=sampler,
        batch_size=cfg["train"]["batch_size"],
        drop_last=True,
        sort_key=dataset.sort_key,
        bucket_size_multiplier=cfg["train"]["bucket_size_multiplier"],
    )
    collate_fn = partial(
        pad_collate,
        reduction_factor=cfg["model"]["decoder"]["reduction_factor"])
    loader = DataLoader(
        dataset,
        batch_sampler=batch_sampler,
        collate_fn=collate_fn,
        num_workers=cfg["train"]["n_workers"],
        pin_memory=True,
    )

    n_epochs = cfg["train"]["n_steps"] // len(loader) + 1
    start_epoch = global_step // len(loader) + 1

    for epoch in range(start_epoch, n_epochs + 1):
        average_loss = 0

        for i, (mels, texts, mel_lengths, text_lengths,
                attn_flag) in enumerate(tqdm(loader), 1):
            mels, texts = mels.cuda(), texts.cuda()

            optimizer.zero_grad()

            with amp.autocast():
                ys, alphas = tacotron(texts, mels)
                loss = F.l1_loss(ys, mels)

            scaler.scale(loss).backward()
            scaler.unscale_(optimizer)
            clip_grad_norm_(tacotron.parameters(),
                            cfg["train"]["clip_grad_norm"])
            scaler.step(optimizer)
            scaler.update()
            scheduler.step()

            global_step += 1

            average_loss += (loss.item() - average_loss) / i

            if global_step % cfg["train"]["checkpoint_interval"] == 0:
                save_checkpoint(
                    tacotron=tacotron,
                    optimizer=optimizer,
                    scaler=scaler,
                    scheduler=scheduler,
                    step=global_step,
                    checkpoint_dir=checkpoint_dir,
                )

            if attn_flag:
                index = attn_flag[0]
                alpha = alphas[
                    index, :text_lengths[index], :mel_lengths[index] // 2]
                alpha = alpha.detach().cpu().numpy()

                y = ys[index, :, :].detach().cpu().numpy()
                log_alignment(alpha, y, cfg["preprocess"], writer, global_step)

        writer.add_scalar("loss", average_loss, global_step)
        print(
            f"epoch {epoch} : loss {average_loss:.4f} : {scheduler.get_last_lr()}"
        )
Example No. 25
def train(model, name, train_dataset, test_dataset):
    train_sampler = tsampler.RandomSampler(train_dataset)

    train_loader = tdata.DataLoader(train_dataset,
                                    batch_size=len(train_dataset),
                                    sampler=train_sampler)

    optimizer = toptim.Adadelta(model.parameters(), weight_decay=0.001)
    last_loss = 1e10
    last_test_loss = 1e19

    epoch = 0
    while True:
        for i, d in enumerate(train_loader):
            model.train()
            if hasattr(model, 'before_epoch'):
                model.before_epoch(epoch)
            epoch += 1
            data_var = _make_variable(d[0])
            target_var = _make_variable(d[1]).resize(len(d[1]))

            criterion = tnn.MSELoss()
            optimizer.zero_grad()
            output_f, output_g = model.forward(data_var)
            # print('output=', output)
            # print('output shape:', output.shape, 'target_var_shape:', target_var.shape)
            # print(output_g, target_var)
            print(output_g.data.shape, ' target shape:', target_var.data.shape)
            if epoch % 10 == 0:
                print(((output_g - target_var)**2).sum() / len(output_g))
            loss = criterion(output_g, target_var)
            loss.backward()
            optimizer.step()
            print('Batch {} | loss {}'.format(epoch, loss.data[0]))
            if (loss.data[0] < model.LOSS_LIMIT
                    and loss.data[0] < last_loss) or epoch % 10 == 0:
                test_loss = evaluate(model, test_dataset)
                test_loss_num = test_loss.data[0]
                print('Evaluated test loss {}'.format(test_loss_num))
                if (test_loss_num < model.LOSS_LIMIT
                        and test_loss_num < last_test_loss) or epoch % 50 == 0:
                    last_test_loss = test_loss_num
                    os.makedirs('saved_model/' + name, exist_ok=True)
                    torch.save(
                        model.state_dict(),
                        os.path.join(
                            'saved_model', name,
                            '{}_loss_{}_test_{}_{}.t7'.format(
                                epoch, loss.data[0], test_loss_num,
                                datetime.datetime.now().strftime(
                                    "%I:%M%p on %B %d, %Y"))))
                    torch.save(
                        model,
                        os.path.join(
                            'saved_model', name,
                            'model_{}_loss_{}_test_{}_{}.t7'.format(
                                epoch, loss.data[0], test_loss_num,
                                datetime.datetime.now().strftime(
                                    "%I:%M%p on %B %d, %Y"))))

            last_loss = loss.data[0]
Example No. 26
    def train_sup(self, epoch_lim, data, valid_data, early_stopping_lim,
                  batch_size, num_workers, track_embeddings, validation_rate, loss_weight_base=1,
                  value_weight=0, value_ratio=0):
        """
        Training loop
        :param epoch_lim: total number of training epochs
        :param data: training data
        :param valid_data: validation data
        :param early_stopping_lim: Number of epochs to run without validation improvement before stopping
        if None, never stop early
        :param batch_size: training batch_size
        :param num_workers: number of CPU workers to use for data loading
        :param track_embeddings: Save out embedding information at end of run
        :param validation_rate: Check validation performance every validation_rate training epochs
        :param loss_weight_base: A constant between 0 and 1 used to interpolate between Single (=0) and Multi (=1) Step forecasting.
        :param value_weight: A constant multiplier for the real-value loss, set to 0 in the paper
        :param value_ratio: The proportion of loss used for the MSE loss term (as opposed for the cross-entropy loss), set to 0 in the paper
        :return loss array, model:
        """
        if early_stopping_lim is None:
            early_stopping_lim = epoch_lim
        train_sampler = sampler.RandomSampler(np.arange(len(data)))
        data_train = DataLoader(data,
                                batch_size=batch_size,
                                sampler=train_sampler,
                                drop_last=True)

        valid_sampler = sampler.SequentialSampler(np.arange(len(valid_data)))
        data_valid = DataLoader(valid_data,
                                batch_size=batch_size,
                                sampler=valid_sampler)
        step = 0

        bsf_loss = np.inf
        epochs_without_improvement = 0
        improvements = []
        for epoch in range(epoch_lim):
            if epochs_without_improvement > early_stopping_lim:
                print('Exceeded early stopping limit, stopping')
                break
            if epoch % validation_rate == 0:
                valid_loss = self.validation(data_valid=data_valid,
                                             step=step,
                                             data=data,
                                             loss_weight_base=loss_weight_base,
                                             value_weight=value_weight, value_ratio=value_ratio)
                (bsf_loss,
                 epochs_without_improvement,
                 improvements) = self.manage_early_stopping(bsf_loss=bsf_loss,
                                                            early_stopping_lim=early_stopping_lim,
                                                            epochs_without_improvement=epochs_without_improvement,
                                                            valid_loss=valid_loss, validation_rate=validation_rate,
                                                            improvements=improvements)
            running_train_loss = 0
            for inp, out, out_real, lens in tqdm(data_train):
                loss, y_p = forecast_model.get_loss(inp=inp,
                                                    out=out,
                                                    lens=lens,
                                                    cuda=True,
                                                    gn=self.model,
                                                    glucose_dat=data,
                                                    criterion=self.criterion,
                                                    base=loss_weight_base,
                                                    out_real=out_real,
                                                    value_weight=value_weight,
                                                    value_ratio=value_ratio)
                step += 1
                running_train_loss += loss.data.cpu().numpy()[0]
                self.optimizer.zero_grad()
                loss.backward()
                self.optimizer.step()
            running_train_loss = running_train_loss/len(data_train)
            self.writer.add_scalar(tag='train_loss',
                                   scalar_value=running_train_loss,
                                   global_step=step)
        torch.save(self.model.state_dict(), '{}/final_sup.pt'.format(self.model_dir))
        if track_embeddings:
            self.embed(data_valid, step, embed_batch=100)
        return improvements
Example No. 27
    def __init__(self, opt, shared=None):
        opt['batch_sort'] = False
        super().__init__(opt, shared)
        self.use_batch_act = self.bsz > 1
        self.num_workers = opt['numworkers']
        self.batch_sort = opt.get('pytorch_teacher_batch_sort') and \
            'train' in self.datatype
        self.batch_cache_type = opt.get('batch_sort_cache_type')
        self.batch_sort_field = opt.get('batch_sort_field')
        # One can specify a collate function to use for preparing a batch
        self.opt = opt.copy()
        self.is_shared = shared is not None
        dataset_classes = self.get_dataset_class(opt)
        self.ordered = ('ordered' in self.datatype or
                        ('stream' in self.datatype and not opt.get('shuffle')))
        if self.ordered:
            # force index for ordered, so that we see every example
            warn_once('\nNote: You are using PytorchDataTeacher with ordered '
                      'examples. Please specify `--shuffle` if you would like '
                      'to have examples loaded in randomized order.\n')
            self.batch_cache_type = 'index'

        if not shared:
            BatchSortCache.create()
            if len(dataset_classes) > 1:
                datasets = []
                for class_name, collate_fn, task_name in dataset_classes:
                    dataset_opt = opt.copy()
                    dataset_opt['pytorch_teacher_task'] = task_name
                    dataset_opt['task'] = task_name
                    datasets.append(class_name(dataset_opt))
                    self.collate_fn = collate_fn
                self.id = ','.join([d[2] for d in dataset_classes])
                self.dataset = ParlAIConcatDataset(datasets)
            else:
                class_name, self.collate_fn, task_name = dataset_classes[0]
                self.id = task_name
                self.dataset = class_name(opt)
            if self.ordered or not self.training:
                data_sampler = sampler.SequentialSampler(self.dataset)
            else:
                data_sampler = sampler.RandomSampler(self.dataset)

            self.pytorch_dataloader = DataLoader(
                self.dataset,
                batch_size=self.bsz,
                sampler=data_sampler,
                num_workers=self.num_workers,
                collate_fn=self.collate_fn,
                pin_memory=False,
                drop_last=False,
            )

            self.lastYs = [None] * self.bsz
            if self.batch_sort:
                self.loader_process = LoaderProcess(opt)
                self.loader_process.start()
            self.data = enumerate(self.pytorch_dataloader)
        else:
            self.dataset = shared['dataset']
            self.pytorch_dataloader = shared['pytorch_dataloader']
            self.lastYs = shared['lastYs']
            self.data = shared['data']
            self.id = shared['id']

        self.num_batches = math.ceil(self.dataset.num_episodes() / self.bsz)
        self.reset()
Example No. 28
                            num_channels=1))

    dataset = ConcatDataset(dataset) if len(dataset) > 1 else dataset[0]
    target_type = params['target_type'] if params['target_type'] != 'spatial_bootstrap' else 'psa'
    val_dataset = WSJ0(folder=params['validation_folder'],
                       length='full',
                       n_fft=params['n_fft'],
                       hop_length=params['hop_length'],
                       output_type=target_type,
                       create_cache=True, #params['create_cache'],
                       num_channels=1)

if args.sample_strategy == 'sequential':
    sample_strategy = sampler.SequentialSampler(dataset)
elif args.sample_strategy == 'random':
    sample_strategy = sampler.RandomSampler(dataset)

dataloader = DataLoader(dataset,
                            batch_size=params['batch_size'],
                            num_workers=params['num_workers'],
                            sampler=sample_strategy)

dummy_input, _, _, _, _, dummy_one_hot = dataset[0]

params['num_attractors'] = dummy_one_hot.shape[-1]
params['num_sources'] = params['num_attractors']
params['sample_rate'] = dataset.sr
dataset.reorder_sources = args.reorder_sources
val_dataset.reorder_sources = args.reorder_sources

pp.pprint(params)
Example No. 29
    def __init__(self, dataset_path=const.DATASET_PATH, train=True, test=False, load=None,\
                    num_classes=const.NUM_JOINTS, backbone_name=const.BACKBONE_NAME, backbone_pre_trained=const.PRE_TRAINED,\
                    target_size=const.TARGET_SIZE, stride=const.STRIDE, p_h=None, p_w=None, spacial_factor=const.SPACIAL_FACTOR,\
                    lr=const.LR_RATE, w_d=const.WEIGHT_DECAY, step_size=const.STEP_SIZE, gamma=const.GAMMA, reg_loss_fac=const.REG_LOSS_FACTOR,\
                    bs=const.BATCH_SIZE, max_epoch=const.MAX_EPOCH, save_path=const.SAVE_PATH, save_freq=const.SAVE_FREQ, train_split=const.TRAIN_VAL_SPLIT):

        print("Setting up model...")
        self.dataset_path = dataset_path
        self.save_path = save_path

        self.num_classes = num_classes
        self.backbone_name = backbone_name

        self.train = train
        self.test = test

        self.max_epoch = max_epoch
        self.save_freq = save_freq

        self.model = A2J(num_joints=num_classes,
                         backbone_name=backbone_name,
                         backbone_pretrained=backbone_pre_trained)
        self.reg_loss_factor = reg_loss_fac

        self.post_precess = PostProcess(shape=(target_size[1] // 16,
                                               target_size[0] // 16),
                                        stride=stride,
                                        p_h=p_h,
                                        p_w=p_w)
        self.optim = torch.optim.Adam(self.model.parameters(),
                                      lr=lr,
                                      weight_decay=w_d)
        self.sched = torch.optim.lr_scheduler.StepLR(self.optim,
                                                     step_size=step_size,
                                                     gamma=gamma)
        self.criterion = A2JLoss(shape=(target_size[1]//16, target_size[0]//16), stride=stride,\
                                                    spacial_factor=spacial_factor, p_h=p_h, p_w=p_w)
        if load:
            print(f"Loading model...{load}")
            check_point = torch.load(load, map_location=torch.device('cpu'))
            self.num_class = check_point["num_classes"]
            self.model = A2J(num_joints=num_classes,
                             backbone_name=backbone_name,
                             backbone_pretrained=backbone_pre_trained)
            self.model.load_state_dict(check_point["model"])
            self.optim.load_state_dict(check_point["optim"])
            self.sched.load_state_dict(check_point["sched"])
            self.epoch = check_point["epoch"] + 1
        else:
            self.epoch = 0

        self.train_data = A2J_NYU_DataLoader(train=True,
                                             dataset_path=self.dataset_path)
        self.valid_data = A2J_NYU_DataLoader(train=False,
                                             dataset_path=self.dataset_path)
        self.test_data = A2J_NYU_DataLoader(train=False,
                                            dataset_path=self.dataset_path)

        self.load_train = DataLoader(
            self.train_data,
            batch_size=bs,
            sampler=sampler.RandomSampler(self.train_data),
            drop_last=False,
            num_workers=8,
        )

        self.load_valid = DataLoader(
            self.valid_data,
            batch_size=bs // 4,
            sampler=sampler.RandomSampler(self.valid_data),
            drop_last=False,
            num_workers=8,
        )

        self.load_test = DataLoader(
            self.test_data,
            batch_size=bs // 8,
            sampler=sampler.RandomSampler(self.test_data),
            drop_last=False,
            num_workers=8,
        )
        print("Model setup finished!")
Example No. 30
def train_model(model,
                train,
                val,
                learning_rate,
                batch_size=16,
                epochs=1,
                dtype=torch.float32,
                device=DEFAULT_DEVICE,
                verbose=False):

    model.cuda(device)

    _, nonempty_train = split_empty_nonempty(train)
    _, nonempty_val = split_empty_nonempty(val)

    if verbose:
        print(
            f'{len(nonempty_train)}/{len(train)} training examples are non-empty'
        )
        print(
            f'{len(nonempty_val)}/{len(val)} validation examples are non-empty'
        )

    nonempty_train_ds = Subset(train, nonempty_train)
    nonempty_val_ds = Subset(val, nonempty_val)

    train_loader = DataLoader(nonempty_train_ds,
                              batch_size=batch_size,
                              sampler=sampler.RandomSampler(
                                  nonempty_train_ds, True, len(train)))

    for l in model:
        if hasattr(l, '_should_init'):
            nn.init.kaiming_normal_(l.weight)

    optimizer = optim.Adam(model.parameters(), lr=learning_rate)
    loss_history, train_history, val_history = [], empty_results_history(
        3), empty_results_history(3)

    it = 0

    for e in range(epochs):
        if verbose:
            print(f'Epoch {e+1}')

        n_iters = (len(train) + batch_size - 1) // batch_size

        for t, (x, y) in enumerate(train_loader):
            model.train()

            x = x.to(device=device, dtype=dtype)
            y = y.to(device=device, dtype=torch.long)

            scores = model(x)

            loss = F.cross_entropy(scores, y)

            optimizer.zero_grad()
            loss.backward()
            optimizer.step()

            loss_history.append(loss.item())

            if it % 100 == 0 or (e + 1 == epochs and t + 1 == n_iters):
                train_result = evaluate_model(model, nonempty_train_ds,
                                              [0, 1, 2], 1000)
                record_results(train_history, train_result)

                val_result = evaluate_model(model, nonempty_val_ds, [0, 1, 2],
                                            1000)
                record_results(val_history, val_result)

                if verbose:
                    print(
                        f'Iteration {t}, loss={loss.item()}, ' +
                        f'prec={format_metric(train_result.precision)}/{format_metric(val_result.precision)}, '
                        +
                        f'recall={format_metric(train_result.recall)}/{format_metric(val_result.recall)}, '
                        +
                        f'acc={format_metric(train_result.accuracy)}/{format_metric(val_result.accuracy)}, '
                        +
                        f'f1={format_metric(train_result.f1)}/{format_metric(val_result.f1)}, '
                        +
                        f'kappa={format_metric(train_result.kappa)}/{format_metric(val_result.kappa)}, '
                    )

            it += 1

    return loss_history, train_history, val_history, nonempty_train