Example #1
    def split_train_test(self, data_files, limit=None):
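        # The first `test_ratio` share of each file becomes test data;
        # the remainder (offset by test_len) becomes training data.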
        test = []
        train = []
        for f in data_files:
            if self.format == 'csv':
                temp = ChessDataset(f)
            else:
                temp = LMDBChessDataset(f)
            if limit:
                test_len = round(limit * self.test_ratio)
            else:
                test_len = round(len(temp) * self.test_ratio)
            del temp
            if self.format == 'csv':
                test.append(ChessDataset(f, limit=test_len))
                train.append(ChessDataset(f, limit=limit, offset=test_len))
            elif self.format == 'lmdb':
                test.append(LMDBChessDataset(f, limit=test_len))
                train.append(LMDBChessDataset(f, limit=limit, offset=test_len))

        if len(train) == 1:
            train_dataset = train[0]
        elif len(train) == 2:
            train_dataset = InterleavenDataset(train)
        else:
            train_dataset = data.ConcatDataset(train)

        train_loader = data.DataLoader(
            train_dataset,
            batch_size=self.batch_size,
            num_workers=4,
            shuffle=(self.format == 'csv'))
        test_loader = data.DataLoader(
            data.ConcatDataset(test),
            batch_size=self.batch_size)
        return train_loader, test_loader
Example #2
def load_mnist_plus_fmnist(args, **kwargs):
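    # Merge MNIST and FashionMNIST (FashionMNIST labels shifted by 10) into combined 20-class loaders.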
    args.input_size = [1, 28, 28]
    args.input_type = 'gray'
    args.dynamic_binarization = False

    import torch.utils.data as data_utils
    
    train_loader, val_loader, test_loader, _ = load_dynamic_mnist(args)
    train_loader2, val_loader2, test_loader2, _ = load_fashion_mnist(args, label_offset=10)

    train_dataset = data_utils.ConcatDataset([train_loader.dataset, train_loader2.dataset]) 
    val_dataset = data_utils.ConcatDataset([val_loader.dataset, val_loader2.dataset]) 
    test_dataset = data_utils.ConcatDataset([test_loader.dataset, test_loader2.dataset]) 

    shuffle = True
    kwargs = {'num_workers': 1, 'pin_memory': True} if args.cuda else {}
    
    train_loader = data_utils.DataLoader(train_dataset, batch_size=args.batch_size,
                                         shuffle=shuffle, **kwargs)
    val_loader = data_utils.DataLoader(val_dataset, batch_size=args.batch_size,
                                         shuffle=shuffle, **kwargs)
    test_loader = data_utils.DataLoader(test_dataset, batch_size=args.batch_size,
                                         shuffle=shuffle, **kwargs)

    return train_loader, val_loader, test_loader, args
Example #3
def prepare_data_seq(task, batch_size=100, shuffle=True):
    path_train = 'data/chitchat/data_mixup/train_'
    path_dev = 'data/chitchat/data_mixup/dev_'
    # provide the data mixup rates

    dict_file_path = "data/chitchat/vocab_pool/vocab_filter1000_18394.txt"
    lang = Lang(dict_file_path)

    train_datasets = []
    dev_datasets = []
    for file_name, weight in SRC_WEIGHTS.items():
        train_file = path_train + file_name
        dev_file = path_dev + file_name
        train_sub_dat = SubDataset(lang, train_file, weight, max_len)
        dev_sub_dat = SubDataset(lang, dev_file, weight, max_len)
        train_datasets.append(train_sub_dat)
        dev_datasets.append(dev_sub_dat)

    train_datasets = data.ConcatDataset(train_datasets)
    dev_datasets = data.ConcatDataset(dev_datasets)

    train = get_seq(train_datasets, lang, batch_size, True, max_len)
    dev = get_seq(dev_datasets, lang, batch_size, False, max_len)

    test = None

    logging.info("Read %s sentence pairs train" % train.__len__())
    logging.info("Read %s sentence pairs dev" % dev.__len__())
    logging.info("Read %s sentence pairs test" % 0)
    logging.info("Max len Input %s " % max_len)
    logging.info("Vocab_size %s " % lang.n_words)
    logging.info("USE_CUDA={}".format(USE_CUDA))

    return train, dev, test, [], lang, max_len, max_r
Example #4
def dataset_augmentation(dataset_old, scale = 10):
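    # Duplicates the dataset `scale` times: each pass re-splits the original into two
    # random halves and concatenates both back in, so len(result) == scale * len(dataset_old).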
    tot_len = len(dataset_old)
    print(tot_len)
    dataset = data_utils.ConcatDataset([dataset_old])
    for i in range(scale - 1):
        datasets = data_utils.random_split(dataset_old, [int(tot_len/2), tot_len - int(tot_len/2)])
        dataset = data_utils.ConcatDataset([dataset] + datasets)
    print(len(dataset))
    return dataset
Example #5
def load_partition_data_federated_emnist(dataset,
                                         data_dir,
                                         batch_size=DEFAULT_BATCH_SIZE):

    # client ids
    train_file_path = os.path.join(data_dir, DEFAULT_TRAIN_FILE)
    test_file_path = os.path.join(data_dir, DEFAULT_TEST_FILE)
    print(train_file_path)
    with h5py.File(train_file_path,
                   'r') as train_h5, h5py.File(test_file_path, 'r') as test_h5:
        global client_ids_train, client_ids_test
        client_ids_train = list(train_h5[_EXAMPLE].keys())
        client_ids_test = list(test_h5[_EXAMPLE].keys())

    # local dataset
    data_local_num_dict = dict()
    train_data_local_dict = dict()
    test_data_local_dict = dict()

    for client_idx in range(DEFAULT_TRAIN_CLIENTS_NUM):
        train_data_local, test_data_local = get_dataloader(
            dataset, data_dir, batch_size, batch_size, client_idx)
        local_data_num = len(train_data_local) + len(test_data_local)
        data_local_num_dict[client_idx] = local_data_num
        # logging.info("client_idx = %d, local_sample_number = %d" % (client_idx, local_data_num))
        # logging.info("client_idx = %d, batch_num_train_local = %d, batch_num_test_local = %d" % (
        #     client_idx, len(train_data_local), len(test_data_local)))
        train_data_local_dict[client_idx] = train_data_local
        test_data_local_dict[client_idx] = test_data_local

    # global dataset
    train_data_global = data.DataLoader(data.ConcatDataset(
        list(dl.dataset for dl in list(train_data_local_dict.values()))),
                                        batch_size=batch_size,
                                        shuffle=True)
    train_data_num = len(train_data_global.dataset)

    test_data_global = data.DataLoader(data.ConcatDataset(
        list(dl.dataset for dl in list(test_data_local_dict.values())
             if dl is not None)),
                                       batch_size=batch_size,
                                       shuffle=True)
    test_data_num = len(test_data_global.dataset)

    # class number
    train_file_path = os.path.join(data_dir, DEFAULT_TRAIN_FILE)
    with h5py.File(train_file_path, 'r') as train_h5:
        class_num = len(
            np.unique([
                train_h5[_EXAMPLE][client_ids_train[idx]][_LABEL][0]
                for idx in range(DEFAULT_TRAIN_CLIENTS_NUM)
            ]))
        logging.info("class_num = %d" % class_num)

    return DEFAULT_TRAIN_CLIENTS_NUM, train_data_num, test_data_num, train_data_global, test_data_global, \
           data_local_num_dict, train_data_local_dict, test_data_local_dict, class_num
Example #6
 def forward(self, x):
     x = F.relu(self.conv1_1(x))  # conv1_1 & relu1_1
     x = F.relu(self.conv1_2(x))  # conv1_2 & relu1_2
     x = F.max_pool2d(x, 2)  # pool1
     x = F.relu(self.conv2_1(x))  # conv2_1 & relu2_1
     x = F.relu(self.conv2_2(x))  # conv2_2 & relu2_2
     x = F.max_pool2d(x, 2)  # pool2
     x = F.relu(self.conv3_1(x))  # relu3_1
     x = F.relu(self.conv3_2(x))  # relu3_2
     x = F.relu(self.conv3_3(x))  # relu3_3
     x = F.max_pool2d(x, 2)  # pool3
     x = F.relu(self.conv4_1(x))  # relu4_1
     x = F.relu(self.conv4_2(x))  # relu4_2
     x = F.relu(self.conv4_3(x))  # relu4_3
     x = F.max_pool2d(x, 2)  # pool4
     # conv4_3_norm=F.batch_norm(x,)#conv4_3_norm
     x = F.relu(self.conv5_1(x))  # relu5_1
     x = F.relu(self.conv5_2(x))  # relu5_2
     x = F.relu(self.conv5_3(x))  # relu5_3
     x = F.max_pool2d(x, 3, stride=1, padding=1)  # pool5
     x = F.relu(self.conv_fc6(x))  # relu6
     x = F.relu(self.conv_fc7(x))  # relu7
     x = F.relu(self.conv6_1(x))  # relu6_1
     x = F.relu(self.conv6_2(x))  # relu6_2
     x = F.relu(self.conv7_1(x))  # relu7_1
     # conv6_2_conf
     conv6_2_conf = self.conv6_2_conf(x).permute(2, 0, 1)
     conv6_2_conf = conv6_2_conf.view(-1, len(conv6_2_conf))
     # conv6_2_loc
     conv6_2_loc = self.conv6_2_loc(x).permute(2, 0, 1)
     conv6_2_loc = conv6_2_loc.view(-1, len(conv6_2_loc))
     x = F.relu(self.conv7_2(x))  # relu7_2
     # conv7_2_conf
     conv7_2_conf = self.conv7_2_conf(x).permute(2, 0, 1)
     conv7_2_conf = conv7_2_conf.view(-1, len(conv7_2_conf))
     # conv7_2_loc
     conv7_2_loc = self.conv7_2_loc(x).permute(2, 0, 1)
     conv7_2_loc = conv7_2_loc.view(-1, len(conv7_2_loc))
     x = F.relu(self.conv8_1(x))  # relu8_1
     x = F.relu(self.conv8_2(x))  # relu8_2
     # conv8_2_conf
     conv8_2_conf = self.conv8_2_conf(x).permute(2, 0, 1)
     conv8_2_conf = conv8_2_conf.view(-1, len(conv8_2_conf))
     # conv8_2_loc
     conv8_2_loc = self.conv8_2_loc(x).permute(2, 0, 1)
     conv8_2_loc = conv8_2_loc.view(-1, len(conv8_2_loc))
     x = F.relu(self.conv9_1(x))  # relu9_1
     x = F.relu(self.conv9_2(x))  # relu9_2
     # conv9_2_conf
     conv9_2_conf = self.conv9_2_conf(x).permute(2, 0, 1)
     conv9_2_conf = conv9_2_conf.view(-1, len(conv9_2_conf))
     # conv9_2_loc
     conv9_2_loc = self.conv9_2_loc(x).permute(2, 0, 1)
     conv9_2_loc = conv9_2_loc.view(-1, len(conv9_2_loc))
     # torch.cat (tensor concatenation), not a Dataset wrapper, is what actually joins the box predictions
     loc = torch.cat([conv6_2_loc, conv7_2_loc, conv8_2_loc, conv9_2_loc], 0)  # Concat mbox_loc
     conf = torch.cat([conv6_2_conf, conv7_2_conf, conv8_2_conf, conv9_2_conf], 0)  # Concat mbox_conf
Example #7
def get_concat_dataloader(data_root, batch_size=64, download=False):
    transforms_train = transforms.Compose([
        transforms.Resize(size=224),
        transforms.RandomCrop(size=(224, 224)),
        transforms.RandomHorizontalFlip(),
        transforms.ToTensor(),
        transforms.Normalize(mean=[0.485, 0.456, 0.406],
                             std=[0.229, 0.224, 0.225])
    ])
    transforms_val = transforms.Compose([
        transforms.Resize(size=224),
        transforms.CenterCrop(size=(224, 224)),
        transforms.ToTensor(),
        transforms.Normalize(mean=[0.485, 0.456, 0.406],
                             std=[0.229, 0.224, 0.225])
    ])

    cub_root = os.path.join(data_root, 'cub200')
    train_cub = CUB200(root=cub_root,
                       split='train',
                       transforms=transforms_train,
                       download=download,
                       offset=0)
    val_cub = CUB200(root=cub_root,
                     split='test',
                     transforms=transforms_val,
                     download=False,
                     offset=0)
    dogs_root = os.path.join(data_root, 'dogs')
    train_dogs = StanfordDogs(root=dogs_root,
                              split='train',
                              transforms=transforms_train,
                              download=download,
                              offset=200)
    val_dogs = StanfordDogs(root=dogs_root,
                            split='test',
                            transforms=transforms_val,
                            download=False,
                            offset=200)  # add offset
    train_dst = data.ConcatDataset([train_cub, train_dogs])
    val_dst = data.ConcatDataset([val_cub, val_dogs])

    train_loader = data.DataLoader(train_dst,
                                   batch_size=batch_size,
                                   drop_last=True,
                                   shuffle=True,
                                   num_workers=4)
    val_loader = data.DataLoader(val_dst,
                                 batch_size=batch_size,
                                 drop_last=True,
                                 shuffle=False,
                                 num_workers=4)
    return train_loader, val_loader
Example #8
 def setup(self, stage):
     self.train_data = data.ConcatDataset([
         torchaudio.datasets.LIBRISPEECH(self.hparams.data_root,
                                         url=path,
                                         download=True)
         for path in self.hparams.data_train
     ])
     self.test_data = data.ConcatDataset([
         torchaudio.datasets.LIBRISPEECH(self.hparams.data_root,
                                         url=path,
                                         download=True)
         for path in self.hparams.data_test
     ])
Example #9
def load_partition_data_federated_cifar100(data_dir, batch_size=DEFAULT_BATCH_SIZE):
    class_num = 100

    # client id list
    train_file_path = os.path.join(data_dir, DEFAULT_TRAIN_FILE)
    test_file_path = os.path.join(data_dir, DEFAULT_TEST_FILE)
    with h5py.File(train_file_path, 'r') as train_h5, h5py.File(test_file_path, 'r') as test_h5:
        global client_ids_train, client_ids_test
        client_ids_train = list(train_h5[_EXAMPLE].keys())
        client_ids_test = list(test_h5[_EXAMPLE].keys())

    # get local dataset
    data_local_num_dict = dict()
    train_data_local_dict = dict()
    test_data_local_dict = dict()

    for client_idx in range(DEFAULT_TRAIN_CLINETS_NUM):
        train_data_local, test_data_local = get_dataloader(
            data_dir, batch_size, batch_size, client_idx)
        local_data_num = len(train_data_local.dataset)
        data_local_num_dict[client_idx] = local_data_num
#        logging.info("client_idx = %d, local_sample_number = %d" % (client_idx, local_data_num))
#        logging.info("client_idx = %d, batch_num_train_local = %d" % (client_idx, len(train_data_local)))
        train_data_local_dict[client_idx] = train_data_local
        test_data_local_dict[client_idx] = test_data_local

    # global dataset
#    train_data_global = data.DataLoader(
#       data.ConcatDataset(
#            list(dl.dataset for dl in list(train_data_local_dict.values()))
#        ),
#        batch_size=batch_size, shuffle=True)
#    train_data_num = len(train_data_global.dataset)

#    test_data_global = data.DataLoader(
#        data.ConcatDataset(
#            list(dl.dataset for dl in list(test_data_local_dict.values()) if dl is not None)
#        ),
#        batch_size=batch_size, shuffle=True)
#    test_data_num = len(test_data_global.dataset)
    train_data_global = data.ConcatDataset(
            list(dl.dataset for dl in list(train_data_local_dict.values()))
        )
    test_data_global = data.ConcatDataset(
            list(dl.dataset for dl in list(test_data_local_dict.values()) if dl is not None))

    # DEFAULT_TRAIN_CLINETS_NUM = 500, train_data_num = 50000, test_data_num = 10000
    # train_data_global = dataloader over the 50k training samples, test_data_global = dataloader over the 10k test samples
    #   -> use train_data_global.dataset when the underlying dataset is needed
    # data_local_num_dict = dict of per-client sample counts, train_data_local_dict = dict of per-client dataloaders,

    return train_data_global, test_data_global, train_data_local_dict,
Example #10
def get_dataloader(dataset, data_dir, train_bs, test_bs, client_idx=None):

    if client_idx is None:
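        # No client requested: build global train/test loaders over every client's data.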

        train_dl = data.DataLoader(data.ConcatDataset(
            StackOverflowDataset(
                os.path.join(data_dir, DEFAULT_TRAIN_FILE), client_idx,
                "train", {
                    "input": lambda x: utils.preprocess_input(x, data_dir),
                    "target": lambda y: utils.preprocess_target(y, data_dir)
                }) for client_idx in range(DEFAULT_TRAIN_CLINETS_NUM)),
                                   batch_size=train_bs,
                                   shuffle=True)

        test_dl = data.DataLoader(data.ConcatDataset(
            StackOverflowDataset(
                os.path.join(data_dir, DEFAULT_TEST_FILE), client_idx, "test",
                {
                    "input": lambda x: utils.preprocess_input(x, data_dir),
                    "target": lambda y: utils.preprocess_target(y, data_dir)
                }) for client_idx in range(DEFAULT_TEST_CLINETS_NUM)),
                                  batch_size=test_bs,
                                  shuffle=True)
        return train_dl, test_dl

    else:
        train_ds = StackOverflowDataset(
            os.path.join(data_dir, DEFAULT_TRAIN_FILE), client_idx, "train", {
                "input": lambda x: utils.preprocess_input(x, data_dir),
                "target": lambda y: utils.preprocess_target(y, data_dir)
            })
        train_dl = data.DataLoader(dataset=train_ds,
                                   batch_size=train_bs,
                                   shuffle=True,
                                   drop_last=False)

        if client_idx >= DEFAULT_TEST_CLIENTS_NUM:
            test_dl = None
        else:
            test_ds = StackOverflowDataset(
                os.path.join(data_dir, DEFAULT_TEST_FILE), client_idx, "test",
                {
                    "input": lambda x: utils.preprocess_input(x, data_dir),
                    "target": lambda y: utils.preprocess_target(y, data_dir)
                })
            test_dl = data.DataLoader(dataset=test_ds,
                                      batch_size=test_bs,
                                      shuffle=True,
                                      drop_last=False)

        return train_dl, test_dl
Example #11
def load_partition_data_federated_shakespeare(dataset,
                                              data_dir,
                                              batch_size=DEFAULT_BATCH_SIZE):

    #client id list
    train_file_path = os.path.join(data_dir, DEFAULT_TRAIN_FILE)
    test_file_path = os.path.join(data_dir, DEFAULT_TEST_FILE)
    with h5py.File(train_file_path,
                   'r') as train_h5, h5py.File(test_file_path, 'r') as test_h5:
        global client_ids_train, client_ids_test
        client_ids_train = list(train_h5[_EXAMPLE].keys())
        client_ids_test = list(test_h5[_EXAMPLE].keys())

    # get local dataset
    data_local_num_dict = dict()
    train_data_local_dict = dict()
    test_data_local_dict = dict()

    for client_idx in range(DEFAULT_TRAIN_CLIENTS_NUM):
        train_data_local, test_data_local = get_dataloader(
            dataset, data_dir, batch_size, batch_size, client_idx)
        local_data_num = len(train_data_local.dataset)
        data_local_num_dict[client_idx] = local_data_num
        logging.info("client_idx = %d, local_sample_number = %d" %
                     (client_idx, local_data_num))
        logging.info(
            "client_idx = %d, batch_num_train_local = %d, batch_num_test_local = %d"
            % (client_idx, len(train_data_local), len(test_data_local)))
        train_data_local_dict[client_idx] = train_data_local
        test_data_local_dict[client_idx] = test_data_local

    # global dataset
    train_data_global = data.DataLoader(data.ConcatDataset(
        list(dl.dataset for dl in list(train_data_local_dict.values()))),
                                        batch_size=batch_size,
                                        shuffle=True)
    train_data_num = len(train_data_global.dataset)

    test_data_global = data.DataLoader(data.ConcatDataset(
        list(dl.dataset for dl in list(test_data_local_dict.values())
             if dl is not None)),
                                       batch_size=batch_size,
                                       shuffle=True)
    test_data_num = len(test_data_global.dataset)

    VOCAB_LEN = len(utils.get_word_dict()) + 1
    return DEFAULT_TRAIN_CLIENTS_NUM, train_data_num, test_data_num, train_data_global, test_data_global, \
        data_local_num_dict, train_data_local_dict, test_data_local_dict, VOCAB_LEN
Example #12
    def __init__(self, dataset_folder, fields, split=None,
                 classes=None, no_except=True, transform=None):
        # Read metadata file
        metadata_file = os.path.join(dataset_folder, 'metadata.yaml')
        if os.path.exists(metadata_file):
            with open(metadata_file, 'r') as f:
                metadata = yaml.load(f, Loader=yaml.FullLoader)
        else:
            metadata = {}

        # If classes is None, use all subfolders
        if classes is None:
            classes = os.listdir(dataset_folder)
            classes = [c for c in classes
                       if os.path.isdir(os.path.join(dataset_folder, c))]

        # Get all sub-datasets
        self.datasets_classes = []
        for c in classes:
            subpath = os.path.join(dataset_folder, c)
            if not os.path.isdir(subpath):
                logger.warning('Class %s does not exist in dataset.' % c)

            metadata_c = metadata.get(c, {'id': c, 'name': 'n/a'})
            dataset = Shapes3dClassDataset(subpath, fields, split,
                                           metadata_c, no_except,
                                           transform=transform)
            self.datasets_classes.append(dataset)

        self._concat_dataset = data.ConcatDataset(self.datasets_classes)
Example #13
 def build_dataset_from_category(self, category, typ, max_elements=None):
     """Build a dataset for all modules in a category"""
     print(f"adding category {category}/../{typ}")
     ds = self._build_datasets_from_category(
         category, typ, max_elements=max_elements
     )
     return data.ConcatDataset(ds)
Example #14
def get_training_dataloader(parameters, include_field, num_fields_list):
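    # Load one TensorDataset per field count and concatenate them into a single training set.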
    batch_size = parameters['batch_size']
    dataset_list = []
    num_examples = 0

    for current_field in num_fields_list:
        X_name, y_name = '_X_train_', '_y_train_'
        prefix = 'field_' + str(current_field)
        X = np.load(saves_directory + '/' + prefix + X_name +
                    str(num_datapoints) + '.npy')
        y = np.load(saves_directory + '/' + prefix + y_name +
                    str(num_datapoints) + '.npy')
        if include_field == 'd':
            X = X[:, dataset_indices]
        elif include_field == 'f':
            X = X[:, field_indices]
        num_examples += X.shape[0]
        parameters['input_dim'] = X.shape[1]
        X, y = torch.from_numpy(X).float(), torch.from_numpy(y)
        dataset_list.append(data.TensorDataset(X, y))

    print('loaded training dimensions are (' + str(num_examples) + ',' +
          str(parameters['input_dim']) + ')')
    dataset = data.ConcatDataset(dataset_list)
    return data.DataLoader(dataset,
                           batch_size=batch_size,
                           shuffle=True,
                           num_workers=0)
Example #15
    def __init__(
        self,
        tickers: Tuple[str, ...],
        end: pd.Timestamp,
        params: PhenotypeData,
        params_type: Type[features.DataParams],
    ):
        """Формирует загрузчики данных для обучения, валидации, тестирования и прогнозирования для
        заданных тикеров и конечной даты на основе словаря с параметрами.

        :param tickers:
            Перечень тикеров, для которых будет строится модель.
        :param end:
            Конец диапазона дат статистики, которые будут использоваться для
            построения модели.
        :param params:
            Словарь с параметрами для построения признаков и других элементов модели.
        :param params_type:
            Тип формируемых признаков.
        """
        params = params_type(tickers, end, params)
        data_sets = [OneTickerDataset(ticker, params) for ticker in tickers]
        super().__init__(
            dataset=data.ConcatDataset(data_sets),
            batch_size=params.batch_size,
            shuffle=params.shuffle,
            drop_last=False,
            num_workers=0,  # loading in a separate thread - increasing the number of workers does not help
        )
        self._features_description = data_sets[0].features_description
        self._history_days = params.history_days
Example #16
 def build_dataset_from_modules(self, category, modules, typ, max_elements=None):
     """Build a dataset from several modules in a category"""
     ds = []
     for module in modules:
         self.dfs[category][module][typ].set_max_elements(max_elements)
         ds.append(self.dfs[category][module][typ])
     return data.ConcatDataset(ds)
Example #17
def get_detection_dataset_dicts(names,
                                filter_empty=True,
                                min_keypoints=0,
                                proposal_files=None):
    """
    Load and prepare dataset dicts for instance detection/segmentation and semantic segmentation.

    Args:
        names (str or list[str]): a dataset name or a list of dataset names
        filter_empty (bool): whether to filter out images without instance annotations
        min_keypoints (int): filter out images with fewer keypoints than
            `min_keypoints`. Set to 0 to do nothing.
        proposal_files (list[str]): if given, a list of object proposal files
            that match each dataset in `names`.

    Returns:
        list[dict]: a list of dicts following the standard dataset dict format.
    """
    if isinstance(names, str):
        names = [names]
    assert len(names), names
    dataset_dicts = [
        DatasetCatalog.get(dataset_name) for dataset_name in names
    ]
    for dataset_name, dicts in zip(names, dataset_dicts):
        assert len(dicts), "Dataset '{}' is empty!".format(dataset_name)

    if proposal_files is not None:
        assert len(names) == len(proposal_files)
        # load precomputed proposals from proposal files
        dataset_dicts = [
            load_proposals_into_dataset(dataset_i_dicts, proposal_file)
            for dataset_i_dicts, proposal_file in zip(dataset_dicts,
                                                      proposal_files)
        ]

    if isinstance(dataset_dicts[0], torchdata.Dataset):
        return torchdata.ConcatDataset(dataset_dicts)

    dataset_dicts = list(itertools.chain.from_iterable(dataset_dicts))

    has_instances = "annotations" in dataset_dicts[0]
    if filter_empty and has_instances:
        dataset_dicts = filter_images_with_only_crowd_annotations(
            dataset_dicts)
    if min_keypoints > 0 and has_instances:
        dataset_dicts = filter_images_with_few_keypoints(
            dataset_dicts, min_keypoints)

    if has_instances:
        try:
            class_names = MetadataCatalog.get(names[0]).thing_classes
            check_metadata_consistency("thing_classes", names)
            print_instances_class_histogram(dataset_dicts, class_names)
        except AttributeError:  # class names are not available for this dataset
            pass

    assert len(dataset_dicts), "No valid data found in {}.".format(
        ",".join(names))
    return dataset_dicts
Example #18
def get_dataset(out_size, cache_dir, mode):
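    # Collect every '*_lmdb' subfolder under the train or test set roots into one ConcatDataset.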
    datasets_list = []
    if mode == 'train':
        sets_path = [os.path.join(sets_root, sets) for sets in training_sets]
    else:
        sets_path = [os.path.join(sets_root, sets) for sets in testing_sets]
    for set_path in sets_path:
        sub_set_names = os.listdir(set_path)
        for sub_set_name in sub_set_names:
            sub_set_path = os.path.join(set_path, sub_set_name)
            if (os.path.isdir(sub_set_path)) and ('_lmdb' in sub_set_name):
                if mode == 'train':
                    datasets_list.append(
                        LMDBDataset(
                            db_path=sub_set_path,
                            cache_dir=cache_dir,
                            pre_transform=pre_transform(out_size),
                            input_transform_norm=input_transform_norm(
                                out_size),
                            target_transform_norm=target_transform_norm(
                                out_size)))
                else:
                    datasets_list.append(
                        LMDBDataset(
                            db_path=sub_set_path,
                            cache_dir=cache_dir,
                            pre_transform=pre_transform_test(out_size),
                            input_transform_norm=input_transform_norm(
                                out_size),
                            target_transform_norm=target_transform_norm(
                                out_size)))
    return data.ConcatDataset(datasets_list)
Example #19
    def __init__(self,
                 dataset1,
                 dataset2,
                 semantics=None,
                 nc=10,
                 device='cuda'):
        labels = []
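        # Record which source dataset each sample came from (0 or 1) before concatenating.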
        self.domains = [0] * len(dataset1) + [1] * len(dataset2)
        self.dataset = data.ConcatDataset((dataset1, dataset2))
        if semantics:

            print('Inferring semantics for dataset1')
            for sample, _ in dataset1:
                sample = sample.to(device)
                sample = (sample.unsqueeze(0) + 1) * 0.5
                label = semantics(sample).argmax(1)
                labels.append(label)
            print('Inferring semantics for dataset2')
            for sample, _ in dataset2:
                sample = sample.to(device)
                sample = (sample.unsqueeze(0) + 1) * 0.5
                label = semantics(sample).argmax(1)
                labels.append(label)

            self.labels = torch.LongTensor(labels)
            self.labels_idxs = [
                torch.nonzero(self.labels == label)[:, 0]
                for label in range(nc)
            ]
        else:
            self.labels = torch.LongTensor([0] * len(self.domains))
            self.labels_idxs = [torch.arange(len(self.labels))]
Example #20
def get_dataset(is_training):
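    # Gather image paths from every subset folder, then wrap the single ImgDataset in a ConcatDataset.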
    img_path_list = []
    img_name_list = []
    datasets_list = []
    sets_path = get_setspath(is_training)
    print(sets_path)
    labels_path = get_labelspath(is_training)
    wholeimg_path = get_imgpath(is_training)
    transform = get_transform()
    for set_path in sets_path:
        subset_names = os.listdir(set_path)
        for subset_name in subset_names:
            subset_path = os.path.join(set_path, subset_name)
            img_name_list.append(subset_name)
            img_path_list.append(subset_path)

    datasets_list.append(
        ImgDataset(
            img_path=img_path_list,
            img_name=img_name_list,
            transform=transform,
            is_training=is_training,
            label_path=labels_path[0],
            wholeimg_path=wholeimg_path[0]
        )
    )
    return data.ConcatDataset(datasets_list)
Example #21
 def training(self, train_datasets, **kwargs):
     n_epochs = kwargs.get('n_epochs', 1)
     log_freq = kwargs.get('log_freq', 500)
     mini_batch_size = kwargs.get('mini_batch_size')
     if self.training_mode == 'sequential':
         for train_dataset in train_datasets:
             logger.info('Training on {}'.format(
                 train_dataset.__class__.__name__))
             train_dataloader = data.DataLoader(
                 train_dataset,
                 batch_size=mini_batch_size,
                 shuffle=False,
                 collate_fn=datasets.utils.batch_encode)
             self.train(dataloader=train_dataloader,
                        n_epochs=n_epochs,
                        log_freq=log_freq)
     elif self.training_mode == 'multi_task':
         train_dataset = data.ConcatDataset(train_datasets)
         logger.info('Training multi-task model on all datasets')
         train_dataloader = data.DataLoader(
             train_dataset,
             batch_size=mini_batch_size,
             shuffle=True,
             collate_fn=datasets.utils.batch_encode)
         self.train(dataloader=train_dataloader,
                    n_epochs=n_epochs,
                    log_freq=log_freq)
     else:
         raise ValueError('Invalid training mode')
Example #22
def get_motionseg_dataset(config,split):
    dataset_dict={"fbms":fbms_dataset,
                  "fbms-3d":fbms_dataset,
                  "cdnet2014":cdnet_dataset,
                  "segtrackv2":segtrackv2_dataset,
                  "bmcnet":bmcnet_dataset,
                  "davis2016":davis_dataset,
                  'davis2017':davis_dataset}

    normer=image_normalizations(ways='-1,1')
    augmentations = Augmentations()
    key=config.dataset.lower()
    if key in dataset_dict.keys():
        config.root_path=dataset_root_dict[key]
        xxx_dataset=dataset_dict[key](config,split,normalizations=normer,augmentations=augmentations)
    elif key == "all":
        dataset_set=[]
        for d in ['FBMS','cdnet2014','segtrackv2','BMCnet','DAVIS2017','DAVIS2016']:
            config.dataset=d
            dataset_set.append(get_motionseg_dataset(config,split))
        xxx_dataset=td.ConcatDataset(dataset_set)
    else:
        assert False, 'dataset must be in {} or "all"'.format(dataset_dict.keys())

    return xxx_dataset
Example #23
def test(mode, model_path, args):
    """ Performs decoding on a test set, and save the best-scoring decoding results.

    """
    assert mode in MODEs, 'Invalid mode!'
    print('mode:', mode)
    print("load test data...")
    if mode == 'cls3':
        with open(DATASET_TEST_CLS3, 'rb') as f:
            dataset_test = pickle.load(f)
    elif mode == 'cls18':
        with open(DATASET_TEST_CLS18, 'rb') as f:
            dataset_test = pickle.load(f)
    else:
        with open(DATASET_TEST_CLS3, 'rb') as f:
            dataset_cls3 = pickle.load(f)
        with open(DATASET_TEST_CLS18, 'rb') as f:
            dataset_cls18 = pickle.load(f)
        dataset_test = data.ConcatDataset([dataset_cls3, dataset_cls18])

    print("load model from {}".format(model_path))
    model = Seq2seq.load(model_path)

    if USE_CUDA:
        print('use device: %s' % device, file=sys.stderr)
        model = model.to(device)
    if GPU_PARALLEL:
        print("Let's use", torch.cuda.device_count(), "GPUs!")
        model = torch.nn.DataParallel(model, device_ids=[0, 1])

    if mode in ('summ', 'qa'):
        evaluate_summ_qa(model, dataset_test, mode, batch_size=128)
    else:
        evaluate_cls(model, dataset_test, mode, batch_size=512)
Example #24
    def training(self, datasets, **kwargs):
        train_datasets = datasets_dict(datasets["train"], datasets["order"])
        val_datasets = datasets_dict(datasets["val"], datasets["order"])
        self.relearning_task_dataset = {
            self.relearning_task: val_datasets[self.relearning_task]
        }

        self.dataloaders = {
            self.relearning_task:
            data.DataLoader(train_datasets[self.relearning_task],
                            batch_size=self.mini_batch_size,
                            shuffle=True),
            # for now, pile all other tasks on one stack
            OTHER_TASKS:
            data.DataLoader(data.ConcatDataset([
                dataset for task, dataset in train_datasets.items()
                if task != self.relearning_task
            ]),
                            batch_size=self.mini_batch_size,
                            shuffle=True)
        }
        self.metrics[self.relearning_task]["performance"].append([])
        # write performance of initial encounter (before training) to metrics
        self.metrics[self.relearning_task]["performance"][0].append(
            self.validate(self.relearning_task_dataset,
                          log=False,
                          n_samples=self.config.training.n_validation_samples)[
                              self.relearning_task])
        self.metrics[
            self.relearning_task]["performance"][0][0]["examples_seen"] = 0
        # first encounter relearning task
        self.train(dataloader=self.dataloaders[self.relearning_task],
                   datasets=datasets)
Example #25
File: main.py Project: ksurya/exp
def main():
    print("building vocab")
    vocab = Vocabulary()
    tokenizer = create_tokenizer()
    indexer = create_token_indexer(vocab, update_vocab=True)
    mini_batcher = create_mini_batcher(padding=vocab.PAD)

    ds1 = Reviews("data/amazon_cells_labelled.txt", tokenizer, indexer)
    ds2 = Reviews("data/imdb_labelled.txt", tokenizer, indexer)
    ds3 = Reviews("data/yelp_labelled.txt", tokenizer, indexer)
    ds = D.ConcatDataset([ds1, ds2, ds3])
    train_ds, test_ds = train_test_split(ds, 0.8)

    print("getting glove weights")
    embed_weights = torch.tensor(get_embedding_weights(vocab)).float().to(0)

    print("loading model")
    model = Classifier(embed_weights=embed_weights, )

    print("starting training..")
    train_and_test(
        train_ds=train_ds,
        test_ds=test_ds,
        mini_batcher=mini_batcher,
        batch_size=32,
        lr=0.001,
        epochs=1000,
        model=model,
    )
Example #26
def create_dataset_multi(dsnames,
                         transform,
                         num_samples=None,
                         indices=None,
                         train=False,
                         repeat_factor=None):
    assert num_samples is None or indices is None, "num_samples and indices cannot both be defined"
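    # When num_samples is given, use the first num_samples indices; otherwise keep indices as passed.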
    if indices is None:
        try:
            indices = range(num_samples)
        except TypeError:
            indices = None

    datasets_for_phase = []
    for name in dsnames:
        ds = create_dataset(name, train, transform, indices, repeat_factor)
        datasets_for_phase.append(ds)

    is_single_dataset = isinstance(dsnames, str) or len(dsnames) == 1

    if is_single_dataset:
        dataset = datasets_for_phase[0]
    else:
        dataset = td.ConcatDataset(datasets_for_phase)
    print(dataset)
    return dataset
Example #27
    def get(self,
            language=None,
            split=None,
            chunk_slice=None,
            force_lazy=False):
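        # Select only the pooled chunk paths matching the requested language, split and chunk slice.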
        def is_needed(path: CodeSearchDataPath):
            cond = True
            if language is not None:
                cond = cond and path.lang == language
            if split is not None:
                cond = cond and path.split == split
            if chunk_slice is not None:
                if isinstance(chunk_slice, int):
                    cond = cond and path.chunk_num == chunk_slice
                elif isinstance(chunk_slice, slice):
                    cond = cond and path.chunk_num in range(
                        *chunk_slice.indices(len(self.pool.dataset_paths)))
                else:
                    raise ValueError(f"invalid chunk index {chunk_slice}")
            return cond

        needed_paths = [
            path for path in self.pool.dataset_paths if is_needed(path)
        ]
        needed_paths.sort()
        lazy_dataset_list = [
            LazyLoadCodeSearchChunk(path, self.pool) for path in needed_paths
        ]
        if len(needed_paths) <= self.pool.max_cache and not force_lazy:
            self.pool.warm_up(needed_paths)
        dataset = dt.ConcatDataset(lazy_dataset_list)
        return dataset
Example #28
 def __init__(self, videos, batch_size, shuffle=False, num_workers=0):
     self.videos = videos
     self.batch_size = batch_size
     self.shuffle = shuffle
     self.dataset = data.ConcatDataset(videos)
     self.data_loader = data.DataLoader(self.dataset, batch_size, shuffle,
                                        pin_memory=True, num_workers=num_workers)
Example #29
 def __init__(self,
              root,
              domain,
              partition,
              num_points=1024,
              sampling_method='random',
              download=False,
              test_cnt=1,
              transforms=None):
     if download:
         self.download(root)
     self.domain = domain
     self.num_points = num_points
     self.partition = partition
     if partition in ['train', 'trainval']:
         self.dataset = DOMAIN_DATASET[domain](root=root,
                                               partition=partition,
                                               seed=None)
         self.targets = self.dataset.targets
     elif partition in ['test', 'val']:
         self.dataset = data.ConcatDataset([
             DOMAIN_DATASET[domain](root=root,
                                    partition=partition,
                                    seed=seed) for seed in range(test_cnt)
         ])
         self.targets = []
         for seed in range(test_cnt):
             self.targets.extend(DOMAIN_DATASET[domain](root=root,
                                                        partition=partition,
                                                        seed=seed).targets)
     else:
         raise NotImplementedError
     self.sampling_method = sampling_method
     self.transforms = transforms
Example #30
 def training(self, train_datasets, **kwargs):
     n_epochs = kwargs.get('n_epochs', 1)
     log_freq = kwargs.get('log_freq', 20)
     mini_batch_size = kwargs.get('mini_batch_size')
     if self.training_mode == 'sequential':
         for cluster_idx, train_dataset in enumerate(train_datasets):
             logger.info('Training on cluster {}'.format(cluster_idx + 1))
             train_dataloader = data.DataLoader(
                 train_dataset,
                 batch_size=mini_batch_size,
                 shuffle=False,
                 collate_fn=datasets.utils.rel_encode)
             self.train(dataloader=train_dataloader,
                        n_epochs=n_epochs,
                        log_freq=log_freq)
     elif self.training_mode == 'multi_task':
         train_dataset = data.ConcatDataset(train_datasets)
         logger.info('Training multi-task model on all datasets')
         train_dataloader = data.DataLoader(
             train_dataset,
             batch_size=mini_batch_size,
             shuffle=True,
             collate_fn=datasets.utils.rel_encode)
         self.train(dataloader=train_dataloader,
                    n_epochs=n_epochs,
                    log_freq=log_freq)