def split_train_test(self, data_files, limit=None):
    test = []
    train = []
    for f in data_files:
        if self.format == 'csv':
            temp = ChessDataset(f)
        else:
            temp = LMDBChessDataset(f)
        if limit:
            test_len = round(limit * self.test_ratio)
        else:
            test_len = round(len(temp) * self.test_ratio)
        del temp
        if self.format == 'csv':
            test.append(ChessDataset(f, limit=test_len))
            train.append(ChessDataset(f, limit=limit, offset=test_len))
        elif self.format == 'lmdb':
            test.append(LMDBChessDataset(f, limit=test_len))
            train.append(LMDBChessDataset(f, limit=limit, offset=test_len))

    if len(train) == 1:
        train_dataset = train[0]
    elif len(train) == 2:
        train_dataset = InterleavenDataset(train)
    else:
        train_dataset = data.ConcatDataset(train)

    return data.DataLoader(
        train_dataset,
        batch_size=self.batch_size,
        num_workers=4,
        shuffle=(self.format == 'csv')), data.DataLoader(
            data.ConcatDataset(test),
            batch_size=self.batch_size,
        )

def load_mnist_plus_fmnist(args, **kwargs):
    args.input_size = [1, 28, 28]
    args.input_type = 'gray'
    args.dynamic_binarization = False

    import torch.utils.data as data_utils

    train_loader, val_loader, test_loader, _ = load_dynamic_mnist(args)
    train_loader2, val_loader2, test_loader2, _ = load_fashion_mnist(args, label_offset=10)

    train_dataset = data_utils.ConcatDataset([train_loader.dataset, train_loader2.dataset])
    val_dataset = data_utils.ConcatDataset([val_loader.dataset, val_loader2.dataset])
    test_dataset = data_utils.ConcatDataset([test_loader.dataset, test_loader2.dataset])

    shuffle = True
    kwargs = {'num_workers': 1, 'pin_memory': True} if args.cuda else {}
    train_loader = data_utils.DataLoader(train_dataset, batch_size=args.batch_size, shuffle=shuffle, **kwargs)
    val_loader = data_utils.DataLoader(val_dataset, batch_size=args.batch_size, shuffle=shuffle, **kwargs)
    test_loader = data_utils.DataLoader(test_dataset, batch_size=args.batch_size, shuffle=shuffle, **kwargs)

    return train_loader, val_loader, test_loader, args

def prepare_data_seq(task, batch_size=100, shuffle=True):
    path_train = 'data/chitchat/data_mixup/train_'
    path_dev = 'data/chitchat/data_mixup/dev_'
    # provide the data mixup rates
    dict_file_path = "data/chitchat/vocab_pool/vocab_filter1000_18394.txt"
    lang = Lang(dict_file_path)

    train_datasets = []
    dev_datasets = []
    for file_name, weight in SRC_WEIGHTS.items():
        train_file = path_train + file_name
        dev_file = path_dev + file_name
        train_sub_dat = SubDataset(lang, train_file, weight, max_len)
        dev_sub_dat = SubDataset(lang, dev_file, weight, max_len)
        train_datasets.append(train_sub_dat)
        dev_datasets.append(dev_sub_dat)

    train_datasets = data.ConcatDataset(train_datasets)
    dev_datasets = data.ConcatDataset(dev_datasets)

    train = get_seq(train_datasets, lang, batch_size, True, max_len)
    dev = get_seq(dev_datasets, lang, batch_size, False, max_len)
    test = None

    logging.info("Read %s sentence pairs train" % train.__len__())
    logging.info("Read %s sentence pairs dev" % dev.__len__())
    logging.info("Read %s sentence pairs test" % 0)
    logging.info("Max len Input %s " % max_len)
    logging.info("Vocab_size %s " % lang.n_words)
    logging.info("USE_CUDA={}".format(USE_CUDA))

    return train, dev, test, [], lang, max_len, max_r

def dataset_augmentation(dataset_old, scale=10):
    tot_len = len(dataset_old)
    print(tot_len)
    dataset = data_utils.ConcatDataset([dataset_old])
    for i in range(scale - 1):
        datasets = data_utils.random_split(dataset_old, [int(tot_len / 2), tot_len - int(tot_len / 2)])
        dataset = data_utils.ConcatDataset([dataset] + datasets)
    print(len(dataset))
    return dataset

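# Hedged usage sketch (not part of the original module): each random_split pair
# together covers dataset_old exactly once, so dataset_augmentation returns
# `scale` full passes over the original samples. The toy TensorDataset below is
# only an illustration.
import torch
import torch.utils.data as data_utils

base = data_utils.TensorDataset(torch.randn(100, 3), torch.randint(0, 2, (100,)))
augmented = dataset_augmentation(base, scale=10)
assert len(augmented) == 10 * len(base)
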
def load_partition_data_federated_emnist(dataset, data_dir, batch_size=DEFAULT_BATCH_SIZE):
    # client ids
    train_file_path = os.path.join(data_dir, DEFAULT_TRAIN_FILE)
    test_file_path = os.path.join(data_dir, DEFAULT_TEST_FILE)
    print(train_file_path)
    with h5py.File(train_file_path, 'r') as train_h5, h5py.File(test_file_path, 'r') as test_h5:
        global client_ids_train, client_ids_test
        client_ids_train = list(train_h5[_EXAMPLE].keys())
        client_ids_test = list(test_h5[_EXAMPLE].keys())

    # local dataset
    data_local_num_dict = dict()
    train_data_local_dict = dict()
    test_data_local_dict = dict()

    for client_idx in range(DEFAULT_TRAIN_CLIENTS_NUM):
        train_data_local, test_data_local = get_dataloader(
            dataset, data_dir, batch_size, batch_size, client_idx)
        local_data_num = len(train_data_local) + len(test_data_local)
        data_local_num_dict[client_idx] = local_data_num
        # logging.info("client_idx = %d, local_sample_number = %d" % (client_idx, local_data_num))
        # logging.info("client_idx = %d, batch_num_train_local = %d, batch_num_test_local = %d" % (
        #     client_idx, len(train_data_local), len(test_data_local)))
        train_data_local_dict[client_idx] = train_data_local
        test_data_local_dict[client_idx] = test_data_local

    # global dataset
    train_data_global = data.DataLoader(
        data.ConcatDataset(
            list(dl.dataset for dl in list(train_data_local_dict.values()))
        ),
        batch_size=batch_size, shuffle=True)
    train_data_num = len(train_data_global.dataset)

    test_data_global = data.DataLoader(
        data.ConcatDataset(
            list(dl.dataset for dl in list(test_data_local_dict.values()) if dl is not None)
        ),
        batch_size=batch_size, shuffle=True)
    test_data_num = len(test_data_global.dataset)

    # class number
    train_file_path = os.path.join(data_dir, DEFAULT_TRAIN_FILE)
    with h5py.File(train_file_path, 'r') as train_h5:
        class_num = len(np.unique([
            train_h5[_EXAMPLE][client_ids_train[idx]][_LABEL][0]
            for idx in range(DEFAULT_TRAIN_CLIENTS_NUM)
        ]))
        logging.info("class_num = %d" % class_num)

    return DEFAULT_TRAIN_CLIENTS_NUM, train_data_num, test_data_num, train_data_global, test_data_global, \
        data_local_num_dict, train_data_local_dict, test_data_local_dict, class_num

def forward(self, x):
    x = F.relu(self.conv1_1(x))  # conv1_1 & relu1_1
    x = F.relu(self.conv1_2(x))  # conv1_2 & relu1_2
    x = F.max_pool2d(x, 2)  # pool1
    x = F.relu(self.conv2_1(x))  # conv2_1 & relu2_1
    x = F.relu(self.conv2_2(x))  # conv2_2 & relu2_2
    x = F.max_pool2d(x, 2)  # pool2
    x = F.relu(self.conv3_1(x))  # relu3_1
    x = F.relu(self.conv3_2(x))  # relu3_2
    x = F.relu(self.conv3_3(x))  # relu3_3
    x = F.max_pool2d(x, 2)  # pool3
    x = F.relu(self.conv4_1(x))  # relu4_1
    x = F.relu(self.conv4_2(x))  # relu4_2
    x = F.relu(self.conv4_3(x))  # relu4_3
    x = F.max_pool2d(x, 2)  # pool4
    # conv4_3_norm = F.batch_norm(x, )  # conv4_3_norm
    x = F.relu(self.conv5_1(x))  # relu5_1
    x = F.relu(self.conv5_2(x))  # relu5_2
    x = F.relu(self.conv5_3(x))  # relu5_3
    x = F.max_pool2d(x, 3, stride=1, padding=1)  # pool5
    x = F.relu(self.conv_fc6(x))  # relu6
    x = F.relu(self.conv_fc7(x))  # relu7
    x = F.relu(self.conv6_1(x))  # relu6_1
    x = F.relu(self.conv6_2(x))  # relu6_2
    x = F.relu(self.conv7_1(x))  # relu7_1

    # conv6_2_conf
    conv6_2_conf = self.conv6_2_conf(x).permute(2, 0, 1)
    conv6_2_conf = conv6_2_conf.view(-1, len(conv6_2_conf))
    # conv6_2_loc
    conv6_2_loc = self.conv6_2_loc(x).permute(2, 0, 1)
    conv6_2_loc = conv6_2_loc.view(-1, len(conv6_2_loc))

    x = F.relu(self.conv7_2(x))  # relu7_2
    # conv7_2_conf
    conv7_2_conf = self.conv7_2_conf(x).permute(2, 0, 1)
    conv7_2_conf = conv7_2_conf.view(-1, len(conv7_2_conf))
    # conv7_2_loc
    conv7_2_loc = self.conv7_2_loc(x).permute(2, 0, 1)
    conv7_2_loc = conv7_2_loc.view(-1, len(conv7_2_loc))

    x = F.relu(self.conv8_1(x))  # relu8_1
    x = F.relu(self.conv8_2(x))  # relu8_2
    # conv8_2_conf
    conv8_2_conf = self.conv8_2_conf(x).permute(2, 0, 1)
    conv8_2_conf = conv8_2_conf.view(-1, len(conv8_2_conf))
    # conv8_2_loc
    conv8_2_loc = self.conv8_2_loc(x).permute(2, 0, 1)
    conv8_2_loc = conv8_2_loc.view(-1, len(conv8_2_loc))

    x = F.relu(self.conv9_1(x))  # relu9_1
    x = F.relu(self.conv9_2(x))  # relu9_2
    # conv9_2_conf
    conv9_2_conf = self.conv9_2_conf(x).permute(2, 0, 1)
    conv9_2_conf = conv9_2_conf.view(-1, len(conv9_2_conf))
    # conv9_2_loc
    conv9_2_loc = self.conv9_2_loc(x).permute(2, 0, 1)
    conv9_2_loc = conv9_2_loc.view(-1, len(conv9_2_loc))

    # Note: ConcatDataset joins Dataset objects, not tensors; torch.cat is the
    # usual way to join per-layer predictions (see the sketch after this function).
    loc = td.ConcatDataset([conv6_2_loc, conv7_2_loc, conv8_2_loc, conv9_2_loc])  # Concat mbox_loc
    conf = td.ConcatDataset([conv6_2_conf, conv7_2_conf, conv8_2_conf, conv9_2_conf])  # Concat mbox_conf_loc

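# Hedged sketch (not from the original model; the helper name is hypothetical):
# concatenating the per-layer location/confidence predictions as tensors with
# torch.cat, the conventional alternative to wrapping them in a dataset object.
import torch

def concat_head_outputs(loc_list, conf_list):
    # Each entry is assumed to be a 2-D tensor of shape (num_boxes_i, dims);
    # torch.cat stacks them along the box dimension.
    loc = torch.cat(loc_list, dim=0)
    conf = torch.cat(conf_list, dim=0)
    return loc, conf
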
def get_concat_dataloader(data_root, batch_size=64, download=False):
    transforms_train = transforms.Compose([
        transforms.Resize(size=224),
        transforms.RandomCrop(size=(224, 224)),
        transforms.RandomHorizontalFlip(),
        transforms.ToTensor(),
        transforms.Normalize(mean=[0.485, 0.456, 0.406], std=[0.229, 0.224, 0.225])
    ])
    transforms_val = transforms.Compose([
        transforms.Resize(size=224),
        transforms.CenterCrop(size=(224, 224)),
        transforms.ToTensor(),
        transforms.Normalize(mean=[0.485, 0.456, 0.406], std=[0.229, 0.224, 0.225])
    ])

    cub_root = os.path.join(data_root, 'cub200')
    train_cub = CUB200(root=cub_root, split='train', transforms=transforms_train, download=download, offset=0)
    val_cub = CUB200(root=cub_root, split='test', transforms=transforms_val, download=False, offset=0)

    dogs_root = os.path.join(data_root, 'dogs')
    train_dogs = StanfordDogs(root=dogs_root, split='train', transforms=transforms_train, download=download, offset=200)
    val_dogs = StanfordDogs(root=dogs_root, split='test', transforms=transforms_val, download=False, offset=200)  # add offset

    train_dst = data.ConcatDataset([train_cub, train_dogs])
    val_dst = data.ConcatDataset([val_cub, val_dogs])

    train_loader = data.DataLoader(train_dst, batch_size=batch_size, drop_last=True, shuffle=True, num_workers=4)
    val_loader = data.DataLoader(val_dst, batch_size=batch_size, drop_last=True, shuffle=False, num_workers=4)
    return train_loader, val_loader

def setup(self, stage):
    self.train_data = data.ConcatDataset([
        torchaudio.datasets.LIBRISPEECH(self.hparams.data_root, url=path, download=True)
        for path in self.hparams.data_train
    ])
    self.test_data = data.ConcatDataset([
        torchaudio.datasets.LIBRISPEECH(self.hparams.data_root, url=path, download=True)
        for path in self.hparams.data_test
    ])

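# Hedged configuration sketch (the values are assumptions, not from the original
# module): data_train / data_test are expected to be lists of LibriSpeech subset
# names accepted by torchaudio.datasets.LIBRISPEECH's `url` argument.
example_hparams = {
    "data_root": "./data",
    "data_train": ["train-clean-100", "train-clean-360"],
    "data_test": ["test-clean"],
}
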
def load_partition_data_federated_cifar100(data_dir, batch_size=DEFAULT_BATCH_SIZE):
    class_num = 100

    # client id list
    train_file_path = os.path.join(data_dir, DEFAULT_TRAIN_FILE)
    test_file_path = os.path.join(data_dir, DEFAULT_TEST_FILE)
    with h5py.File(train_file_path, 'r') as train_h5, h5py.File(test_file_path, 'r') as test_h5:
        global client_ids_train, client_ids_test
        client_ids_train = list(train_h5[_EXAMPLE].keys())
        client_ids_test = list(test_h5[_EXAMPLE].keys())

    # get local dataset
    data_local_num_dict = dict()
    train_data_local_dict = dict()
    test_data_local_dict = dict()

    for client_idx in range(DEFAULT_TRAIN_CLINETS_NUM):
        train_data_local, test_data_local = get_dataloader(
            data_dir, batch_size, batch_size, client_idx)
        local_data_num = len(train_data_local.dataset)
        data_local_num_dict[client_idx] = local_data_num
        # logging.info("client_idx = %d, local_sample_number = %d" % (client_idx, local_data_num))
        # logging.info("client_idx = %d, batch_num_train_local = %d" % (client_idx, len(train_data_local)))
        train_data_local_dict[client_idx] = train_data_local
        test_data_local_dict[client_idx] = test_data_local

    # global dataset
    # train_data_global = data.DataLoader(
    #     data.ConcatDataset(
    #         list(dl.dataset for dl in list(train_data_local_dict.values()))
    #     ),
    #     batch_size=batch_size, shuffle=True)
    # train_data_num = len(train_data_global.dataset)
    # test_data_global = data.DataLoader(
    #     data.ConcatDataset(
    #         list(dl.dataset for dl in list(test_data_local_dict.values()) if dl is not None)
    #     ),
    #     batch_size=batch_size, shuffle=True)
    # test_data_num = len(test_data_global.dataset)

    train_data_global = data.ConcatDataset(
        list(dl.dataset for dl in list(train_data_local_dict.values()))
    )
    test_data_global = data.ConcatDataset(
        list(dl.dataset for dl in list(test_data_local_dict.values()) if dl is not None))

    # DEFAULT_TRAIN_CLINETS_NUM = 500, train_data_num = 50000, test_data_num = 10000
    # train_data_global covers the 50k training samples and test_data_global the 10k test
    # samples; access train_data_global.dataset when the underlying dataset is needed.
    # data_local_num_dict = dict of per-client sample counts,
    # train_data_local_dict = dict of per-client dataloaders
    return train_data_global, test_data_global, train_data_local_dict,

def get_dataloader(dataset, data_dir, train_bs, test_bs, client_idx=None):
    if client_idx is None:
        train_dl = data.DataLoader(
            data.ConcatDataset(
                StackOverflowDataset(
                    os.path.join(data_dir, DEFAULT_TRAIN_FILE), client_idx, "train", {
                        "input": lambda x: utils.preprocess_input(x, data_dir),
                        "target": lambda y: utils.preprocess_target(y, data_dir)
                    })
                for client_idx in range(DEFAULT_TRAIN_CLINETS_NUM)),
            batch_size=train_bs,
            shuffle=True)
        test_dl = data.DataLoader(
            data.ConcatDataset(
                StackOverflowDataset(
                    os.path.join(data_dir, DEFAULT_TEST_FILE), client_idx, "test", {
                        "input": lambda x: utils.preprocess_input(x, data_dir),
                        "target": lambda y: utils.preprocess_target(y, data_dir)
                    })
                for client_idx in range(DEFAULT_TEST_CLINETS_NUM)),
            batch_size=test_bs,
            shuffle=True)
        return train_dl, test_dl
    else:
        train_ds = StackOverflowDataset(
            os.path.join(data_dir, DEFAULT_TRAIN_FILE), client_idx, "train", {
                "input": lambda x: utils.preprocess_input(x, data_dir),
                "target": lambda y: utils.preprocess_target(y, data_dir)
            })
        train_dl = data.DataLoader(dataset=train_ds,
                                   batch_size=train_bs,
                                   shuffle=True,
                                   drop_last=False)

        if client_idx >= DEFAULT_TEST_CLIENTS_NUM:
            test_dl = None
        else:
            test_ds = StackOverflowDataset(
                os.path.join(data_dir, DEFAULT_TEST_FILE), client_idx, "test", {
                    "input": lambda x: utils.preprocess_input(x, data_dir),
                    "target": lambda y: utils.preprocess_target(y, data_dir)
                })
            test_dl = data.DataLoader(dataset=test_ds,
                                      batch_size=test_bs,
                                      shuffle=True,
                                      drop_last=False)
        return train_dl, test_dl

def load_partition_data_federated_shakespeare(dataset, data_dir, batch_size=DEFAULT_BATCH_SIZE):
    # client id list
    train_file_path = os.path.join(data_dir, DEFAULT_TRAIN_FILE)
    test_file_path = os.path.join(data_dir, DEFAULT_TEST_FILE)
    with h5py.File(train_file_path, 'r') as train_h5, h5py.File(test_file_path, 'r') as test_h5:
        global client_ids_train, client_ids_test
        client_ids_train = list(train_h5[_EXAMPLE].keys())
        client_ids_test = list(test_h5[_EXAMPLE].keys())

    # get local dataset
    data_local_num_dict = dict()
    train_data_local_dict = dict()
    test_data_local_dict = dict()

    for client_idx in range(DEFAULT_TRAIN_CLIENTS_NUM):
        train_data_local, test_data_local = get_dataloader(
            dataset, data_dir, batch_size, batch_size, client_idx)
        local_data_num = len(train_data_local.dataset)
        data_local_num_dict[client_idx] = local_data_num
        logging.info("client_idx = %d, local_sample_number = %d" % (client_idx, local_data_num))
        logging.info(
            "client_idx = %d, batch_num_train_local = %d, batch_num_test_local = %d"
            % (client_idx, len(train_data_local), len(test_data_local)))
        train_data_local_dict[client_idx] = train_data_local
        test_data_local_dict[client_idx] = test_data_local

    # global dataset
    train_data_global = data.DataLoader(
        data.ConcatDataset(
            list(dl.dataset for dl in list(train_data_local_dict.values()))
        ),
        batch_size=batch_size, shuffle=True)
    train_data_num = len(train_data_global.dataset)

    test_data_global = data.DataLoader(
        data.ConcatDataset(
            list(dl.dataset for dl in list(test_data_local_dict.values()) if dl is not None)
        ),
        batch_size=batch_size, shuffle=True)
    test_data_num = len(test_data_global.dataset)

    VOCAB_LEN = len(utils.get_word_dict()) + 1

    return DEFAULT_TRAIN_CLIENTS_NUM, train_data_num, test_data_num, train_data_global, test_data_global, \
        data_local_num_dict, train_data_local_dict, test_data_local_dict, VOCAB_LEN

def __init__(self, dataset_folder, fields, split=None, classes=None, no_except=True, transform=None):
    # Read metadata file
    metadata_file = os.path.join(dataset_folder, 'metadata.yaml')
    if os.path.exists(metadata_file):
        with open(metadata_file, 'r') as f:
            metadata = yaml.load(f)
    else:
        metadata = {}

    # If classes is None, use all subfolders
    if classes is None:
        classes = os.listdir(dataset_folder)
        classes = [c for c in classes if os.path.isdir(os.path.join(dataset_folder, c))]

    # Get all sub-datasets
    self.datasets_classes = []
    for c in classes:
        subpath = os.path.join(dataset_folder, c)
        if not os.path.isdir(subpath):
            logger.warning('Class %s does not exist in dataset.' % c)
        metadata_c = metadata.get(c, {'id': c, 'name': 'n/a'})
        dataset = Shapes3dClassDataset(subpath, fields, split, metadata_c, no_except, transform=transform)
        self.datasets_classes.append(dataset)

    self._concat_dataset = data.ConcatDataset(self.datasets_classes)

def build_dataset_from_category(self, category, typ, max_elements=None):
    """Build a dataset for all modules in a category"""
    print(f"adding category {category}/../{typ}")
    ds = self._build_datasets_from_category(
        category, typ, max_elements=max_elements
    )
    return data.ConcatDataset(ds)

def get_training_dataloader(parameters, include_field, num_fields_list):
    batch_size = parameters['batch_size']
    dataset_list = []
    num_examples = 0
    for current_field in num_fields_list:
        X_name, y_name = '_X_train_', '_y_train_'
        prefix = 'field_' + str(current_field)
        X = np.load(saves_directory + '/' + prefix + X_name + str(num_datapoints) + '.npy')
        y = np.load(saves_directory + '/' + prefix + y_name + str(num_datapoints) + '.npy')
        if include_field == 'd':
            X = X[:, dataset_indices]
        elif include_field == 'f':
            X = X[:, field_indices]
        num_examples += X.shape[0]
        parameters['input_dim'] = X.shape[1]
        X, y = torch.from_numpy(X).float(), torch.from_numpy(y)
        dataset_list.append(data.TensorDataset(X, y))

    print('loaded training dimensions are (' + str(num_examples) + ',' + str(parameters['input_dim']) + ')')
    dataset = data.ConcatDataset(dataset_list)
    return data.DataLoader(dataset, batch_size=batch_size, shuffle=True, num_workers=0)

def __init__(
    self,
    tickers: Tuple[str, ...],
    end: pd.Timestamp,
    params: PhenotypeData,
    params_type: Type[features.DataParams],
):
    """Builds the data loaders for training, validation, testing and forecasting
    for the given tickers and end date, based on a dictionary of parameters.

    :param tickers: Tickers for which the model will be built.
    :param end: End of the date range of statistics that will be used to build the model.
    :param params: Dictionary of parameters for building the features and other elements of the model.
    :param params_type: Type of features to build.
    """
    params = params_type(tickers, end, params)
    data_sets = [OneTickerDataset(ticker, params) for ticker in tickers]
    super().__init__(
        dataset=data.ConcatDataset(data_sets),
        batch_size=params.batch_size,
        shuffle=params.shuffle,
        drop_last=False,
        num_workers=0,  # Loading runs in a separate thread - adding more workers does not help
    )
    self._features_description = data_sets[0].features_description
    self._history_days = params.history_days

def build_dataset_from_modules(self, category, modules, typ, max_elements=None):
    """Build a dataset from several modules in a category"""
    ds = []
    for module in modules:
        self.dfs[category][module][typ].set_max_elements(max_elements)
        ds.append(self.dfs[category][module][typ])
    return data.ConcatDataset(ds)

def get_detection_dataset_dicts(names, filter_empty=True, min_keypoints=0, proposal_files=None):
    """
    Load and prepare dataset dicts for instance detection/segmentation and semantic segmentation.

    Args:
        names (str or list[str]): a dataset name or a list of dataset names
        filter_empty (bool): whether to filter out images without instance annotations
        min_keypoints (int): filter out images with fewer keypoints than
            `min_keypoints`. Set to 0 to do nothing.
        proposal_files (list[str]): if given, a list of object proposal files
            that match each dataset in `names`.

    Returns:
        list[dict]: a list of dicts following the standard dataset dict format.
    """
    if isinstance(names, str):
        names = [names]
    assert len(names), names
    dataset_dicts = [DatasetCatalog.get(dataset_name) for dataset_name in names]
    for dataset_name, dicts in zip(names, dataset_dicts):
        assert len(dicts), "Dataset '{}' is empty!".format(dataset_name)

    if proposal_files is not None:
        assert len(names) == len(proposal_files)
        # load precomputed proposals from proposal files
        dataset_dicts = [
            load_proposals_into_dataset(dataset_i_dicts, proposal_file)
            for dataset_i_dicts, proposal_file in zip(dataset_dicts, proposal_files)
        ]

    if isinstance(dataset_dicts[0], torchdata.Dataset):
        return torchdata.ConcatDataset(dataset_dicts)

    dataset_dicts = list(itertools.chain.from_iterable(dataset_dicts))

    has_instances = "annotations" in dataset_dicts[0]
    if filter_empty and has_instances:
        dataset_dicts = filter_images_with_only_crowd_annotations(dataset_dicts)
    if min_keypoints > 0 and has_instances:
        dataset_dicts = filter_images_with_few_keypoints(dataset_dicts, min_keypoints)

    if has_instances:
        try:
            class_names = MetadataCatalog.get(names[0]).thing_classes
            check_metadata_consistency("thing_classes", names)
            print_instances_class_histogram(dataset_dicts, class_names)
        except AttributeError:  # class names are not available for this dataset
            pass

    assert len(dataset_dicts), "No valid data found in {}.".format(",".join(names))
    return dataset_dicts

def get_dataset(out_size, cache_dir, mode):
    datasets_list = []
    if mode == 'train':
        sets_path = [os.path.join(sets_root, sets) for sets in training_sets]
    else:
        sets_path = [os.path.join(sets_root, sets) for sets in testing_sets]

    for set_path in sets_path:
        sub_set_names = os.listdir(set_path)
        for sub_set_name in sub_set_names:
            sub_set_path = os.path.join(set_path, sub_set_name)
            if os.path.isdir(sub_set_path) and ('_lmdb' in sub_set_name):
                if mode == 'train':
                    datasets_list.append(
                        LMDBDataset(
                            db_path=sub_set_path,
                            cache_dir=cache_dir,
                            pre_transform=pre_transform(out_size),
                            input_transform_norm=input_transform_norm(out_size),
                            target_transform_norm=target_transform_norm(out_size)))
                else:
                    datasets_list.append(
                        LMDBDataset(
                            db_path=sub_set_path,
                            cache_dir=cache_dir,
                            pre_transform=pre_transform_test(out_size),
                            input_transform_norm=input_transform_norm(out_size),
                            target_transform_norm=target_transform_norm(out_size)))
    return data.ConcatDataset(datasets_list)

def __init__(self, dataset1, dataset2, semantics=None, nc=10, device='cuda'):
    labels = []
    self.domains = [0] * len(dataset1) + [1] * len(dataset2)
    self.dataset = data.ConcatDataset((dataset1, dataset2))

    if semantics:
        print('Inferring semantics for dataset1')
        for sample, _ in dataset1:
            sample = sample.to(device)
            sample = (sample.unsqueeze(0) + 1) * 0.5
            label = semantics(sample).argmax(1)
            labels.append(label)
        print('Inferring semantics for dataset2')
        for sample, _ in dataset2:
            sample = sample.to(device)
            sample = (sample.unsqueeze(0) + 1) * 0.5
            label = semantics(sample).argmax(1)
            labels.append(label)
        self.labels = torch.LongTensor(labels)
        self.labels_idxs = [
            torch.nonzero(self.labels == label)[:, 0] for label in range(nc)
        ]
    else:
        self.labels = torch.LongTensor([0] * len(self.domains))
        self.labels_idxs = [torch.arange(len(self.labels))]

def get_dataset(is_training):
    img_path_list = []
    img_name_list = []
    datasets_list = []
    sets_path = get_setspath(is_training)
    print(sets_path)
    labels_path = get_labelspath(is_training)
    wholeimg_path = get_imgpath(is_training)
    transform = get_transform()

    for set_path in sets_path:
        subset_names = os.listdir(set_path)
        for subset_name in subset_names:
            subset_path = os.path.join(set_path, subset_name)
            img_name_list.append(subset_name)
            img_path_list.append(subset_path)

    datasets_list.append(
        ImgDataset(
            img_path=img_path_list,
            img_name=img_name_list,
            transform=transform,
            is_training=is_training,
            label_path=labels_path[0],
            wholeimg_path=wholeimg_path[0]
        )
    )
    return data.ConcatDataset(datasets_list)

def training(self, train_datasets, **kwargs):
    n_epochs = kwargs.get('n_epochs', 1)
    log_freq = kwargs.get('log_freq', 500)
    mini_batch_size = kwargs.get('mini_batch_size')

    if self.training_mode == 'sequential':
        for train_dataset in train_datasets:
            logger.info('Training on {}'.format(train_dataset.__class__.__name__))
            train_dataloader = data.DataLoader(
                train_dataset,
                batch_size=mini_batch_size,
                shuffle=False,
                collate_fn=datasets.utils.batch_encode)
            self.train(dataloader=train_dataloader, n_epochs=n_epochs, log_freq=log_freq)
    elif self.training_mode == 'multi_task':
        train_dataset = data.ConcatDataset(train_datasets)
        logger.info('Training multi-task model on all datasets')
        train_dataloader = data.DataLoader(
            train_dataset,
            batch_size=mini_batch_size,
            shuffle=True,
            collate_fn=datasets.utils.batch_encode)
        self.train(dataloader=train_dataloader, n_epochs=n_epochs, log_freq=log_freq)
    else:
        raise ValueError('Invalid training mode')

def get_motionseg_dataset(config, split):
    dataset_dict = {"fbms": fbms_dataset,
                    "fbms-3d": fbms_dataset,
                    "cdnet2014": cdnet_dataset,
                    "segtrackv2": segtrackv2_dataset,
                    "bmcnet": bmcnet_dataset,
                    "davis2016": davis_dataset,
                    "davis2017": davis_dataset}

    normer = image_normalizations(ways='-1,1')
    augmentations = Augmentations()
    key = config.dataset.lower()
    if key in dataset_dict.keys():
        config.root_path = dataset_root_dict[key]
        xxx_dataset = dataset_dict[key](config, split, normalizations=normer, augmentations=augmentations)
    elif key == "all":
        dataset_set = []
        for d in ['FBMS', 'cdnet2014', 'segtrackv2', 'BMCnet', 'DAVIS2017', 'DAVIS2016']:
            config.dataset = d
            dataset_set.append(get_motionseg_dataset(config, split))
        xxx_dataset = td.ConcatDataset(dataset_set)
    else:
        assert False, 'dataset must be in {} or all'.format(dataset_dict.keys())

    return xxx_dataset

def test(mode, model_path, args):
    """Performs decoding on a test set, and saves the best-scoring decoding results."""
    assert mode in MODEs, 'Invalid mode!'
    print('mode:', mode)

    print("load test data...")
    if mode == 'cls3':
        with open(DATASET_TEST_CLS3, 'rb') as f:
            dataset_test = pickle.load(f)
    elif mode == 'cls18':
        with open(DATASET_TEST_CLS18, 'rb') as f:
            dataset_test = pickle.load(f)
    else:
        with open(DATASET_TEST_CLS3, 'rb') as f:
            dataset_cls3 = pickle.load(f)
        with open(DATASET_TEST_CLS18, 'rb') as f:
            dataset_cls18 = pickle.load(f)
        dataset_test = data.ConcatDataset([dataset_cls3, dataset_cls18])

    print("load model from {}".format(model_path))
    model = Seq2seq.load(model_path)

    if USE_CUDA:
        print('use device: %s' % device, file=sys.stderr)
        model = model.to(device)
    if GPU_PARALLEL:
        print("Let's use", torch.cuda.device_count(), "GPUs!")
        model = torch.nn.DataParallel(model, device_ids=[0, 1])

    if mode in ('summ', 'qa'):
        evaluate_summ_qa(model, dataset_test, mode, batch_size=128)
    else:
        evaluate_cls(model, dataset_test, mode, batch_size=512)

def training(self, datasets, **kwargs):
    train_datasets = datasets_dict(datasets["train"], datasets["order"])
    val_datasets = datasets_dict(datasets["val"], datasets["order"])

    self.relearning_task_dataset = {
        self.relearning_task: val_datasets[self.relearning_task]
    }
    self.dataloaders = {
        self.relearning_task: data.DataLoader(
            train_datasets[self.relearning_task],
            batch_size=self.mini_batch_size,
            shuffle=True),
        # for now, pile all other tasks on one stack
        OTHER_TASKS: data.DataLoader(
            data.ConcatDataset([
                dataset for task, dataset in train_datasets.items()
                if task != self.relearning_task
            ]),
            batch_size=self.mini_batch_size,
            shuffle=True)
    }

    self.metrics[self.relearning_task]["performance"].append([])
    # write performance of initial encounter (before training) to metrics
    self.metrics[self.relearning_task]["performance"][0].append(
        self.validate(self.relearning_task_dataset,
                      log=False,
                      n_samples=self.config.training.n_validation_samples)[self.relearning_task])
    self.metrics[self.relearning_task]["performance"][0][0]["examples_seen"] = 0

    # first encounter relearning task
    self.train(dataloader=self.dataloaders[self.relearning_task], datasets=datasets)

def main():
    print("building vocab")
    vocab = Vocabulary()
    tokenizer = create_tokenizer()
    indexer = create_token_indexer(vocab, update_vocab=True)
    mini_batcher = create_mini_batcher(padding=vocab.PAD)

    ds1 = Reviews("data/amazon_cells_labelled.txt", tokenizer, indexer)
    ds2 = Reviews("data/imdb_labelled.txt", tokenizer, indexer)
    ds3 = Reviews("data/yelp_labelled.txt", tokenizer, indexer)
    ds = D.ConcatDataset([ds1, ds2, ds3])
    train_ds, test_ds = train_test_split(ds, 0.8)

    print("getting glove weights")
    embed_weights = torch.tensor(get_embedding_weights(vocab)).float().to(0)

    print("loading model")
    model = Classifier(embed_weights=embed_weights)

    print("starting training..")
    train_and_test(
        train_ds=train_ds,
        test_ds=test_ds,
        mini_batcher=mini_batcher,
        batch_size=32,
        lr=0.001,
        epochs=1000,
        model=model,
    )

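# Hedged sketch (the project's own train_test_split is not shown here): a
# ratio-based split over the concatenated dataset could be built on
# torch.utils.data.random_split, for example:
#
# import torch.utils.data as D
#
# def train_test_split(dataset, train_ratio):
#     # Split into a train subset and a test subset by the given ratio.
#     n_train = int(len(dataset) * train_ratio)
#     return D.random_split(dataset, [n_train, len(dataset) - n_train])
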
def create_dataset_multi(dsnames, transform, num_samples=None, indices=None, train=False, repeat_factor=None):
    assert num_samples is None or indices is None, "num_samples and indices cannot both be defined"
    if indices is None:
        try:
            indices = range(num_samples)
        except TypeError:
            indices = None

    datasets_for_phase = []
    for name in dsnames:
        ds = create_dataset(name, train, transform, indices, repeat_factor)
        datasets_for_phase.append(ds)

    is_single_dataset = isinstance(dsnames, str) or len(dsnames) == 1
    if is_single_dataset:
        dataset = datasets_for_phase[0]
    else:
        dataset = td.ConcatDataset(datasets_for_phase)

    print(dataset)
    return dataset

def get(self, language=None, split=None, chunk_slice=None, force_lazy=False):
    def is_needed(path: CodeSearchDataPath):
        cond = True
        if language is not None:
            cond = cond and path.lang == language
        if split is not None:
            cond = cond and path.split == split
        if chunk_slice is not None:
            if isinstance(chunk_slice, int):
                cond = cond and path.chunk_num == chunk_slice
            elif isinstance(chunk_slice, slice):
                cond = cond and path.chunk_num in range(
                    *chunk_slice.indices(len(self.pool.dataset_paths)))
            else:
                raise ValueError(f"invalid chunk index {chunk_slice}")
        return cond

    needed_paths = [path for path in self.pool.dataset_paths if is_needed(path)]
    needed_paths.sort()
    lazy_dataset_list = [
        LazyLoadCodeSearchChunk(path, self.pool) for path in needed_paths
    ]
    if len(needed_paths) <= self.pool.max_cache and not force_lazy:
        self.pool.warm_up(needed_paths)
    dataset = dt.ConcatDataset(lazy_dataset_list)
    return dataset

def __init__(self, videos, batch_size, shuffle=False, num_workers=0):
    self.videos = videos
    self.batch_size = batch_size
    self.shuffle = shuffle
    self.dataset = data.ConcatDataset(videos)
    self.data_loader = data.DataLoader(self.dataset,
                                       batch_size,
                                       shuffle,
                                       pin_memory=True,
                                       num_workers=num_workers)

def __init__(self, root, domain, partition, num_points=1024, sampling_method='random',
             download=False, test_cnt=1, transforms=None):
    if download:
        self.download(root)

    self.domain = domain
    self.num_points = num_points
    self.partition = partition

    if partition in ['train', 'trainval']:
        self.dataset = DOMAIN_DATASET[domain](root=root, partition=partition, seed=None)
        self.targets = self.dataset.targets
    elif partition in ['test', 'val']:
        self.dataset = data.ConcatDataset([
            DOMAIN_DATASET[domain](root=root, partition=partition, seed=seed)
            for seed in range(test_cnt)
        ])
        self.targets = []
        for seed in range(test_cnt):
            self.targets.extend(DOMAIN_DATASET[domain](root=root, partition=partition, seed=seed).targets)
    else:
        raise NotImplementedError

    self.sampling_method = sampling_method
    self.transforms = transforms

def training(self, train_datasets, **kwargs):
    n_epochs = kwargs.get('n_epochs', 1)
    log_freq = kwargs.get('log_freq', 20)
    mini_batch_size = kwargs.get('mini_batch_size')

    if self.training_mode == 'sequential':
        for cluster_idx, train_dataset in enumerate(train_datasets):
            logger.info('Training on cluster {}'.format(cluster_idx + 1))
            train_dataloader = data.DataLoader(
                train_dataset,
                batch_size=mini_batch_size,
                shuffle=False,
                collate_fn=datasets.utils.rel_encode)
            self.train(dataloader=train_dataloader, n_epochs=n_epochs, log_freq=log_freq)
    elif self.training_mode == 'multi_task':
        train_dataset = data.ConcatDataset(train_datasets)
        logger.info('Training multi-task model on all datasets')
        train_dataloader = data.DataLoader(
            train_dataset,
            batch_size=mini_batch_size,
            shuffle=True,
            collate_fn=datasets.utils.rel_encode)
        self.train(dataloader=train_dataloader, n_epochs=n_epochs, log_freq=log_freq)