import torch
from torch.utils.data import RandomSampler


def __iter__(self):
    samplers_list = []
    sampler_iterators = []
    datasets_length = []
    for dataset_idx in range(self.number_of_datasets):
        cur_dataset = self.dataset.datasets[dataset_idx]
        print("The length for dataset " + str(cur_dataset.batch_layers) + " is " + str(len(cur_dataset.data)))
        sampler = RandomSampler(cur_dataset)
        samplers_list.append(sampler)
        cur_sampler_iterator = sampler.__iter__()
        sampler_iterators.append(cur_sampler_iterator)
        datasets_length.append(len(cur_dataset))

    push_index_val = [0] + self.dataset.cumulative_sizes[:-1]
    step = self.batch_size  # * self.number_of_datasets
    samples_to_grab = self.batch_size
    largest_dataset_index = torch.argmax(torch.as_tensor(datasets_length)).item()
    # for this case we want to get all samples in the dataset, which forces us to resample from the smaller datasets
    # epoch_samples = datasets_length[largest_dataset_index] * self.number_of_datasets
    # iterate over the total length of the combined datasets (slightly oversampling some datasets and undersampling others)
    epoch_samples = self.total_length

    final_samples_list = []  # this is a list of indices into the combined dataset
    for _ in range(0, epoch_samples, step):
        # instead of alternating between datasets, flip a (weighted) coin every time
        # to decide which dataset the next batch is drawn from
        coin_toss = torch.rand(1).item()
        for idx, threshold in enumerate(self.partitions):
            if threshold > coin_toss:
                i = idx
                break
        # for i in range(self.number_of_datasets):
        cur_batch_sampler = sampler_iterators[i]
        cur_samples = []
        for _ in range(samples_to_grab):
            try:
                cur_sample_org = cur_batch_sampler.__next__()
                cur_sample = cur_sample_org + push_index_val[i]
                cur_samples.append(cur_sample)
            except StopIteration:
                if i == largest_dataset_index:
                    # the largest dataset's iterator is exhausted, so we can stop early:
                    # adjust samples_to_grab, extend the final list, and continue with the next batch if possible
                    samples_to_grab = len(cur_samples)
                    break
                else:
                    # restart the iterator - we want more samples until we finish with the largest dataset
                    sampler_iterators[i] = samplers_list[i].__iter__()
                    cur_batch_sampler = sampler_iterators[i]
                    cur_sample_org = cur_batch_sampler.__next__()
                    cur_sample = cur_sample_org + push_index_val[i]
                    cur_samples.append(cur_sample)
        final_samples_list.extend(cur_samples)

    return iter(final_samples_list)
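# The variant above compares the coin toss against self.partitions, which is assumed to
# hold cumulative probability thresholds, one per dataset. A minimal sketch of how such
# thresholds could be built (e.g. in __init__), assuming weights proportional to dataset
# sizes; the names `build_partitions` and `dataset_sizes` are illustrative, not taken
# from the original code:
def build_partitions(dataset_sizes):
    """Return cumulative sampling thresholds in (0, 1], one per dataset."""
    total = float(sum(dataset_sizes))
    partitions = []
    running = 0.0
    for size in dataset_sizes:
        running += size / total
        partitions.append(running)
    partitions[-1] = 1.0  # guard against floating-point drift so every toss matches a threshold
    return partitions

# Example: datasets of length 1000, 3000, and 6000 give thresholds [0.1, 0.4, 1.0],
# so a toss of 0.37 selects the second dataset.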
def __iter__(self):
    samplers_list = []
    sampler_iterators = []
    datasets_length = []
    for dataset_idx in range(self.number_of_datasets):
        cur_dataset = self.dataset.datasets[dataset_idx]
        sampler = RandomSampler(cur_dataset)
        samplers_list.append(sampler)
        cur_sampler_iterator = sampler.__iter__()
        sampler_iterators.append(cur_sampler_iterator)
        datasets_length.append(len(cur_dataset))

    push_index_val = [0] + self.dataset.cumulative_sizes[:-1]
    step = self.batch_size * self.number_of_datasets
    samples_to_grab = self.batch_size
    largest_dataset_index = torch.argmax(torch.as_tensor(datasets_length)).item()
    # for this case we want to get all samples in the dataset, which forces us to resample from the smaller datasets
    epoch_samples = datasets_length[largest_dataset_index] * self.number_of_datasets

    final_samples_list = []  # this is a list of indices into the combined dataset
    for _ in range(0, epoch_samples, step):
        for i in range(self.number_of_datasets):
            cur_batch_sampler = sampler_iterators[i]
            cur_samples = []
            # the first dataset contributes twice as many samples per combined batch
            if i == 0:
                samples_to_grab = self.batch_size * 2
            else:
                samples_to_grab = self.batch_size
            for _ in range(samples_to_grab):
                try:
                    cur_sample_org = cur_batch_sampler.__next__()
                    cur_sample = cur_sample_org + push_index_val[i]
                    cur_samples.append(cur_sample)
                except StopIteration:
                    if i == largest_dataset_index:
                        # the largest dataset's iterator is exhausted, so we can stop early:
                        # adjust samples_to_grab, extend the final list, and continue with the next batch if possible
                        samples_to_grab = len(cur_samples)
                        break
                    else:
                        # restart the iterator - we want more samples until we finish with the largest dataset
                        sampler_iterators[i] = samplers_list[i].__iter__()
                        cur_batch_sampler = sampler_iterators[i]
                        cur_sample_org = cur_batch_sampler.__next__()
                        cur_sample = cur_sample_org + push_index_val[i]
                        cur_samples.append(cur_sample)
            final_samples_list.extend(cur_samples)

    return iter(final_samples_list)
def __iter__(self):
    samplers_list = []
    sampler_iterators = []
    for dataset_idx in range(self.number_of_datasets):
        cur_dataset = self.dataset.datasets[dataset_idx]
        sampler = RandomSampler(cur_dataset)
        samplers_list.append(sampler)
        cur_sampler_iterator = sampler.__iter__()
        sampler_iterators.append(cur_sampler_iterator)

    push_index_val = [0] + self.dataset.cumulative_sizes[:-1]
    step = self.batch_size * self.number_of_datasets
    samples_to_grab = self.batch_size
    # for this case we want to get all samples in the dataset, which forces us to resample from the smaller datasets
    epoch_samples = self.largest_dataset_size * self.number_of_datasets

    final_samples_list = []  # this is a list of indices into the combined dataset
    for _ in range(0, epoch_samples, step):
        for i in range(self.number_of_datasets):
            cur_batch_sampler = sampler_iterators[i]
            cur_samples = []
            for _ in range(samples_to_grab):
                try:
                    cur_sample_org = cur_batch_sampler.__next__()
                    cur_sample = cur_sample_org + push_index_val[i]
                    cur_samples.append(cur_sample)
                except StopIteration:
                    # got to the end of the iterator - restart it and keep drawing samples
                    # until reaching "epoch_samples"
                    sampler_iterators[i] = samplers_list[i].__iter__()
                    cur_batch_sampler = sampler_iterators[i]
                    cur_sample_org = cur_batch_sampler.__next__()
                    cur_sample = cur_sample_org + push_index_val[i]
                    cur_samples.append(cur_sample)
            final_samples_list.extend(cur_samples)

    return iter(final_samples_list)
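# A small, self-contained check of the index-shifting logic used in all three variants:
# ConcatDataset exposes cumulative_sizes, and `[0] + cumulative_sizes[:-1]` gives the
# offset that maps a per-dataset index onto an index into the combined dataset. The
# dataset sizes below are illustrative only.
import torch
from torch.utils.data import ConcatDataset, TensorDataset

first = TensorDataset(torch.arange(5))   # occupies indices 0..4 in the combined dataset
second = TensorDataset(torch.arange(3))  # occupies indices 5..7 in the combined dataset
combined = ConcatDataset([first, second])

print(combined.cumulative_sizes)              # [5, 8]
push_index_val = [0] + combined.cumulative_sizes[:-1]
print(push_index_val)                         # [0, 5]

# index 2 of the second dataset corresponds to index 2 + 5 = 7 in the combined dataset
assert combined[push_index_val[1] + 2][0] == second[2][0]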