def _generate_subset(self, x, y):
    """Return one (x, y) subset per partner, sized according to amounts_per_partner."""
    if self.partners_count == 1:
        return [(x, y)]
    else:
        # Encode labels as integers so that heterogeneous label types sort consistently
        y_encoded = LabelEncoder().fit_transform([str(label) for label in y])
        # Convert cumulative proportions into cut indices for np.split
        splitting_indices = (np.cumsum(self.amounts_per_partner)[:-1] * len(y)).astype(int)
        idxs = y_encoded.argsort()
        idx_list = np.split(idxs, splitting_indices)
        res = []
        for slice_idx in idx_list:
            res.append((x[slice_idx], y[slice_idx]))
        return res
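# Illustrative sketch of what _generate_subset computes (hypothetical values,
# not part of the class): the cut points fall at the cumulative proportions of
# the label-sorted index array, so each partner receives a contiguous slice of
# the sorted samples. With amounts_per_partner = [0.2, 0.3, 0.5] and ten
# samples, the partners receive 2, 3 and 5 samples respectively:
#
#     import numpy as np
#     amounts_per_partner = [0.2, 0.3, 0.5]
#     y = np.array([1, 0, 2, 1, 0, 2, 1, 0, 2, 1])  # hypothetical labels
#     cuts = (np.cumsum(amounts_per_partner)[:-1] * len(y)).astype(int)  # [2, 5]
#     slices = np.split(y.argsort(), cuts)  # index slices of sizes 2, 3 and 5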
def split_data(self, is_logging_enabled=True):
    """Populates the partners with their train and test data (not pre-processed)"""

    # Fetch parameters of scenario
    y_train = LabelEncoder().fit_transform([str(y) for y in self.dataset.y_train])

    # Configure the desired splitting scenario - dataset sizes:
    # partners can receive an equivalent amount of samples each, or different amounts

    # Check the proportions of samples per partner and their coherence
    assert len(self.amounts_per_partner) == self.partners_count, (
        "Error: in the provided config file, amounts_per_partner list "
        "should have a size equal to partners_count"
    )
    # np.isclose guards against floating-point rounding in the sum of proportions
    assert np.isclose(np.sum(self.amounts_per_partner), 1), (
        "Error: in the provided config file, amounts_per_partner argument: "
        "the sum of the proportions you provided isn't equal to 1"
    )

    # Transform the percentages from the scenario configuration into the
    # splitting indices to be passed to np.split
    if self.partners_count == 1:
        splitting_indices_train = 1  # np.split(arr, 1) returns [arr] unchanged
    else:
        # Cumulative proportions (all but the last) converted to integer cut indices
        splitting_indices = np.cumsum(self.amounts_per_partner)[:-1]
        splitting_indices_train = (splitting_indices * len(y_train)).astype(int)

    # Configure the desired data distribution scenario

    # In the 'stratified' scenario we sort the indexes by label
    if self.samples_split_description == "stratified":
        train_idx = y_train.argsort()

    # In the 'random' scenario we shuffle the indexes (fixed seed for reproducibility)
    elif self.samples_split_description == "random":
        train_idx = np.arange(len(y_train))
        np.random.seed(42)
        np.random.shuffle(train_idx)

    # If neither 'stratified' nor 'random', raise an exception
    else:
        raise ValueError(
            f"This samples_split option [{self.samples_split_description}] is not recognized."
        )

    # Do the partitioning among partners according to the desired scenario
    train_idx_list = np.split(train_idx, splitting_indices_train)

    # Populate partners (the loop variable is renamed so it doesn't shadow train_idx)
    for p, partner_train_idx in zip(self.partners_list, train_idx_list):

        # Populate the partner's train dataset
        p.x_train = self.dataset.x_train[partner_train_idx, :]
        p.y_train = self.dataset.y_train[partner_train_idx]

        # Create local validation and test datasets from the partner's train data
        p.x_train, p.x_test, p.y_train, p.y_test = self.dataset.train_test_split_local(
            p.x_train, p.y_train
        )
        p.x_train, p.x_val, p.y_train, p.y_val = self.dataset.train_val_split_local(
            p.x_train, p.y_train
        )

        # Update the partner's other attributes
        p.final_nb_samples = len(p.x_train)
        p.clusters_list = list(set(y_train[partner_train_idx]))

    # Check coherence of the number of mini-batches versus the smallest partner
    assert self.minibatch_count <= min(self.amounts_per_partner) * len(y_train), (
        "Error: in the provided config file and dataset, a partner doesn't have "
        "enough data samples to create the minibatches"
    )

    self.nb_samples_used = sum(len(p.x_train) for p in self.partners_list)
    self.final_relative_nb_samples = [
        p.final_nb_samples / self.nb_samples_used for p in self.partners_list
    ]

    if is_logging_enabled:
        logger.info("### Splitting data among partners:")
        logger.info("   Simple split performed.")
        logger.info(f"   Nb of samples split amongst partners: {self.nb_samples_used}")
        for partner in self.partners_list:
            logger.info(
                f"   Partner #{partner.id}: "
                f"{partner.final_nb_samples} samples "
                f"with labels {partner.clusters_list}"
            )

    return 0
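# Usage sketch (hypothetical: assumes a Scenario-like class exposing the
# attributes used above, i.e. partners_count, amounts_per_partner,
# samples_split_description, minibatch_count, dataset and partners_list):
#
#     scenario = Scenario(...)
#     scenario.split_data(is_logging_enabled=True)
#     for partner in scenario.partners_list:
#         print(partner.id, partner.final_nb_samples, partner.clusters_list)
#
# With amounts_per_partner = [0.2, 0.3, 0.5], this would log three partners
# holding roughly 20%, 30% and 50% of the train samples (minus the local
# validation and test holdouts carved out per partner).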