Example #1

import numpy as np
from sklearn.preprocessing import LabelEncoder

# The two methods below are an excerpt of a scenario class (the class
# statement and the module-level `logger` are omitted in the original listing).

    def _generate_subset(self, x, y):
        """Split (x, y) into per-partner subsets, stratified by label."""
        if self.partners_count == 1:
            return [(x, y)]
        else:
            # Encode labels as integers so argsort groups identical labels together
            y_encoded = LabelEncoder().fit_transform([str(label) for label in y])
            # Cumulative proportions -> absolute positions where np.split cuts
            splitting_indices = (np.cumsum(self.amounts_per_partner)[:-1] *
                                 len(y)).astype(int)
            idxs = y_encoded.argsort()
            idx_list = np.split(idxs, splitting_indices)
            res = []
            for slice_idx in idx_list:
                res.append((x[slice_idx], y[slice_idx]))
            return res
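
    # Worked example (illustrative values, not from the original source): with
    # x = np.arange(6), y = [0, 1, 0, 1, 0, 1], partners_count = 2 and
    # amounts_per_partner = [0.5, 0.5], the encoded labels argsort so that the
    # 0-labelled indices come before the 1-labelled ones (order within a label
    # group may vary), and the single cut at index 3 yields two
    # label-homogeneous subsets such as
    # [(x[[0, 2, 4]], y[[0, 2, 4]]), (x[[1, 3, 5]], y[[1, 3, 5]])].
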
    def split_data(self, is_logging_enabled=True):
        """Populates the partners with their train and test data (not pre-processed)"""

        # Encode the train labels as integers so they can be sorted consistently
        y_train = LabelEncoder().fit_transform([str(y) for y in self.dataset.y_train])

        # Configure the dataset sizes: each partner receives the share of the
        # samples specified in amounts_per_partner (equal shares or not)

        # Check the proportions of samples per partner for coherence
        assert len(self.amounts_per_partner) == self.partners_count, (
            "Error: in the provided config file, the amounts_per_partner list "
            "should have a length equal to partners_count"
        )
        # np.isclose guards against floating-point rounding in the sum
        assert np.isclose(np.sum(self.amounts_per_partner), 1), (
            "Error: in the provided config file, the proportions in "
            "amounts_per_partner must sum to 1"
        )

        # Transform the percentages from the scenario configuration into the
        # absolute indices at which np.split will cut the data
        if self.partners_count == 1:
            splitting_indices_train = 1
        else:
            # Cumulative proportions, dropping the trailing 1.0 (no final cut)
            splitting_indices = np.cumsum(self.amounts_per_partner)[:-1]
            splitting_indices_train = (splitting_indices * len(y_train)).astype(int)
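        # Illustration (hypothetical numbers): with amounts_per_partner =
        # [0.2, 0.3, 0.5] and len(y_train) = 100, the cumulative proportions
        # are [0.2, 0.5], so splitting_indices_train = [20, 50] and np.split
        # below yields three index arrays of sizes 20, 30 and 50.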

        # Configure the desired data distribution scenario
        # In the 'stratified' scenario we sort the indices by label
        if self.samples_split_description == "stratified":
            train_idx = y_train.argsort()

        # In the 'random' scenario we shuffle the indices
        elif self.samples_split_description == "random":
            train_idx = np.arange(len(y_train))
            np.random.seed(42)  # fixed seed: the shuffle is reproducible across runs
            np.random.shuffle(train_idx)

        # If neither 'stratified' nor 'random', we raise an exception
        else:
            raise ValueError(
                f"This samples_split option "
                f"[{self.samples_split_description}] is not recognized."
            )

        # Partition the (sorted or shuffled) indices among partners
        train_idx_idx_list = np.split(train_idx, splitting_indices_train)

        # Populate each partner with its slice of the data
        for partner_idx, partner_train_idx in enumerate(train_idx_idx_list):
            p = self.partners_list[partner_idx]

            # Select and assign the partner's train data
            p.x_train = self.dataset.x_train[partner_train_idx, :]
            p.y_train = self.dataset.y_train[partner_train_idx]

            # Carve local test and validation sets out of the partner's train data
            (
                p.x_train,
                p.x_test,
                p.y_train,
                p.y_test,
            ) = self.dataset.train_test_split_local(p.x_train, p.y_train)
            p.x_train, p.x_val, p.y_train, p.y_val = self.dataset.train_val_split_local(
                p.x_train, p.y_train
            )

            # Update the partner's bookkeeping attributes (train size after the
            # local test/val splits, and the set of encoded labels it received)
            p.final_nb_samples = len(p.x_train)
            p.clusters_list = list(set(y_train[partner_train_idx]))

        # Check that the smallest partner has enough samples for the mini-batches
        assert self.minibatch_count <= (
            min(self.amounts_per_partner) * len(y_train)
        ), (
            "Error: in the provided config file and dataset, a partner doesn't "
            "have enough data samples to create the minibatches"
        )
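        # For instance (hypothetical numbers): with minibatch_count = 10,
        # amounts_per_partner = [0.1, 0.9] and 200 train samples, the smallest
        # partner holds 20 samples, which is enough for 10 mini-batches.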

        self.nb_samples_used = sum([len(p.x_train) for p in self.partners_list])
        self.final_relative_nb_samples = [
            p.final_nb_samples / self.nb_samples_used for p in self.partners_list
        ]

        if is_logging_enabled:
            logger.info("### Splitting data among partners:")
            logger.info("   Simple split performed.")
            logger.info(
                f"   Nb of samples split amongst partners: {self.nb_samples_used}"
            )
            for partner in self.partners_list:
                logger.info(
                    f"   Partner #{partner.id}: "
                    f"{partner.final_nb_samples} samples "
                    f"with labels {partner.clusters_list}"
                )

        return 0
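
# --- Standalone sketch (added for illustration, not part of the original) ---
# A minimal, self-contained demonstration of the splitting technique used
# above: proportions become cumulative cut points for np.split, and the
# "stratified" scenario sorts the indices by label first. All names and
# values below are invented for this example.
import numpy as np

amounts_per_partner = [0.2, 0.3, 0.5]
y_train = np.repeat(np.arange(5), 20)  # 100 samples: labels 0..4, 20 of each

# Proportions -> cumulative proportions -> absolute indices where np.split cuts
splitting_indices = (np.cumsum(amounts_per_partner)[:-1] * len(y_train)).astype(int)

train_idx = y_train.argsort()  # "stratified": indices grouped by label
partitions = np.split(train_idx, splitting_indices)

print([len(part) for part in partitions])                     # -> [20, 30, 50]
print([np.unique(y_train[p]).tolist() for p in partitions])   # -> [[0], [1, 2], [2, 3, 4]]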