Example #1
def get_model_train_test_dataset_splits(self, dataset_df: pd.DataFrame) -> DatasetSplits:
    return DatasetSplits.from_proportions(
        df=dataset_df,
        proportion_train=0.7,
        proportion_test=0.2,
        proportion_val=0.1,
    )
Example #2
def get_model_train_test_dataset_splits(
        self, dataset_df: pd.DataFrame) -> DatasetSplits:
    return DatasetSplits.from_proportions(dataset_df,
                                          proportion_train=0.8,
                                          proportion_val=0.05,
                                          proportion_test=0.15,
                                          random_seed=0)
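
All of the snippets above delegate to DatasetSplits.from_proportions, so a small, hypothetical invocation may help show what the method expects. The toy frame, the "subject" column name (assumed to match the default subject_column), and the import path are illustrative assumptions, not taken from the examples.

import pandas as pd
from InnerEye.ML.utils.split_dataset import DatasetSplits  # assumed import path

# Hypothetical toy dataset: one row per image, keyed by a subject ID column.
toy_df = pd.DataFrame({"subject": [str(i) for i in range(10)],
                       "filePath": [f"img_{i}.nii.gz" for i in range(10)]})

splits = DatasetSplits.from_proportions(toy_df,
                                        proportion_train=0.8,
                                        proportion_test=0.1,
                                        proportion_val=0.1,
                                        random_seed=0)
# The train/val/test attributes are DataFrames, as used in the test examples below.
print(len(splits.train), len(splits.val), len(splits.test))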
Example #3
    def get_model_train_test_dataset_splits(self, dataset_df: pd.DataFrame) -> DatasetSplits:
        if self.test_set_ids_csv:
            test_set_ids_csv = self.local_dataset / self.test_set_ids_csv
            test_series = pd.read_csv(test_set_ids_csv).series

            all_series = dataset_df.series.values
            check_all_test_series = all(test_series.isin(all_series))
            if not check_all_test_series:
                raise ValueError(f"Not all test series from {test_set_ids_csv} were found in the dataset.")

            test_set_subjects = dataset_df[dataset_df.series.isin(test_series)].subject.values
            train_and_val_series = dataset_df[~dataset_df.subject.isin(test_set_subjects)].series.values
            # Fix the RNG so the train/val assignment is reproducible.
            random.seed(42)
            random.shuffle(train_and_val_series)
            # Hold out 1/9 of the remaining (non-test) series for validation: with a
            # test set of roughly 10% of the data this gives an overall split close
            # to the 80/10/10 used in the fallback branch below.
            num_val_samples = math.floor(len(train_and_val_series) / 9)
            val_series = train_and_val_series[:num_val_samples]
            train_series = train_and_val_series[num_val_samples:]

            logging.info(f"Dropped {len(all_series) - (len(test_series) + len(train_and_val_series))} series "
                         f"due to subject overlap with test set.")
            return DatasetSplits.from_subject_ids(dataset_df,
                                                  train_ids=train_series,
                                                  val_ids=val_series,
                                                  test_ids=test_series,
                                                  subject_column="series",
                                                  group_column="subject")
        else:
            return DatasetSplits.from_proportions(dataset_df,
                                                  proportion_train=0.8,
                                                  proportion_val=0.1,
                                                  proportion_test=0.1,
                                                  subject_column="series",
                                                  group_column="subject",
                                                  shuffle=True)
Example #4
def get_model_train_test_dataset_splits(
        self, dataset_df: pd.DataFrame) -> DatasetSplits:
    return DatasetSplits.from_proportions(
        df=dataset_df,
        proportion_train=0.7,
        proportion_test=0.2,
        proportion_val=0.1,
        random_seed=1,
        subject_column=self.subject_column)
Example #5
def test_grouped_splits(group_column: str) -> None:
    test_df = _get_test_df()[0]
    proportions = [0.5, 0.4, 0.1]
    splits = DatasetSplits.from_proportions(test_df,
                                            proportions[0],
                                            proportions[1],
                                            proportions[2],
                                            group_column=group_column)
    _check_is_partition(test_df, [splits.train, splits.test, splits.val],
                        CSV_SUBJECT_HEADER)
    _check_is_partition(test_df, [splits.train, splits.test, splits.val],
                        group_column)
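
Examples #5 and #6 lean on two helpers, _get_test_df and _check_is_partition, that are not shown here. Below is a plausible sketch of what _check_is_partition asserts (the subsets jointly cover every value of the given column, with no overlap), offered as an assumption about its behaviour rather than the actual test helper.

from typing import List
import pandas as pd

def _check_is_partition(df: pd.DataFrame, subsets: List[pd.DataFrame], column: str) -> None:
    # Assumed behaviour: the subsets must jointly cover all values of `column` in df...
    all_values = set(df[column])
    subset_values = [set(s[column]) for s in subsets]
    assert set.union(*subset_values) == all_values
    # ...and no value may appear in more than one subset.
    for i in range(len(subset_values)):
        for j in range(i + 1, len(subset_values)):
            assert not (subset_values[i] & subset_values[j])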
Example #6
def test_grouped_k_fold_cross_validation_splits(group_column: str) -> None:
    test_df = _get_test_df()[0]
    proportions = [0.5, 0.4, 0.1]
    splits = DatasetSplits.from_proportions(test_df,
                                            proportions[0],
                                            proportions[1],
                                            proportions[2],
                                            group_column=group_column)

    n_splits = 7  # mutually prime with numbers of subjects and groups
    val_folds = []
    for fold in splits.get_k_fold_cross_validation_splits(n_splits):
        _check_is_partition(test_df, [fold.train, fold.test, fold.val],
                            CSV_SUBJECT_HEADER)
        _check_is_partition(test_df, [fold.train, fold.test, fold.val],
                            group_column)
        assert fold.test.equals(splits.test)
        val_folds.append(fold.val)

    # ensure validation folds partition the original train+val set
    train_val = pd.concat([splits.train, splits.val])
    _check_is_partition(train_val, val_folds, CSV_SUBJECT_HEADER)
    _check_is_partition(train_val, val_folds, group_column)
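
The assertions in Example #6 encode two properties of get_k_fold_cross_validation_splits: the test set stays fixed across folds (fold.test.equals(splits.test)), while the per-fold validation sets jointly partition the original train+val pool, both by subject and by group.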