Example #1
0
def iterative_train_test_split(X: pd.Series,
                               y: np.ndarray,
                               train_size: float = 0.7) -> Tuple:
    """Stratified two-way split of a multi-label dataset.

    Uses first-order iterative stratification so that single-label
    frequencies stay balanced across the two resulting partitions.

    Args:
        X (pd.Series): Input features as a pandas Series object.
        y (np.ndarray): One-hot encoded labels.
        train_size (float, optional): Proportion of data for first split. Defaults to 0.7.

    Returns:
        Two stratified splits based on specified proportions.
    """
    fold_proportions = [1.0 - train_size, train_size]
    splitter = IterativeStratification(
        n_splits=2,
        order=1,
        sample_distribution_per_fold=fold_proportions,
    )
    # Only the first (train, test) index pair is needed.
    train_idx, test_idx = next(splitter.split(X, y))
    return X[train_idx], X[test_idx], y[train_idx], y[test_idx]
Example #2
0
def portion_split(data, portion, seed=1337, labels=None, label_info=None):
    """Perform a k% split to train-validation instances"""

    msg = f"Portion-splitting with input data: {len(data)} samples on a {portion} validation portion"
    if labels is None:
        # Unlabelled data: a plain random shuffle split suffices.
        info(msg)
        splitter = ShuffleSplit(n_splits=1, test_size=portion,
                                random_state=seed)
        return list(splitter.split(data))
    num_labels = len(label_info.label_names)
    if label_info.multilabel:
        # Multi-label: iterative stratification on one-hot labels.
        stratifier = IterativeStratification(
            n_splits=2,
            order=2,
            sample_distribution_per_fold=[portion, 1.0 - portion])
        labels = one_hot(labels, num_labels, True)
        info(msg + " using iterative stratification.")
        train_indexes, test_indexes = next(
            stratifier.split(np.zeros(len(data)), labels))
        return [(train_indexes, test_indexes)]
    try:
        # Single-label: standard stratified shuffle split.
        info(msg + " using stratification.")
        splitter = StratifiedShuffleSplit(n_splits=1,
                                          test_size=portion,
                                          random_state=seed)
        return list(splitter.split(data, labels))
    except ValueError as ve:
        # Best effort: log and fall through (implicitly returns None).
        error(f"Unable to complete a stratified split: {ve}")
Example #3
0
def multilabel_pipeline_cross_val(pipeline,
                                  X,
                                  y,
                                  labels=None,
                                  n_splits=3,
                                  verbose=0):
    """Multi-label pipeline cross-validation.

    Builds out-of-fold predictions for every sample, picks per-label
    decision thresholds on each validation fold, and evaluates the
    averaged thresholds on the full prediction matrix.

    Parameters
    ----------
    pipeline : `sklearn.pipeline.Pipeline` or custom pipeline
        Must have .fit and .predict methods

    X : array-like

    y : array-like
        (n_samples x n_labels)

    labels : array-like
        Label names (numerical if Default = None)

    n_splits : int
        Number of cross-validation splits (Default = 3)

    verbose : int
        0 = silent, 1 = per-fold reports, >1 = full per-fold reports

    Returns
    -------
    mlc : `multilabel.MultiLabelClassification`
        Multi-label classification results over all out-of-fold predictions
    """
    # First-order iterative stratification keeps per-label frequencies
    # balanced across folds.
    kfold = IterativeStratification(n_splits=n_splits,
                                    order=1,
                                    random_state=None)
    # Out-of-fold predictions for every sample, filled fold by fold.
    pred = np.zeros_like(y, dtype=float)
    # One threshold column per fold, one row per label.
    thresh_folds = np.zeros((y.shape[1], n_splits))
    for i, (train_idx, valid_idx) in enumerate(kfold.split(X, y)):
        if verbose > 0:
            print(f"\n--------\nFold {i+1}/{kfold.n_splits}")
        X_train, y_train = X[train_idx], y[train_idx]
        X_valid, y_valid = X[valid_idx], y[valid_idx]
        pipeline.fit(X_train, y_train, labels=labels, verbose=verbose)
        valid_pred = pipeline.predict(X_valid)
        pred[valid_idx] = valid_pred
        mlc_valid = MultiLabelClassification(y_valid,
                                             valid_pred,
                                             labels=labels)
        # Per-label thresholds maximizing the geometric mean on this fold.
        thresh_folds[:, i] = mlc_valid.best_thresholds('gmean')
        if verbose > 0:
            mlc_valid.print_report(full=(verbose > 1))
    # Final per-label thresholds: the mean across folds.
    threshold = thresh_folds.mean(axis=1)
    mlc = MultiLabelClassification(y,
                                   pred=pred,
                                   labels=labels,
                                   threshold=threshold)
    if verbose > 0:
        print("\n------------------------\nCross-validation results")
        mlc.print_report(full=True)  #(verbose > 1))
    return mlc
def get_test_train_split(xml_files, classes, classes_dict, test_size,
                         test_train_split):
    """Split annotation XML files into train and test file lists.

    Args:
        xml_files: Paths to annotation XML files.
        classes: Class names (only the count is used for the label matrix).
        classes_dict: Mapping from class name to label-column index.
        test_size: Fraction of files to reserve for testing.
        test_train_split: Strategy, either 'stratified' or 'sequential'.

    Returns:
        (train_xml_files, test_xml_files) lists of paths.

    Raises:
        ValueError: If ``test_train_split`` is not a known strategy.
    """
    if test_train_split == 'stratified':
        # Build a binary (files x classes) label matrix from the annotations.
        label_array = np.zeros((len(xml_files), len(classes)))
        for i, file in enumerate(xml_files):
            labels, _, _ = read_content(file)
            sparse_labels = list(map(lambda x: classes_dict[x], labels))
            label_array[i, sparse_labels] = 1

        # 1/test_size folds; only the first (train, test) pair is consumed.
        kf = IterativeStratification(n_splits=int(1 / test_size))
        train, test = next(kf.split(xml_files, label_array))

        train_xml_files = np.array(xml_files)[train].tolist()
        test_xml_files = np.array(xml_files)[test].tolist()
        return train_xml_files, test_xml_files

    if test_train_split == 'sequential':
        # Sort by the numeric part of the filename, then cut once.
        xml_files = sorted(xml_files,
                           key=lambda x: int(''.join(
                               filter(str.isdigit, os.path.basename(x)))))
        split = int(len(xml_files) * (1 - test_size))
        train_xml_files = np.array(xml_files)[:split].tolist()
        test_xml_files = np.array(xml_files)[split:].tolist()
        return train_xml_files, test_xml_files

    # Previously an unknown strategy silently returned None; fail loudly.
    raise ValueError(
        f"Unknown test_train_split strategy: {test_train_split!r}")
Example #5
0
def split_data(X, Y, n_splits=5, output_dir="splits"):
    """Write n_splits stratified train/test splits of (X, Y) to disk."""
    stratifier = IterativeStratification(
        n_splits=n_splits,
        order=1,
        random_state=0,
    )

    for fold, (train_idx, test_idx) in enumerate(stratifier.split(X, Y)):
        print("processing fold", fold + 1, "/", n_splits)

        X_train, X_test = X[train_idx], X[test_idx]
        Y_train, Y_test = Y[train_idx], Y[test_idx]

        # Sanity checks: no label row may be all-positive or all-negative.
        for label_block in (Y_train, Y_test):
            assert not label_block.all(axis=-1).any()
            assert not (1 - label_block).all(axis=-1).any()

        fold_dir = os.path.join(output_dir, "split_{}".format(fold))
        os.makedirs(fold_dir, exist_ok=True)

        write_smiles(X_train, os.path.join(fold_dir, "train.smi"))
        write_smiles(X_test, os.path.join(fold_dir, "test.smi"))

        sp.save_npz(os.path.join(fold_dir, "train.npz"),
                    sp.csr_matrix(Y_train))
        sp.save_npz(os.path.join(fold_dir, "test.npz"), sp.csr_matrix(Y_test))
Example #6
0
def iterative_train_test_split(X, y, test_size, order=2, random_state=None):
    """Iteratively stratified train/test split.

    Parameters
    ----------
    X : array-like
        2-D feature matrix (rows are samples).
    y : array-like
        2-D binary label matrix aligned with X.
    test_size : float, [0,1]
        the proportion of the dataset to include in the test split, the rest will be put in the train set
    order : int
        label-interaction order considered by the stratifier (default 2)
    random_state : int or None
        seed forwarded to the stratifier

    Returns
    -------
    X_train, y_train, X_test, y_test
        stratified division into train/test split
    """

    # Two folds sized [test_size, 1 - test_size]; the first (train, test)
    # index pair produced by split() is used.
    stratifier = IterativeStratification(
        n_splits=2,
        order=order,
        sample_distribution_per_fold=[test_size, 1.0 - test_size],
        random_state=random_state)
    train_indexes, test_indexes = next(stratifier.split(X, y))

    # Row selection: both X and y must support 2-D indexing (e.g. ndarray).
    X_train, y_train = X[train_indexes, :], y[train_indexes, :]
    X_test, y_test = X[test_indexes, :], y[test_indexes, :]

    return X_train, y_train, X_test, y_test
def _multilabel_stratified_kfold_dfs():
    """Yield (train_df, val_df) pairs from a 3-fold multilabel stratified split."""
    train_df = get_train_df()
    label_mat = multilabel_binary_representation(train_df, sparse=True)

    splitter = IterativeStratification(random_state=1234)  # k=3
    for tr_idx, va_idx in splitter.split(train_df.index.values, label_mat):
        yield train_df.iloc[tr_idx], train_df.iloc[va_idx]
def stratify_train_test(y_label, n_splits=10, seed=42):
    """Yield (train_nodes, test_nodes) lists per stratified fold of a multilabel index."""
    binarized = MultiLabelBinarizer().fit_transform(y_label)

    splitter = IterativeStratification(n_splits=n_splits, order=1, random_state=seed)
    nodes = y_label.index
    for train, test in splitter.split(nodes.to_list(), sps.lil_matrix(binarized)):
        print("train", len(train), "test", len(test))
        yield list(nodes[train]), list(nodes[test])
Example #9
0
def iterative_train_test_split(X, y, test_size):
    """Return (X_train, X_test) from a 2-fold iterative stratification on y."""
    splitter = IterativeStratification(
        n_splits=2,
        order=2,
        sample_distribution_per_fold=[test_size, 1.0 - test_size])
    train_idx, test_idx = next(splitter.split(X, y))
    # Note: only the feature partitions are returned, not the label rows.
    return X[train_idx], X[test_idx]
def gen_folds(df, img_mat, target_mat, n_folds):
    'Return dataframe with folds column'
    splitter = IterativeStratification(n_splits=n_folds, order=1)
    df['fold'] = 0  # Generate folds column
    # Stamp each validation fold's images with their fold number.
    for fold, (_, fold_idxs) in enumerate(splitter.split(img_mat, target_mat)):
        valid_imgs = img_mat[fold_idxs]
        df.loc[df['cell_id'].isin(valid_imgs.reshape(-1)), 'fold'] = fold
    return df
Example #11
0
def kfold(train_df, targets_df):
    """Assign a fold index (0..KFOLD_NUMBER-1) to each row of train_df.

    NOTE(review): train_df is shuffled below, but targets_df is NOT
    reordered to match — the stratification labels may no longer be
    aligned with the shuffled rows. Confirm alignment against the caller.
    """
    train_df['kfold'] = -1

    train_df = train_df.sample(frac=1).reset_index(drop=True)

    # 'sig_id' is dropped as the identifier column; the rest are targets.
    k_fold = IterativeStratification(n_splits=config.KFOLD_NUMBER, order=1)
    for f, (t_, v_) in enumerate(
            k_fold.split(X=train_df, y=targets_df.drop('sig_id', axis=1))):
        train_df.loc[v_, 'kfold'] = f

    return train_df
Example #12
0
def objective(params):
    """Hyperopt-style objective: mean K-fold F1-micro reported as a loss.

    Relies on module-level globals for file paths, fold count K and the
    trials object; writes each fold's documents to temporary pickle files
    that train_eval() reads via the shared config file.
    """
    # objective fn to be minimized
    global train_path, test_path, label_to_idx_path, K, config_path, trials

    # get stratisfied split
    df = docs_to_sheet(train_path, "tmp.csv", label_to_idx_path)
    df.drop(columns=["text"], inplace=True)
    df.reset_index(inplace=True)

    # hacky way to make use of SkMultiLearn
    X = df.index
    y = df[[col for col in df.columns if col != "index"]].values
    del df

    k_fold = IterativeStratification(n_splits=K, order=1)

    # get docs
    with open(train_path, "rb") as f:
        docs = pickle.load(f)

    scores = []
    tmp_tr_path = "temp_train.pkl"
    tmp_dev_path = "temp_dev.pkl"
    params["train_path"] = tmp_tr_path
    params["dev_path"] = tmp_dev_path
    params["test_path"] = test_path
    # Persist the hyperparameters so train_eval() picks them up.
    set_params(params, config_path)

    for train_idx, dev_idx in k_fold.split(X, y):
        # get split
        train_docs = [docs[i] for i in train_idx]
        dev_docs = [docs[i] for i in dev_idx]
        # save docs in temp location and free memory
        with open(tmp_tr_path, "wb") as f:
            pickle.dump(train_docs, f)

        with open(tmp_dev_path, "wb") as f:
            pickle.dump(dev_docs, f)

        del train_docs, dev_docs
        gc.collect()

        # call main
        r_k, p_k, rp_k, ndcg_k, avg_loss, hamming, emr, f1_micro, f1_macro = train_eval(
            False)
        scores.append(f1_micro)

    # save trials object for safety
    with open("trials_tmp.pkl", "wb") as f:
        pickle.dump(trials, f)

    # hyperopt minimizes, so report 1 - mean F1-micro.
    return {"loss": 1 - np.mean(scores), "status": STATUS_OK}
Example #13
0
def binary_split(X, Y, split, order=2):
    """Stratified two-way split of (X, Y) with given size proportions.

    Args:
        X: 2-D feature array (rows are samples).
        Y: 2-D binary label array aligned with X.
        split: Pair of relative sizes for the two parts; any array-like of
            two positive numbers (normalized to sum to 1 here).
        order: Label-interaction order for the stratifier (default 2).

    Returns:
        ((X1, Y1), (X2, Y2)): two (features, labels) tuples whose relative
        sizes follow the order given in ``split``.
    """
    # Accept plain lists/tuples as well as ndarrays (previously a list
    # argument crashed on split.sum()), then normalize to proportions.
    split = np.asarray(split, dtype=float)
    split = split / split.sum()
    strat = IterativeStratification(
        order=order,
        n_splits=len(split),
        sample_distribution_per_fold=split.tolist())
    idx1, idx2 = next(strat.split(X, Y))
    # The stratifier does not guarantee fold order; swap if the realized
    # sizes disagree with the requested proportions.
    if np.sign(split[0] - split[1]) != np.sign(len(idx1) - len(idx2)):
        idx1, idx2 = idx2, idx1
    set1 = X[idx1, :], Y[idx1, :]
    set2 = X[idx2, :], Y[idx2, :]
    return set1, set2
Example #14
0
def main():
    """Compute 8-fold stratified splits of the training set and save the indexes."""
    images, labels = load_data(dataset="train")

    splitter = IterativeStratification(n_splits=8, random_state=1769)

    for fold, (train_indexes, test_indexes) in enumerate(
            splitter.split(X=images, y=labels)):
        print(train_indexes)
        print(test_indexes)

        out_path = os.path.join(DATA_DIR, "KFold_{}".format(fold))
        np.savez(file=out_path,
                 train_indexes=train_indexes,
                 test_indexes=test_indexes)
Example #15
0
    def _iterative_train_test_split(self, X, y):
        """Split X and y into stratified train (75%) and test (25%) sets.

        Args:
            X: The samples as :class:`list`.
            y: The one hot encoded labels for as :class:`list`.

        Returns:
            The trainings data X_train, y_train and testing data X_test, y_test as :class:`list`.
        """
        splitter = IterativeStratification(
            n_splits=2, order=2, sample_distribution_per_fold=[0.25, 0.75])
        train_idx, test_idx = next(splitter.split(X, y))
        return X[train_idx], y[train_idx, :], X[test_idx], y[test_idx, :]
Example #16
0
def get_split(test_size=0.2):
    """Return (train_set, val_set) id lists from a stratified split of LABELS."""
    data = pd.read_csv(LABELS)
    data = data.apply(convert_targets, axis=1)
    splitter = IterativeStratification(
        n_splits=2,
        order=2,
        sample_distribution_per_fold=[test_size, 1.0 - test_size])
    train_names = list(data['Id'])
    y = np.array(list(data['Target'].values))
    # Only the first (train, val) index pair is needed.
    train_idx, val_idx = next(splitter.split(train_names, y))
    train_set = [train_names[i] for i in train_idx]
    val_set = [train_names[i] for i in val_idx]
    return train_set, val_set
    def test_if_positive_evidence_does_not_include_negative_evidence(self):
        """After positive-evidence distribution, only the all-negative row
        stays unused and every per-combination desirability reaches zero."""
        stratifier = IterativeStratification(n_splits=2, order=1)
        y = np.matrix([[0, 0], [1, 0], [0, 1], [1, 1]])

        rows, rows_used, all_combinations, per_row_combinations, samples_with_combination, folds = \
            stratifier._prepare_stratification(y)

        stratifier._distribute_positive_evidence(rows_used, folds, samples_with_combination, per_row_combinations)
        # Row 0 carries no positive labels, so it must remain unassigned here.
        self.assertFalse(rows_used[0])
        self.assertTrue(rows_used[1])
        self.assertTrue(rows_used[2])
        self.assertTrue(rows_used[3])

        # Every positive combination should be fully consumed.
        for combination, samples in stratifier.desired_samples_per_combination_per_fold.items():
            for desire in samples:
                self.assertEqual(desire, 0)
Example #18
0
def evaluate_kfold_label_classification(embedding, labels, k=10):
    """Mean micro/macro F1 over k stratified folds of a label-classification probe."""
    assert len(labels.shape) == 2

    model = LogisticRegressionCV(n_jobs=-1)
    #model=SVC(gamma='auto')
    if labels.shape[1] == 1:
        print("single label clasification")
        labels = labels.flatten()
        sss = StratifiedKFold(n_splits=k, shuffle=True, random_state=0)

    else:
        print("multi-label classification")
        sss = IterativeStratification(n_splits=k, random_state=0, order=2)
        model = OneVsRestClassifier(model)

    micro_scores = []
    macro_scores = []

    for fold, (train_idx, test_idx) in enumerate(
            sss.split(embedding, labels), start=1):
        model.fit(embedding[train_idx], labels[train_idx])
        predictions = model.predict(embedding[test_idx])
        micro_scores.append(
            f1_score(labels[test_idx], predictions, average="micro"))
        macro_scores.append(
            f1_score(labels[test_idx], predictions, average="macro"))
        print("Done {}/{} folds".format(fold, k))
    return np.mean(micro_scores), np.mean(macro_scores)
Example #19
0
def stratified_fold_split_for_rare(
        rare_samples,
        n_splits=5,
        interaction_order=1,
        random_state=42,
        least_representative_cols=("question_type_spelling", ),
):
    """Return per-fold index arrays for rare samples, stratified on the
    ordinal encodings of the least-representative target columns."""
    rare_ordinals = transform_target_columns_to_ordinals(
        rare_samples[list(least_representative_cols)])

    splitter = IterativeStratification(n_splits=n_splits,
                                       order=interaction_order,
                                       random_state=random_state)
    # Collect each fold's held-out ids, mapped back to the original index.
    return [
        rare_samples.index.values[fold_ids]
        for _, fold_ids in splitter.split(rare_ordinals, rare_ordinals)
    ]
    def test_if_variables_are_initialized_correctly(self):
        """_prepare_stratification() must size all bookkeeping structures
        correctly for a 4-sample, 2-label input split into 2 equal folds."""
        stratifier = IterativeStratification(n_splits=2, order=1)
        y = np.matrix([[0, 0], [1, 0], [0, 1], [1, 1]])

        rows, rows_used, all_combinations, per_row_combinations, samples_with_combination, folds = \
            stratifier._prepare_stratification(y)

        self.assertEqual(stratifier.n_samples, 4)
        self.assertEqual(stratifier.n_labels, 2)
        self.assertEqual(len(rows), 4)
        self.assertEqual(len(rows_used), 4)
        self.assertEqual(len(stratifier.percentage_per_fold), 2)
        self.assertEqual(len(stratifier.desired_samples_per_fold), 2)
        self.assertEqual(len(folds), 2)
        # No row may be marked as used before distribution starts.
        self.assertTrue(not any(rows_used.values()))
        self.assertFalse(any(rows_used.values()))
        self.assertEqual(stratifier.order, 1)

        # Two equal folds: 50% each, hence 2 desired samples per fold.
        for d in stratifier.percentage_per_fold:
            self.assertEqual(d, 1 / 2.0)

        for d in stratifier.desired_samples_per_fold:
            self.assertEqual(d, y.shape[0] / 2.0)

        # Order-1 combinations are single labels; each positive label
        # appears in exactly 2 of the 4 rows.
        self.assertEqual(len(all_combinations), 2)
        self.assertEqual(len(per_row_combinations[0]), 0)
        self.assertEqual(len(per_row_combinations[1]), 1)
        self.assertEqual(len(per_row_combinations[2]), 1)
        self.assertEqual(len(per_row_combinations[3]), 2)

        self.assertEqual(len(samples_with_combination), 2)
        self.assertEqual(
            len(stratifier.desired_samples_per_combination_per_fold), 2)
        for combination, samples in samples_with_combination.items():
            self.assertEqual(len(set(combination)), 1)
            self.assertEqual(len(samples), 2)

        # Each combination wants 1 sample in each of the 2 folds.
        for combination, desirability in stratifier.desired_samples_per_combination_per_fold.items(
        ):
            self.assertEqual(len(set(combination)), 1)
            self.assertEqual(len(desirability), 2)
            for desire in desirability:
                self.assertEqual(desire, 1.0)
Example #21
0
def load_datasets(path, drop_missing=True, n_tags=72,
                  test_size=0.2, random_state=42):
    """Load and split dataset from raw CiP data.

    Args:
        path: Path to raw CiP dataset
        drop_missing: Drop events with no description or title
        n_tags: Number of top tags to keep (passed to calculate_top_tags)
        test_size: Fraction of events to include in test set
        random_state: Random state for the split

    Returns:
        (events_train, tags_train, events_test, tags_test, top_tags,
            tags_train_stats)
    """
    events_df, tags_df = load_raw_normalized_dataset(path,
                                                     drop_missing=drop_missing)
    top_tags = calculate_top_tags(tags_df, n_tags=n_tags)

    # Only keep top tags
    tags_df = tags_df[tags_df['tag'].isin(top_tags)]

    # Binary (events x tags) indicator matrix aligned with events_df rows.
    tag_matrix = tags_to_matrix(events_df, tags_df, top_tags)

    # Split data into public training set and private test set
    stratifier = IterativeStratification(
        n_splits=2, order=2,
        sample_distribution_per_fold=[test_size, 1.0 - test_size],
        random_state=random_state)
    train_indices, test_indices = next(stratifier.split(events_df, tag_matrix))
    events_train, tags_train = events_df.iloc[train_indices], \
                               tag_matrix[train_indices, :]

    events_test, tags_test = events_df.iloc[test_indices], \
                             tag_matrix[test_indices, :]

    # Per-tag positive counts in the training split, most frequent first.
    tags_train_stats = pd.DataFrame({
        'tag': top_tags,
        'count': tags_train.sum(axis=0)
    }).sort_values('count', ascending=False)

    return (events_train, tags_train, events_test, tags_test, top_tags,
            tags_train_stats)
    def test_if_positive_evidence_does_not_include_negative_evidence(self):
        """Distributing positive evidence must leave only the all-zero row
        unused and drain every per-combination desirability to zero."""
        stratifier = IterativeStratification(n_splits=2, order=1)
        y = np.matrix([[0, 0], [1, 0], [0, 1], [1, 1]])

        rows, rows_used, all_combinations, per_row_combinations, samples_with_combination, folds = \
            stratifier._prepare_stratification(y)

        stratifier._distribute_positive_evidence(rows_used, folds,
                                                 samples_with_combination,
                                                 per_row_combinations)
        # Row 0 carries no positive labels and is never assigned here.
        self.assertFalse(rows_used[0])
        self.assertTrue(rows_used[1])
        self.assertTrue(rows_used[2])
        self.assertTrue(rows_used[3])

        for combination, samples in stratifier.desired_samples_per_combination_per_fold.items(
        ):
            for desire in samples:
                self.assertEqual(desire, 0)
Example #23
0
    def run(self, labels_csv, models_dir):
        """Train one model per stratified CV fold of the labelled dataset."""
        dataset = pd.read_csv(labels_csv).values.tolist()

        splits = []
        # Label columns start at position 4 of each CSV row.
        labels = np.array([row[4:] for row in dataset])
        k_fold = IterativeStratification(
            n_splits=self.cv, order=1, random_state=325
        )

        for fold, (train_idx, valid_idx) in enumerate(
                k_fold.split(dataset, labels)):
            trainset = [dataset[k] for k in train_idx]
            validset = [dataset[k] for k in valid_idx]
            splits.append([trainset, validset])

            model_dir = self.__model_dir(models_dir, fold + 1)
            self.__train(trainset, validset, model_dir)

        return
    def test_if_variables_are_initialized_correctly(self):
        """_prepare_stratification() must size all bookkeeping structures
        correctly for a 4-sample, 2-label input split into 2 equal folds."""
        stratifier = IterativeStratification(n_splits=2, order=1)
        y = np.matrix([[0, 0], [1, 0], [0, 1], [1, 1]])

        rows, rows_used, all_combinations, per_row_combinations, samples_with_combination, folds = \
            stratifier._prepare_stratification(y)

        self.assertEqual(stratifier.n_samples, 4)
        self.assertEqual(stratifier.n_labels, 2)
        self.assertEqual(len(rows), 4)
        self.assertEqual(len(rows_used), 4)
        self.assertEqual(len(stratifier.percentage_per_fold), 2)
        self.assertEqual(len(stratifier.desired_samples_per_fold), 2)
        self.assertEqual(len(folds), 2)
        # No row may be marked as used before distribution starts.
        self.assertTrue(not any(rows_used.values()))
        self.assertFalse(any(rows_used.values()))
        self.assertEqual(stratifier.order, 1)

        # Two equal folds: 50% each, hence 2 desired samples per fold.
        for d in stratifier.percentage_per_fold:
            self.assertEqual(d, 1 / 2.0)

        for d in stratifier.desired_samples_per_fold:
            self.assertEqual(d, y.shape[0] / 2.0)

        # Order-1 combinations are single labels; each positive label
        # appears in exactly 2 of the 4 rows.
        self.assertEqual(len(all_combinations), 2)
        self.assertEqual(len(per_row_combinations[0]), 0)
        self.assertEqual(len(per_row_combinations[1]), 1)
        self.assertEqual(len(per_row_combinations[2]), 1)
        self.assertEqual(len(per_row_combinations[3]), 2)

        self.assertEqual(len(samples_with_combination), 2)
        self.assertEqual(len(stratifier.desired_samples_per_combination_per_fold), 2)
        for combination, samples in samples_with_combination.items():
            self.assertEqual(len(set(combination)), 1)
            self.assertEqual(len(samples), 2)

        # Each combination wants 1 sample in each of the 2 folds.
        for combination, desirability in stratifier.desired_samples_per_combination_per_fold.items():
            self.assertEqual(len(set(combination)), 1)
            self.assertEqual(len(desirability), 2)
            for desire in desirability:
                self.assertEqual(desire, 1.0)
Example #25
0
    def run(self, labels2_csv, labels1_csv, models_dir):
        """Cross-validate on dataset 2, folding all of dataset 1 into every train set."""
        dataset2 = pd.read_csv(labels2_csv).values.tolist()
        dataset1 = pd.read_csv(labels1_csv).values.tolist()

        # Label columns start at position 5 of each dataset-2 row.
        labels2 = np.array([row[5:] for row in dataset2])
        k_fold = IterativeStratification(n_splits=self.cv,
                                         order=1,
                                         random_state=325)

        for fold, (train_idx, valid_idx) in enumerate(
                k_fold.split(dataset2, labels2)):
            # Each fold trains on its dataset-2 portion plus all of dataset 1.
            trainset = [dataset2[k] for k in train_idx] + dataset1
            validset = [dataset2[k] for k in valid_idx]

            model_dir = self.__model_dir(models_dir, fold + 1)
            self.__train(trainset, validset, model_dir)

        return
Example #26
0
def stratified_fold_split_for_common(
    common_samples,
    n_splits=5,
    interaction_order=1,
    random_state=42,
    agg_func=pd.Series.mode,
):
    """Group-aware stratified fold split for common samples.

    Samples sharing the same question body are grouped, their ordinal
    targets are aggregated per group with ``agg_func``, and stratification
    is performed at group level so duplicates never straddle folds.

    Returns:
        List of index arrays (one per fold) into ``common_samples``.
    """
    # Assign a numeric group id to every distinct question body.
    body_encoder = LabelEncoder()
    common_samples["group_id"] = body_encoder.fit_transform(
        common_samples["question_body"].astype(str))

    common_ordinals = transform_target_columns_to_ordinals(common_samples)
    common_ordinals["group_id"] = common_samples["group_id"]

    # Aggregate each group's ordinals in parallel (one row per group).
    common_groups = common_ordinals.groupby(["group_id"])
    with Pool(cpu_count()) as pool:
        aggregated_common_ordinals = list(
            tqdm(
                pool.imap(
                    functools.partial(aggregate_ordinals, agg_func=agg_func),
                    common_groups,
                ),
                total=len(common_groups),
                desc="Aggregate ordinals over groups",
            ))
    aggregated_common_ordinals = pd.concat(aggregated_common_ordinals,
                                           axis=1).transpose()
    aggregated_common_ordinals.index.rename("group_id", inplace=True)

    k_fold = IterativeStratification(n_splits=n_splits,
                                     order=interaction_order,
                                     random_state=random_state)
    folds_common = []
    # Stratify over groups, then expand each fold's groups back to samples.
    for _, fold_groups in k_fold.split(aggregated_common_ordinals,
                                       aggregated_common_ordinals):
        fold_mask = common_ordinals["group_id"].isin(fold_groups)
        fold_ids = common_ordinals.index.values[fold_mask]
        folds_common.append(fold_ids)

    return folds_common
Example #27
0
    def split_stratified(self, dataset):
        """Stratified train/dev/test split of a list of {"Y": ...} samples.

        Uses self.args["split"]["percentage"] as the test fraction, then
        carves a dev set out of the remaining data.

        Returns:
            (train, dev, test) as plain lists of samples.
        """
        Y = np.array([sample["Y"] for sample in dataset])
        dataset = np.array(dataset)

        # First split: hold out `percentage` of the data as the test set.
        percentage = self.args["split"]["percentage"]
        stratifier = IterativeStratification(
            n_splits=2,
            order=2,
            sample_distribution_per_fold=[percentage, 1.0 - percentage])
        remaining_idx, test_idx = next(stratifier.split(dataset, Y))

        X_test = dataset[test_idx]
        dataset = dataset[remaining_idx]
        Y = Y[remaining_idx]

        # Rescale the fraction relative to what remains — presumably so the
        # dev set matches the test set's absolute size. TODO confirm intent.
        percentage = percentage / (1.0 - percentage)
        stratifier = IterativeStratification(
            n_splits=2,
            order=2,
            sample_distribution_per_fold=[percentage, 1.0 - percentage])
        train_idx, dev_idx = next(stratifier.split(dataset, Y))

        X_train = dataset[train_idx]
        X_dev = dataset[dev_idx]

        return list(X_train), list(X_dev), list(X_test)
Example #28
0
def split(dataset_path, test_size, stratification):
    """Split the train CSV into train and validation sets.

    Args:
        dataset_path: Directory containing the 'train' CSV.
        test_size: Fraction of rows to place in the validation set.
        stratification: One of 'sklearn', 'sklearn_stratified',
            'iterstrat' or 'skmultilearn'.

    Returns:
        (train_set, valid_set) DataFrames.

    Raises:
        ValueError: For an unknown stratification strategy.
    """
    df = get_csv(dataset_path, name="train")
    img_ids = df["image_id"]

    if stratification == "sklearn":
        # Plain random split, no stratification.
        train_set, valid_set = train_test_split(df[KEYS],
                                                test_size=test_size,
                                                random_state=SEED,
                                                shuffle=True)
    elif stratification == "sklearn_stratified":

        df['subset'] = np.nan
        splitter = StratifiedShuffleSplit(n_splits=1,
                                          test_size=test_size,
                                          random_state=SEED)

        train_indcs, valid_indcs = next(splitter.split(X=img_ids, y=df[KEYS]))
        train_set = df.loc[df.index.intersection(train_indcs)].copy()
        valid_set = df.loc[df.index.intersection(valid_indcs)].copy()

        # Side effect: persist the subset assignment next to the dataset.
        df.iloc[train_indcs, -1] = 'train'
        df.iloc[valid_indcs, -1] = 'valid'

        df.to_csv(os.path.join(dataset_path, 'train_stratified.csv'),
                  index=None)

    elif stratification == "iterstrat":

        splitter = MultilabelStratifiedShuffleSplit(n_splits=1,
                                                    test_size=test_size,
                                                    random_state=SEED)

        train_indcs, valid_indcs = next(splitter.split(X=img_ids, y=df[KEYS]))
        train_set = df.loc[df.index.intersection(train_indcs)].copy()
        valid_set = df.loc[df.index.intersection(valid_indcs)].copy()

    elif stratification == "skmultilearn":

        # Two folds sized [test_size, 1 - test_size]; first pair is used.
        splitter = IterativeStratification(
            n_splits=2,
            order=2,
            sample_distribution_per_fold=[test_size, 1.0 - test_size])

        train_indcs, valid_indcs = next(splitter.split(X=img_ids, y=df[KEYS]))
        train_set = df.loc[df.index.intersection(train_indcs)].copy()
        valid_set = df.loc[df.index.intersection(valid_indcs)].copy()

    else:
        raise ValueError("Try something else :)")

    return train_set, valid_set
def crossfold(n_rounds, n_splits, classifier, x, y):
    """Repeated multilabel-stratified cross-validation of a classifier.

    Args:
        n_rounds: Number of times the whole CV procedure is repeated.
        n_splits: Number of folds per round (previously hard-coded to 5).
        classifier: Identifier forwarded to get_model().
        x: Feature DataFrame.
        y: Label DataFrame.

    Returns:
        List of per-fold results from performance_evaluation().
    """
    perf = []
    x_columns = x.columns
    y_columns = y.columns
    for i in range(n_rounds):
        print("Round: ", i)

        # Bug fix: honor the n_splits argument instead of a hard-coded 5.
        folds = IterativeStratification(n_splits=n_splits, order=1)

        for train_index, test_index in folds.split(x, y):
            # Convert once per fold without rebinding the outer x/y, so
            # later rounds still stratify on the original DataFrames.
            x_arr = np.array(x)
            y_arr = np.array(y)
            x_train, x_test = x_arr[train_index], x_arr[test_index]
            y_train, y_test = y_arr[train_index], y_arr[test_index]

            x_train = pd.DataFrame(x_train, columns=x_columns)
            x_test = pd.DataFrame(x_test, columns=x_columns)
            y_train = pd.DataFrame(y_train, columns=y_columns)
            y_test = pd.DataFrame(y_test, columns=y_columns)

            # NOTE(review): `standardize` is read from enclosing/global
            # scope — it is not defined in this function; confirm it exists.
            if standardize == 1:
                scaler = StandardScaler()
                scaler.fit(x_train)
                x_train = scaler.transform(x_train)
                x_test = scaler.transform(x_test)
            print("Modelling")
            model = get_model(classifier, x_train, y_train)
            print("Prediction")
            predictions = get_predictions(model, x_test)
            fold_perf = performance_evaluation(y_test, predictions)
            perf.append(fold_perf)
    return perf
Example #30
0
def kfold_split(data, num_folds, seed, labels=None, label_info=None):
    """Do K-fold cross-validation"""
    num_data = len(data)
    msg = f"Splitting {num_data} input data to {num_folds} folds"
    if labels is None:
        # Unlabelled data: a plain shuffled K-fold.
        info(msg)
        folder = KFold(num_folds, shuffle=True, random_state=seed)
        return list(folder.split(data))
    num_labels = len(label_info.label_names)
    if label_info.multilabel:
        # Multi-label: iterative stratification on one-hot labels.
        info(msg + " using iterative stratification.")
        splitter = IterativeStratification(num_folds, order=1)
        oh_labels = one_hot(labels, num_labels, is_multilabel=True)
        return list(splitter.split(np.zeros(len(labels)), oh_labels))
    try:
        # Single-label: standard stratified K-fold.
        info(msg + " using stratification.")
        folder = StratifiedKFold(num_folds, shuffle=True, random_state=seed)
        return list(folder.split(data, labels))
    except ValueError as ve:
        # Best effort: log and fall through (implicitly returns None).
        error(f"Unable to complete a stratified fold split: {ve}")
    def test_if_negative_evidence_is_distributed(self):
        """Rows left unused after positive-evidence distribution (the
        all-zero row) must be assigned by _distribute_negative_evidence."""
        stratifier = IterativeStratification(n_splits=2, order=1)
        y = np.matrix([[0, 0], [1, 0], [0, 1], [1, 1]])

        rows, rows_used, all_combinations, per_row_combinations, samples_with_combination, folds = \
            stratifier._prepare_stratification(y)

        stratifier._distribute_positive_evidence(rows_used, folds,
                                                 samples_with_combination,
                                                 per_row_combinations)
        # Row 0 has no positive labels, so it is still unused here.
        self.assertFalse(rows_used[0])

        stratifier._distribute_negative_evidence(rows_used, folds)
        self.assertTrue(rows_used[0])
Example #32
0
def evaluate_kfold_label_classification(embedding, labels, k=10):
    """Mean ROC/F1/precision/recall over k stratified folds of an SVC probe."""
    assert len(labels.shape) == 2

    # model = LogisticRegressionCV(
    #     max_iter=1000,
    #     n_jobs=-1)
    model = SVC(probability=True)

    if labels.shape[1] == 1:
        print("single label clasification")
        labels = labels.flatten()
        sss = StratifiedKFold(n_splits=k, shuffle=True, random_state=0)
    else:
        print("multi-label classification")
        sss = IterativeStratification(n_splits=k, order=1)
        model = OneVsRestClassifier(model, )

    rocs = np.zeros(k)
    f1s = np.zeros(k)
    precisions = np.zeros(k)
    recalls = np.zeros(k)

    for fold, (train_idx, test_idx) in enumerate(sss.split(embedding, labels)):
        print("Fold", fold + 1, "fitting model")
        model.fit(embedding[train_idx], labels[train_idx])
        probs = model.predict_proba(embedding[test_idx])

        (rocs[fold], f1s[fold], precisions[fold],
         recalls[fold]) = compute_measures(labels[test_idx], probs)

        print("Completed {}/{} folds".format(fold + 1, k))

    return (np.mean(rocs), np.mean(f1s),
            np.mean(precisions), np.mean(recalls))
    def test_if_negative_evidence_is_distributed(self):
        """Rows left unused after positive-evidence distribution (the
        all-zero row) must be assigned by _distribute_negative_evidence."""
        stratifier = IterativeStratification(n_splits=2, order=1)
        y = np.matrix([[0, 0], [1, 0], [0, 1], [1, 1]])

        rows, rows_used, all_combinations, per_row_combinations, samples_with_combination, folds = \
            stratifier._prepare_stratification(y)

        stratifier._distribute_positive_evidence(rows_used, folds, samples_with_combination, per_row_combinations)
        # Row 0 has no positive labels, so it is still unused here.
        self.assertFalse(rows_used[0])

        stratifier._distribute_negative_evidence(rows_used, folds)
        self.assertTrue(rows_used[0])
 def test_if_stratification_works(self):
     """split() on a 4-sample, 2-label matrix must yield exactly 2 folds."""
     stratifier = IterativeStratification(n_splits=2, order=1)
     X = np.matrix([[0], [1], [2], [3]])
     y = np.matrix([[0, 0], [1, 0], [0, 1], [1, 1]])
     self.assertEqual(len(list(stratifier.split(X, y))), 2)