示例#1
0
def stratified_kfold_cross_validation(X, y, n_splits=5):
    """Split dataset into stratified cross validation folds.

    The instances are grouped with ``myutils.group_by(X, y)`` and then dealt
    round-robin, group after group, into ``n_splits`` test folds. Each
    training fold is every remaining index in ascending order.

    Args:
        X(list of list of obj): The list of instances (samples).
            The shape of X is (n_samples, n_features)
        y(list of obj): The target y values (parallel to X).
            The shape of y is n_samples
        n_splits(int): Number of folds.

    Returns:
        X_train_folds(list of list of int): The list of training set indices for each fold.
        X_test_folds(list of list of int): The list of testing set indices for each fold.

    Raises:
        ValueError: If a grouped instance cannot be matched to an index of X
            that has not been used yet.

    Notes:
        Loosely based on sklearn's StratifiedKFold split(): https://scikit-learn.org/stable/modules/generated/sklearn.model_selection.StratifiedKFold.html#sklearn.model_selection.StratifiedKFold
    """
    X_test_folds = [[] for _ in range(n_splits)]

    # NOTE(review): the second return value is used as a list of groups whose
    # members are instances taken from X - confirm against myutils.group_by.
    _, group_subtables = myutils.group_by(X, y)

    # Positions of X already assigned to a fold. X.index(item) always returns
    # the FIRST match, so identical instances used to collapse onto a single
    # index (some indices repeated, others never emitted). Claiming each
    # position once keeps the folds a true partition of range(len(X)).
    # NOTE(review): identical instances that carry different labels are
    # matched in position order, not by label.
    claimed = set()
    dealt = 0
    for subtable in group_subtables:
        for item in subtable:
            for position, candidate in enumerate(X):
                if position in claimed:
                    continue
                # Identity first, then equality: same matching rule as list.index().
                if candidate is item or candidate == item:
                    break
            else:
                raise ValueError(f"{item!r} is not in list")
            claimed.add(position)
            X_test_folds[dealt % n_splits].append(position)
            dealt += 1

    X_train_folds = []
    for test_fold in X_test_folds:
        held_out = set(test_fold)  # O(1) membership instead of scanning the list
        X_train_folds.append(
            [position for position in range(len(X)) if position not in held_out])

    return X_train_folds, X_test_folds
示例#2
0
def stratified_kfold_cross_validation(X,
                                      y,
                                      n_splits=5,
                                      random_state=None,
                                      shuffle=False):
    """Split dataset into stratified cross validation folds.

    The groups produced by ``myutils.group_by(X, y)`` are dealt round-robin
    into ``n_splits`` folds. Fold ``j`` is the test set of split ``j`` and the
    other folds, concatenated in fold order, are its training set.

    Args:
        X(list of list of obj): The list of instances (samples).
            The shape of X is (n_samples, n_features)
        y(list of obj): The target y values (parallel to X).
            The shape of y is n_samples
        n_splits(int): Number of folds.
        random_state(int or None): Seed for the shuffle. Only used when
            shuffle is True; None seeds from the system.
        shuffle(bool): When True, the members of every group are shuffled
            before being dealt into folds. When False (default) the order
            returned by group_by is used unchanged.

    Returns:
        X_train_folds(list of list of int): The list of training set indices for each fold.
        X_test_folds(list of list of int): The list of testing set indices for each fold.

    Notes:
        Loosely based on sklearn's StratifiedKFold split(): https://scikit-learn.org/stable/modules/generated/sklearn.model_selection.StratifiedKFold.html#sklearn.model_selection.StratifiedKFold
    """
    import random

    total_folds = [[] for _ in range(n_splits)]

    # NOTE(review): group_by is iterated as a sequence of groups whose members
    # go straight into the folds, so they are presumably sample indices -
    # confirm against myutils.group_by.
    grouped = myutils.group_by(X, y)

    if shuffle:
        # Previously shuffle/random_state were accepted but ignored. Shuffle
        # inside each group only, so class proportions per fold are untouched.
        rng = random.Random(random_state)
        reordered = []
        for group in grouped:
            members = list(group)
            rng.shuffle(members)
            reordered.append(members)
        grouped = reordered

    curr = 0
    for group in grouped:
        for member in group:
            # The pointer advances BEFORE the append, so the first sample
            # lands in fold 1 (fold 0 when n_splits == 1). Kept as-is so
            # existing fold assignments do not change.
            curr = (curr + 1) % n_splits
            total_folds[curr].append(member)

    # Test folds are the fold lists themselves (not copies).
    X_test_folds = list(total_folds)
    X_train_folds = []
    for held_out in range(n_splits):
        train = []
        for position, fold in enumerate(total_folds):
            if position != held_out:
                train.extend(fold)
        X_train_folds.append(train)

    return X_train_folds, X_test_folds
示例#3
0
    def fit(self, X_train, y_train):
        """Fits a Naive Bayes classifier to X_train and y_train.

        Args:
            X_train(list of list of obj): The list of training instances (samples).
                The shape of X_train is (n_train_samples, n_features)
            y_train(list of obj): The target y values (parallel to X_train)
                The shape of y_train is n_train_samples

        Notes:
            After fitting:
                self.classes is the list of distinct labels in order of first
                    appearance in y_train.
                self.priors is parallel to self.classes and holds
                    count(label) / len(y_train).
                self.posteriors maps label -> {"att<j>=<value>": fraction of
                    that label's rows whose attribute j equals value}. Every
                    attribute/value pair seen anywhere in X_train has a key
                    for every label (0 when the pair never occurs with it).
            The fitted state is rebuilt from scratch on every call, so calling
                fit() again does not accumulate priors or keep stale classes.
        """
        self.X_train = X_train
        self.y_train = y_train

        # Rebind instead of appending to whatever a previous fit() left behind.
        self.classes = []
        self.priors = []
        self.posteriors = {}

        # NOTE(review): classNames[i] is used as the label of classTables[i],
        # and classTables[i] as that label's rows - confirm against
        # myutils.group_by.
        classNames, classTables = myutils.group_by(X_train, y_train)

        for label in y_train:
            if label not in self.classes:
                self.classes.append(label)
        for label in self.classes:
            self.priors.append(y_train.count(label) / len(y_train))
            self.posteriors[label] = {}

        # Seed every observed attribute/value key with 0 for every class so
        # pairs that never co-occur with a class still have an entry.
        for row in X_train:
            for jj in range(len(X_train[0])):
                key = "att" + str(jj) + "=" + str(row[jj])
                for label in self.classes:
                    self.posteriors[label][key] = 0

        # Count attribute/value occurrences within each class.
        for ii, label in enumerate(classNames):
            rows = classTables[ii]
            for jj in range(len(rows[0])):
                for row in rows:
                    key = "att" + str(jj) + "=" + str(row[jj])
                    if key in self.posteriors[label]:
                        self.posteriors[label][key] += 1
                    else:
                        # Only reachable if group_by returns a value that is
                        # not present in X_train.
                        print("Not Allowed")

        # Turn counts into fractions of the class size.
        for label, counts in self.posteriors.items():
            class_size = len(classTables[classNames.index(label)])
            for key in counts:
                counts[key] /= class_size
示例#4
0
def stratified_kfold_cross_validation(X, y, n_splits=5):
    """Split dataset into stratified cross validation folds.

    Every group returned by ``myutils.group_by(X, y)`` is dealt round-robin
    into ``n_splits`` buckets, starting at bucket 0 and carrying the position
    over from one group to the next. Bucket ``j`` is the test fold of split
    ``j``; its training fold is the concatenation of all other buckets in
    bucket order.

    Args:
        X(list of list of obj): The list of instances (samples).
            The shape of X is (n_samples, n_features)
        y(list of obj): The target y values (parallel to X).
            The shape of y is n_samples
        n_splits(int): Number of folds.

    Returns:
        X_train_folds(list of list of int): The list of training set indices for each fold.
        X_test_folds(list of list of int): The list of testing set indices for each fold.

    Notes:
        Loosely based on sklearn's StratifiedKFold split(): https://scikit-learn.org/stable/modules/generated/sklearn.model_selection.StratifiedKFold.html#sklearn.model_selection.StratifiedKFold
    """
    buckets = [[] for _ in range(n_splits)]

    # NOTE(review): group members are stored in the folds untouched, so they
    # are presumably sample indices - confirm against myutils.group_by.
    slot = 0
    for members in myutils.group_by(X, y):
        for member in members:
            buckets[slot].append(member)
            slot = (slot + 1) % n_splits

    # The test folds are the bucket lists themselves (not copies).
    X_test_folds = list(buckets)

    X_train_folds = []
    for held_out in range(n_splits):
        training = []
        for position, bucket in enumerate(buckets):
            if position != held_out:
                training.extend(bucket)
        X_train_folds.append(training)

    return X_train_folds, X_test_folds
def stratified_kfold_cross_validation(X, y, n_splits=5):
    """Split dataset into stratified cross validation folds.

    The groups from ``myutils.group_by(X, y)`` are dealt one member at a time
    into ``n_splits`` test folds, wrapping around after the last fold. Each
    training fold is the concatenation of all the other test folds.

    Args:
        X(list of list of obj): The list of instances (samples).
            The shape of X is (n_samples, n_features)
        y(list of obj): The target y values (parallel to X).
            The shape of y is n_samples
        n_splits(int): Number of folds.

    Returns:
        X_train_folds(list of list of int): The list of training set indices for each fold.
        X_test_folds(list of list of int): The list of testing set indices for each fold.

    Notes:
        Loosely based on sklearn's StratifiedKFold split(): https://scikit-learn.org/stable/modules/generated/sklearn.model_selection.StratifiedKFold.html#sklearn.model_selection.StratifiedKFold
        NOTE(review): the folds hold whatever elements group_by puts in its
        groups; confirm those are indices rather than the instances themselves.
    """
    # Group by the y labels first; only the grouped members are needed.
    _, partitions = myutils.group_by(X, y)

    X_test_folds = [[] for _ in range(n_splits)]

    # Deal members into the bins, wrapping back to bin 0 after the last one.
    slot = 0
    for partition in partitions:
        for member in partition:
            X_test_folds[slot].append(member)
            slot += 1
            if slot == n_splits:
                slot = 0

    # A training set is every bin except the one held out for testing.
    X_train_folds = [
        [
            member
            for other, fold in enumerate(X_test_folds)
            if other != held_out
            for member in fold
        ]
        for held_out in range(len(X_test_folds))
    ]

    return X_train_folds, X_test_folds
 def fit(self, X_train, y_train):
     """Fits a Naive Bayes classifier to X_train and y_train.

     Args:
         X_train(list of list of obj): The list of training instances (samples).
             The shape of X_train is (n_train_samples, n_features)
         y_train(list of obj): The target y values (parallel to X_train)
             The shape of y_train is n_train_samples

     Notes:
         After fitting:
             self.priors is [class_names, p_1, p_2, ...] where p_k is
                 len(subtable_k) / n_train_samples.
             self.posteriors is a table (list of rows). Row 0 is
                 ["label", class_1, class_2, ...]; every other row is
                 [str(attribute value), fraction for class_1, ...].
         X_train is not modified: the labels are appended to copies of the
             rows, not to the caller's rows.
     """
     self.X_train = X_train
     self.y_train = y_train
     self.priors = []
     self.posteriors = []

     # Build new rows with the label as the last column. X_train.copy() was
     # only a shallow copy, so appending to its rows used to mutate the
     # caller's data (and a second fit() would append the labels again).
     X_train_copy = []
     for i in range(len(X_train)):
         X_train_copy.append(list(X_train[i]) + [y_train[i]])

     # Columns are named "1".."n"; the last one is the class label column.
     header = []
     for i in range(len(X_train_copy[0])):
         header.append(str(i + 1))

     # NOTE(review): group_by(table, header, column_name) is used as
     # returning (distinct values, matching subtables) - confirm in myutils.
     classifier_names, classifier_subtables = myutils.group_by(
         X_train_copy, header, str(len(X_train_copy[0])))
     self.priors.append(classifier_names)
     for subtable in classifier_subtables:
         self.priors.append(len(subtable) / len(X_train_copy))

     # Collect the distinct values of every attribute column, walking from
     # the last attribute down to column "1". The bound is taken from the
     # header; it used to index X_train_copy with a loop variable left over
     # from the previous loop, which raised IndexError whenever the table
     # had more columns than rows.
     posteriors_header = []
     for i in range(len(header) - 1, 0, -1):
         temp_names, temp_subtables = myutils.group_by(
             X_train_copy, header, str(i))
         for name in temp_names:
             posteriors_header.append(name)

     # Zero-filled table: one header row plus one row per collected value,
     # one label column plus one column per class.
     posteriors_row = []
     for i in range(len(posteriors_header) + 1):
         for j in range(len(classifier_names) + 1):
             posteriors_row.append(0)
         self.posteriors.append(posteriors_row)
         posteriors_row = []
     self.posteriors[0][0] = "label"
     for i in range(len(self.posteriors[0]) - 1):
         self.posteriors[0][i + 1] = classifier_names[i]
     for i in range(1, len(self.posteriors)):
         self.posteriors[i][0] = str(posteriors_header[i - 1])

     for k in range(len(classifier_subtables)):
         # Fresh copy of the "label" column for each class; entries are
         # zeroed below once used so a repeated value string resolves to the
         # next row carrying that label.
         header_col = myutils.get_column(self.posteriors,
                                         self.posteriors[0],
                                         self.posteriors[0][0])
         for i in range(len(header) - 1):
             col = myutils.get_column(classifier_subtables[k], header,
                                      header[i])
             # NOTE(review): used as parallel (values, counts) lists -
             # confirm in myutils.get_frequencies.
             values, counts = myutils.get_frequencies(col)
             for j in range(len(counts)):
                 row_index = header_col.index(str(values[j]))
                 header_col[row_index] = 0
                 col_index = self.posteriors[0].index(classifier_names[k])
                 self.posteriors[row_index][col_index] = counts[j] / len(
                     classifier_subtables[k])
示例#7
0
def stratified_kfold_cross_validation(X, y, n_splits=5):
    """Split dataset into stratified cross validation folds.

    Each sample position is paired with its label, the pairs are grouped by
    label, and the positions are dealt round-robin (group after group) into
    ``n_splits`` folds. Every fold is used once as the test set; its training
    set is the concatenation of all folds that do not compare equal to it.

    Args:
        X(list of list of obj): The list of instances (samples).
            The shape of X is (n_samples, n_features)
        y(list of obj): The target y values (parallel to X).
            The shape of y is n_samples
        n_splits(int): Number of folds.

    Returns:
        X_train_folds(list of list of int): The list of training set indices for each fold.
        X_test_folds(list of list of int): The list of testing set indices for each fold.

    Notes:
        Loosely based on sklearn's StratifiedKFold split(): https://scikit-learn.org/stable/modules/generated/sklearn.model_selection.StratifiedKFold.html#sklearn.model_selection.StratifiedKFold
    """
    column_names = ["index", "class label"]

    # One [position, label] record per sample; the features are not needed.
    labelled_positions = [[position, y[position]] for position, _ in enumerate(X)]

    table = mypytable.MyPyTable(column_names, labelled_positions)

    # NOTE(review): used as (group names, list of row groups) - confirm
    # against myutils.group_by.
    _, partitions = myutils.group_by(table.data, table.column_names,
                                     "class label")

    # Strip the label so every record is reduced to just its position.
    for partition in partitions:
        for record in partition:
            record.pop()

    folds = [[] for _ in range(n_splits)]

    # Deal positions out one at a time; the counter runs across groups.
    dealt = 0
    for partition in partitions:
        for record in partition:
            folds[dealt % n_splits].append(record[0])
            dealt += 1

    X_train_folds = []
    X_test_folds = []
    for held_out in folds:
        X_test_folds.append(held_out)
        # Folds are compared by value, so any fold equal to the held-out one
        # (itself included) is left out of the training indices.
        X_train_folds.append(
            [position for other in folds if other != held_out for position in other])

    return X_train_folds, X_test_folds
def stratified_kfold_cross_validation(X, y, n_splits=5):
    """Split dataset into stratified cross validation folds.

    Sample indices are grouped by label, interleaved across the groups (first
    member of every group, then the second of every group, ...) and the
    resulting sequence is cut into ``n_splits`` consecutive folds. The first
    ``len(X) % n_splits`` folds receive one extra index.

    Args:
        X(list of list of obj): The list of instances (samples).
            The shape of X is (n_samples, n_features)
        y(list of obj): The target y values (parallel to X).
            The shape of y is n_samples
        n_splits(int): Number of folds.

    Returns:
        X_train_folds(list of list of int): The list of training set indices for each fold.
        X_test_folds(list of list of int): The list of testing set indices for each fold.

    Notes:
        Loosely based on sklearn's StratifiedKFold split(): https://scikit-learn.org/stable/modules/generated/sklearn.model_selection.StratifiedKFold.html#sklearn.model_selection.StratifiedKFold
    """
    # Carry the sample INDEX through the grouping instead of the sample
    # itself: looking samples up again with X.index() returned the first
    # match, so duplicate samples all mapped to the same index.
    header = ["index", "classifier"]
    table = [[position, y[position]] for position in range(len(X))]

    # NOTE(review): used as (group names, list of row groups) with rows passed
    # through unchanged - confirm against myutils.group_by.
    _, classifier_tables = myutils.group_by(table, header, "classifier")

    # Interleave up to the LONGEST group. Stopping at the size of the first
    # group silently dropped samples of any larger class, and indexing
    # group 0 failed on empty input.
    longest = max((len(group) for group in classifier_tables), default=0)
    indices = []
    for rank in range(longest):
        for group in classifier_tables:
            if rank < len(group):
                indices.append(group[rank][0])

    # The first `larger_folds` folds take base_size + 1 indices, the rest
    # base_size; slicing simply truncates if the sequence runs out.
    base_size, larger_folds = divmod(len(X), n_splits)
    fold_indices = []
    start = 0
    for fold_number in range(n_splits):
        size = base_size + 1 if fold_number < larger_folds else base_size
        fold_indices.append(indices[start:start + size])
        start += size

    X_test_folds = list(fold_indices)
    X_train_folds = []
    for held_out in range(n_splits):
        training = []
        for position, fold in enumerate(fold_indices):
            if position != held_out:
                training.extend(fold)
        X_train_folds.append(training)

    return X_train_folds, X_test_folds