def stratified_kfold_cross_validation(X, y, n_splits=5):
    """Split dataset into stratified cross validation folds.

    Instances are grouped by class label with ``myutils.group_by`` and then
    dealt round-robin across the folds, one class group after another, so each
    fold receives a near-equal share of every class.

    Args:
        X(list of list of obj): The list of instances (samples).
            The shape of X is (n_samples, n_features)
        y(list of obj): The target y values (parallel to X).
            The shape of y is n_samples
        n_splits(int): Number of folds.

    Returns:
        X_train_folds(list of list of int): The list of training set indices
            for each fold (ascending order).
        X_test_folds(list of list of int): The list of testing set indices
            for each fold.

    Raises:
        ValueError: If a grouped instance cannot be matched to an unused row
            of X.

    Notes:
        Loosely based on sklearn's StratifiedKFold split():
        https://scikit-learn.org/stable/modules/generated/sklearn.model_selection.StratifiedKFold.html#sklearn.model_selection.StratifiedKFold

        Grouped rows are mapped back to positions in X by equality. Each
        position is handed out at most once, so duplicate rows get distinct
        indices instead of all resolving to the first occurrence.
    """
    X_test_folds = [[] for _ in range(n_splits)]
    # Assumes group_by returns (group names, per-group lists of rows of X).
    _, group_subtables = myutils.group_by(X, y)

    claimed = set()  # positions of X already assigned to a fold
    dealt = 0
    for subtable in group_subtables:
        for row in subtable:
            index = next(
                (i for i, candidate in enumerate(X)
                 if i not in claimed and candidate == row),
                None,
            )
            if index is None:
                raise ValueError(f"{row!r} is not in X")
            claimed.add(index)
            X_test_folds[dealt % n_splits].append(index)
            dealt += 1

    X_train_folds = []
    for test_fold in X_test_folds:
        held_out = set(test_fold)  # O(1) membership instead of list scans
        X_train_folds.append(
            [index for index in range(len(X)) if index not in held_out])
    return X_train_folds, X_test_folds
def stratified_kfold_cross_validation(X, y, n_splits=5, random_state=None, shuffle=False):
    """Split dataset into stratified cross validation folds.

    The groups returned by ``myutils.group_by(X, y)`` are dealt round-robin
    across the folds so that every fold receives a near-equal share of each
    group.

    Args:
        X(list of list of obj): The list of instances (samples).
            The shape of X is (n_samples, n_features)
        y(list of obj): The target y values (parallel to X).
            The shape of y is n_samples
        n_splits(int): Number of folds.
        random_state(int or None): Seed for the shuffle. Only used when
            shuffle is True; None seeds from system entropy.
        shuffle(bool): Whether to shuffle the members of each group before
            dealing them into folds. With the default (False) the result is
            fully determined by the order group_by returns.

    Returns:
        X_train_folds(list of list of int): The list of training set indices
            for each fold.
        X_test_folds(list of list of int): The list of testing set indices
            for each fold.

    Notes:
        Loosely based on sklearn's StratifiedKFold split():
        https://scikit-learn.org/stable/modules/generated/sklearn.model_selection.StratifiedKFold.html#sklearn.model_selection.StratifiedKFold

        The values placed in the folds are whatever elements group_by's
        groups contain (presumably sample indices -- confirm against
        myutils.group_by).
    """
    import random

    # A private generator keeps the global random module state untouched.
    rng = random.Random(random_state) if shuffle else None

    total_folds = [[] for _ in range(n_splits)]
    curr = 0
    for group in myutils.group_by(X, y):
        members = list(group)  # copy so the helper's data is never reordered
        if shuffle:
            rng.shuffle(members)
        for member in members:
            # The pointer advances before the append, so dealing starts at
            # fold 1 and wraps around to fold 0.
            curr = (curr + 1) % n_splits
            total_folds[curr].append(member)

    # Fold j is the test set of split j; every other fold forms its train set.
    X_test_folds = list(total_folds)
    X_train_folds = []
    for held_out in range(n_splits):
        train = []
        for position, fold in enumerate(total_folds):
            if position != held_out:
                train.extend(fold)
        X_train_folds.append(train)
    return X_train_folds, X_test_folds
def fit(self, X_train, y_train):
    """Fits a Naive Bayes classifier to X_train and y_train.

    Args:
        X_train(list of list of obj): The list of training instances (samples).
            The shape of X_train is (n_train_samples, n_features)
        y_train(list of obj): The target y values (parallel to X_train)
            The shape of y_train is n_train_samples

    Notes:
        Since Naive Bayes is an eager learning algorithm, the priors and
        posteriors are computed here and stored on the instance:

        * ``self.classes``: distinct labels in order of first appearance in
          y_train.
        * ``self.priors``: list parallel to ``self.classes`` holding
          count(label) / n_train_samples.
        * ``self.posteriors``: dict mapping label -> dict mapping
          ``"att<column>=<value>"`` -> fraction of that label's rows having
          that value in that column. Every attribute value seen anywhere in
          X_train has an entry for every label (0.0 when never observed with
          that label).

        All three are rebuilt from scratch on every call, so fitting again
        replaces the previous model instead of appending to it.
    """
    self.X_train = X_train
    self.y_train = y_train
    self.classes = []
    self.priors = []
    self.posteriors = {}

    # Partition the rows by label; dict order preserves first appearance.
    rows_by_class = {}
    for row, label in zip(X_train, y_train):
        rows_by_class.setdefault(label, []).append(row)

    # The column count is taken from the first row (rows assumed rectangular).
    n_features = len(X_train[0]) if len(X_train) > 0 else 0

    # Every "att<j>=<value>" seen in the data starts at zero for each class.
    zero_counts = {}
    for row in X_train:
        for column in range(n_features):
            zero_counts["att" + str(column) + "=" + str(row[column])] = 0

    n_samples = len(y_train)
    for label, rows in rows_by_class.items():
        self.classes.append(label)
        self.priors.append(len(rows) / n_samples)

        counts = dict(zero_counts)
        for row in rows:
            for column in range(n_features):
                counts["att" + str(column) + "=" + str(row[column])] += 1

        class_size = len(rows)
        self.posteriors[label] = {
            key: count / class_size for key, count in counts.items()
        }
def stratified_kfold_cross_validation(X, y, n_splits=5):
    """Split dataset into stratified cross validation folds.

    The groups returned by ``myutils.group_by(X, y)`` are dealt round-robin
    into ``n_splits`` buckets. Bucket j becomes the test fold of split j and
    the remaining buckets, concatenated in order, become its training fold.

    Args:
        X(list of list of obj): The list of instances (samples).
            The shape of X is (n_samples, n_features)
        y(list of obj): The target y values (parallel to X).
            The shape of y is n_samples
        n_splits(int): Number of folds.

    Returns:
        X_train_folds(list of list of int): The list of training set indices
            for each fold.
        X_test_folds(list of list of int): The list of testing set indices
            for each fold.

    Notes:
        Loosely based on sklearn's StratifiedKFold split():
        https://scikit-learn.org/stable/modules/generated/sklearn.model_selection.StratifiedKFold.html#sklearn.model_selection.StratifiedKFold

        The fold contents are the elements of group_by's groups (presumably
        sample indices -- confirm against myutils.group_by).
    """
    buckets = [[] for _ in range(n_splits)]

    # Deal every group member to the next bucket, wrapping after the last.
    slot = 0
    for members in myutils.group_by(X, y):
        for member in members:
            buckets[slot].append(member)
            slot = (slot + 1) % n_splits

    # Each bucket is used directly as one test fold.
    X_test_folds = list(buckets)

    X_train_folds = []
    for held_out in range(n_splits):
        training = []
        for position, bucket in enumerate(buckets):
            if position == held_out:
                continue
            training.extend(bucket)
        X_train_folds.append(training)

    return X_train_folds, X_test_folds
def stratified_kfold_cross_validation(X, y, n_splits=5):
    """Split dataset into stratified cross validation folds.

    The per-label subtables from ``myutils.group_by(X, y)`` are dealt
    round-robin into ``n_splits`` test folds. The training fold for split i
    is the concatenation of every other test fold, in fold order.

    Args:
        X(list of list of obj): The list of instances (samples).
            The shape of X is (n_samples, n_features)
        y(list of obj): The target y values (parallel to X).
            The shape of y is n_samples
        n_splits(int): Number of folds.

    Returns:
        X_train_folds(list of list of int): The list of training set indices
            for each fold.
        X_test_folds(list of list of int): The list of testing set indices
            for each fold.

    Notes:
        Loosely based on sklearn's StratifiedKFold split():
        https://scikit-learn.org/stable/modules/generated/sklearn.model_selection.StratifiedKFold.html#sklearn.model_selection.StratifiedKFold

        NOTE(review): the folds hold the items of group_by's subtables
        unchanged. Whether those are indices or instance rows depends on
        myutils.group_by -- confirm.
    """
    # Partition by y label first.
    _, label_groups = myutils.group_by(X, y)

    X_test_folds = [[] for _ in range(n_splits)]

    # Deal items into the bins, cycling back to bin 0 after the last one.
    target = 0
    for members in label_groups:
        for member in members:
            X_test_folds[target].append(member)
            target = (target + 1) % n_splits

    # A split trains on every bin except its own.
    X_train_folds = []
    for held_out in range(n_splits):
        combined = []
        for position, fold in enumerate(X_test_folds):
            if position != held_out:
                combined.extend(fold)
        X_train_folds.append(combined)

    return X_train_folds, X_test_folds
def fit(self, X_train, y_train):
    """Fits a Naive Bayes classifier to X_train and y_train.

    Args:
        X_train(list of list of obj): The list of training instances (samples).
            The shape of X_train is (n_train_samples, n_features)
        y_train(list of obj): The target y values (parallel to X_train)
            The shape of y_train is n_train_samples

    Notes:
        Since Naive Bayes is an eager learning algorithm, the priors and
        posteriors are computed here and stored on the instance:

        * ``self.priors``: ``[class_names, prior_0, prior_1, ...]`` -- the
          list of class names followed by one prior per class subtable, in
          the order returned by myutils.group_by.
        * ``self.posteriors``: a 2D table. Row 0 is
          ``["label", class_0, class_1, ...]``; every later row is
          ``[str(attribute value), P(value | class_0), ...]``. Value rows are
          listed attribute by attribute, last attribute first.

        X_train is not modified: labels are appended to fresh copies of the
        rows.
    """
    self.X_train = X_train
    self.y_train = y_train
    self.priors = []
    self.posteriors = []

    # Build new rows (instance values + class label as the last column)
    # instead of appending to the caller's row lists.
    labeled_rows = [list(row) + [y_train[i]] for i, row in enumerate(X_train)]

    # Columns are named "1".."n"; the last one ("n") is the class label.
    n_columns = len(labeled_rows[0])
    header = [str(i + 1) for i in range(n_columns)]

    classifier_names, classifier_subtables = myutils.group_by(
        labeled_rows, header, str(n_columns))
    self.priors.append(classifier_names)
    for subtable in classifier_subtables:
        self.priors.append(len(subtable) / len(labeled_rows))

    # Collect the distinct values of each attribute column, walking the
    # attribute columns from last ("n-1") to first ("1").
    posteriors_header = []
    for i in range(n_columns - 1, 0, -1):
        temp_names, _ = myutils.group_by(labeled_rows, header, str(i))
        for name in temp_names:
            posteriors_header.append(name)

    # Zero-filled table: one row per attribute value plus the header row,
    # one column per class plus the leading label column.
    for _ in range(len(posteriors_header) + 1):
        self.posteriors.append([0] * (len(classifier_names) + 1))

    self.posteriors[0][0] = "label"
    for i in range(len(self.posteriors[0]) - 1):
        self.posteriors[0][i + 1] = classifier_names[i]
    for i in range(1, len(self.posteriors)):
        self.posteriors[i][0] = str(posteriors_header[i - 1])

    for k in range(len(classifier_subtables)):
        # Fresh copy of the label column for each class (presumably
        # get_column returns column 0 of the table built above, one entry
        # per table row -- confirm against myutils.get_column).
        header_col = myutils.get_column(self.posteriors, self.posteriors[0],
                                        self.posteriors[0][0])
        for i in range(len(header) - 1):
            col = myutils.get_column(classifier_subtables[k], header,
                                     header[i])
            values, counts = myutils.get_frequencies(col)
            for j in range(len(counts)):
                row_index = header_col.index(str(values[j]))
                # Blank out the matched entry so an identical value string
                # belonging to another attribute resolves to a later row.
                header_col[row_index] = 0
                col_index = self.posteriors[0].index(classifier_names[k])
                self.posteriors[row_index][col_index] = counts[j] / len(
                    classifier_subtables[k])
def stratified_kfold_cross_validation(X, y, n_splits=5):
    """Split dataset into stratified cross validation folds.

    Each sample is represented as an ``[index, class label]`` row, the rows
    are partitioned by class label, and the indices are dealt round-robin
    across ``n_splits`` folds. Every fold serves once as the test set while
    the other folds are concatenated into the matching training set.

    Args:
        X(list of list of obj): The list of instances (samples).
            The shape of X is (n_samples, n_features)
        y(list of obj): The target y values (parallel to X).
            The shape of y is n_samples
        n_splits(int): Number of folds.

    Returns:
        X_train_folds(list of list of int): The list of training set indices
            for each fold.
        X_test_folds(list of list of int): The list of testing set indices
            for each fold.

    Notes:
        Loosely based on sklearn's StratifiedKFold split():
        https://scikit-learn.org/stable/modules/generated/sklearn.model_selection.StratifiedKFold.html#sklearn.model_selection.StratifiedKFold
    """
    column_names = ["index", "class label"]

    # Pair every sample position with its class label.
    indexed_labels = [[position, y[position]] for position, _ in enumerate(X)]
    table = mypytable.MyPyTable(column_names, indexed_labels)

    # Partition the rows by class label.
    _, group_subtables = myutils.group_by(table.data, table.column_names,
                                          "class label")

    # Drop the label column so only the index is left in each row.
    for subtable in group_subtables:
        for row in subtable:
            row.pop()

    # Deal the indices to the folds one at a time, group after group.
    folds = [[] for _ in range(n_splits)]
    dealt = 0
    for subtable in group_subtables:
        for row in subtable:
            folds[dealt % n_splits].append(row[0])
            dealt += 1

    X_train_folds = []
    X_test_folds = []
    for held_out in folds:
        X_test_folds.append(held_out)
        # Train on every fold whose contents differ from the held-out fold.
        training = []
        for other in folds:
            if other == held_out:
                continue
            training.extend(other)
        X_train_folds.append(training)

    return X_train_folds, X_test_folds
def stratified_kfold_cross_validation(X, y, n_splits=5):
    """Split dataset into stratified cross validation folds.

    Sample indices are grouped by class label, interleaved across the classes
    (first member of every class, then second member of every class, ...),
    and the interleaved sequence is cut into ``n_splits`` contiguous folds.
    The first ``len(X) % n_splits`` folds receive one extra sample.

    Args:
        X(list of list of obj): The list of instances (samples).
            The shape of X is (n_samples, n_features)
        y(list of obj): The target y values (parallel to X).
            The shape of y is n_samples
        n_splits(int): Number of folds.

    Returns:
        X_train_folds(list of list of int): The list of training set indices
            for each fold.
        X_test_folds(list of list of int): The list of testing set indices
            for each fold.

    Notes:
        Loosely based on sklearn's StratifiedKFold split():
        https://scikit-learn.org/stable/modules/generated/sklearn.model_selection.StratifiedKFold.html#sklearn.model_selection.StratifiedKFold

        Row indices (not the rows themselves) are carried through the
        grouping, so duplicate instances keep distinct indices. The
        interleaving runs to the size of the largest class, so no sample is
        dropped when the first class is not the largest.
    """
    # One [index, label] row per sample; grouping these keeps each sample's
    # position in X without having to search X for it afterwards.
    header = ["index", "classifier"]
    table = [[index, y[index]] for index in range(len(X))]
    _, classifier_tables = myutils.group_by(table, header, "classifier")

    # Interleave the classes so any contiguous slice stays roughly stratified.
    longest = max((len(rows) for rows in classifier_tables), default=0)
    indices = []
    for position in range(longest):
        for class_rows in classifier_tables:
            if position < len(class_rows):
                indices.append(class_rows[position][0])

    # The first `n_larger` folds hold one more sample than the rest.
    base_size, n_larger = divmod(len(X), n_splits)
    fold_indices = []
    start = 0
    for fold_number in range(n_splits):
        size = base_size + 1 if fold_number < n_larger else base_size
        fold_indices.append(indices[start:start + size])
        start += size

    X_test_folds = list(fold_indices)
    X_train_folds = []
    for held_out in range(n_splits):
        training = []
        for position, fold in enumerate(fold_indices):
            if position != held_out:
                training.extend(fold)
        X_train_folds.append(training)

    return X_train_folds, X_test_folds