def test_naive_bayes_classifier_predict():
    # in-class example dataset
    train = [[1, 5], [2, 6], [1, 5], [1, 5], [1, 6], [2, 6], [1, 5], [1, 6]]
    y = ["yes", "yes", "no", "no", "yes", "no", "yes", "yes"]
    nb = MyNaiveBayesClassifier()
    nb.fit(train, y)
    pred = nb.predict([[1, 5]])
    assert pred == ["yes"]

    # RQ5 (fake) iPhone purchases dataset
    iphone_col_names = ["standing", "job_status", "credit_rating", "buys_iphone"]
    iphone_table = [
        [1, 3, "fair", "no"],
        [1, 3, "excellent", "no"],
        [2, 3, "fair", "yes"],
        [2, 2, "fair", "yes"],
        [2, 1, "fair", "yes"],
        [2, 1, "excellent", "no"],
        [2, 1, "excellent", "yes"],
        [1, 2, "fair", "no"],
        [1, 1, "fair", "yes"],
        [2, 2, "fair", "yes"],
        [1, 2, "excellent", "yes"],
        [2, 2, "excellent", "yes"],
        [2, 3, "fair", "yes"],
        [2, 2, "excellent", "no"],
        [2, 3, "fair", "yes"]]
    mypy = MyPyTable(iphone_col_names, iphone_table)
    y2 = myutils.get_mypycol(mypy, "buys_iphone")
    nb2 = MyNaiveBayesClassifier()
    nb2.fit(iphone_table, y2)
    pred2 = nb2.predict([[1, 2, "fair"]])
    assert pred2 == ["yes"]

    # Bramer 3.2 train dataset
    train_col_names = ["day", "season", "wind", "rain", "class"]
    train_table = [
        ["weekday", "spring", "none", "none", "on time"],
        ["weekday", "winter", "none", "slight", "on time"],
        ["weekday", "winter", "none", "slight", "on time"],
        ["weekday", "winter", "high", "heavy", "late"],
        ["saturday", "summer", "normal", "none", "on time"],
        ["weekday", "autumn", "normal", "none", "very late"],
        ["holiday", "summer", "high", "slight", "on time"],
        ["sunday", "summer", "normal", "none", "on time"],
        ["weekday", "winter", "high", "heavy", "very late"],
        ["weekday", "summer", "none", "slight", "on time"],
        ["saturday", "spring", "high", "heavy", "cancelled"],
        ["weekday", "summer", "high", "slight", "on time"],
        ["saturday", "winter", "normal", "none", "late"],
        ["weekday", "summer", "high", "none", "on time"],
        ["weekday", "winter", "normal", "heavy", "very late"],
        ["saturday", "autumn", "high", "slight", "on time"],
        ["weekday", "autumn", "none", "heavy", "on time"],
        ["holiday", "spring", "normal", "slight", "on time"],
        ["weekday", "spring", "normal", "none", "on time"],
        ["weekday", "spring", "normal", "slight", "on time"]]
    mypy2 = MyPyTable(train_col_names, train_table)
    y3 = myutils.get_mypycol(mypy2, "class")
    nb3 = MyNaiveBayesClassifier()
    nb3.fit(train_table, y3)
    pred3 = nb3.predict([["weekday", "winter", "high", "heavy"]])
    assert pred3 == ["cancelled"]
def confusionCategorical(yTrue, yTest, header, categories):
    """Builds a confusion matrix as a MyPyTable.

    yTrue holds the actual labels and yTest the predicted labels. header is
    the label column name, then one column per category, then "Total" and
    "Recognition (%)"; categories lists the class labels in column order.
    """
    table = MyPyTable()
    table.column_names = header
    table.data = []
    # start each row as [label, 0, 0, ..., 0]
    for val in categories:
        newRow = [val]
        for i in range(len(header) - 1):
            newRow.append(0)
        table.data.append(newRow)
    # tally each (actual, predicted) pair
    for i in range(len(yTrue)):
        rowIndex = categories.index(yTrue[i])
        colIndex = header.index(yTest[i])
        table.data[rowIndex][colIndex] += 1
    # fill in the Total column
    for row in table.data:
        total = 0
        for i in range(1, len(categories) + 1):
            total += row[i]
        row[len(categories) + 1] = total
    # recognition % = correct predictions / row total
    for i in range(len(table.data)):
        if table.data[i][len(categories) + 1] != 0:
            recognition = table.data[i][i + 1] / table.data[i][len(categories) + 1]
            table.data[i][len(header) - 1] = round(100 * recognition, 2)
    return table
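# A usage sketch for confusionCategorical, assuming the header layout the
# indexing above implies (label column, one column per category, then
# "Total" and "Recognition (%)"); the labels below are made up for the example.
def _demo_confusion_categorical():
    y_true = ["yes", "no", "yes", "yes"]
    y_pred = ["yes", "no", "no", "yes"]
    header = ["Class", "yes", "no", "Total", "Recognition (%)"]
    matrix = confusionCategorical(y_true, y_pred, header, ["yes", "no"])
    # expected rows: ["yes", 2, 1, 3, 66.67] and ["no", 0, 1, 1, 100.0]
    print(matrix.data)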
def scatter_plot(table, x_column_name, y_column_name):
    """Creates a scatter plot with a best-fit line for the given data.

    Args:
        table(MyPyTable): table to pull the columns from
        x_column_name(string): name of the column plotted on the x axis
        y_column_name(string): name of the column plotted on the y axis

    Returns:
        coeficient(float): correlation coefficient
        cov(float): covariance
    """
    y_col = table.get_column(y_column_name, False)
    x_col = table.get_column(x_column_name, False)
    coeficient = utils.correlation_coeficient(x_col, y_col)
    cov = utils.covariance(x_col, y_col)
    m, b = utils.compute_slope_intercept(x_col, y_col)
    plt.scatter(x_col, y_col)
    # draw the best-fit line across the range of x values
    plt.plot([min(x_col), max(x_col)],
             [m * min(x_col) + b, m * max(x_col) + b],
             c="r", label="corr: " + str(coeficient) + ", cov: " + str(cov))
    plt.legend()
    plt.show()
    return coeficient, cov
def get_sea_frequencies(table, col_name):
    """Drops the rows where it did not rain, then counts how many of the
    remaining rows fall in each month.

    Args:
        table(MyPyTable): table to pull the columns from
        col_name(str): name of the rain indicator column

    Returns:
        months, counts (list of str, list of int): month abbreviations and
            the number of rainy-day rows in each month"""
    rain_col = table.get_column(col_name)
    row_index_to_drop = []
    for i in range(len(rain_col)):
        if rain_col[i] == "FALSE":
            row_index_to_drop.append(i)
    rows_to_drop = []
    for i in range(len(table.data)):
        if i in row_index_to_drop:
            rows_to_drop.append(table.data[i])
    table.drop_rows(rows_to_drop)
    # count the remaining rows per month by matching the abbreviation
    # against the DATE string
    months = ['Jan', 'Feb', 'Mar', 'Apr', 'May', 'Jun',
              'Jul', 'Aug', 'Sep', 'Oct', 'Nov', 'Dec']
    month_col = table.get_column('DATE')
    counts = []
    for month in months:
        yes = 0
        for i in range(len(month_col)):
            if month in month_col[i]:
                yes += 1
        counts.append(yes)
    return months, counts
def get_ratings_genre(table, genre, rating):
    """Gets the list of ratings attached to a given genre.

    Args:
        table(MyPyTable): MyPyTable instance to search
        genre(string): genre to search for in the table
        rating(string): name of the rating column to pull from

    Returns:
        ratings(list): ratings from each row that matches the genre"""
    genre_col = table.get_column('Genres', True)
    col = table.get_column(rating, True)
    ratings = []
    for i in range(len(genre_col)):
        if genre in genre_col[i]:
            # Rotten Tomatoes scores come in as strings like "85%"
            if rating == 'Rotten Tomatoes' and '%' in col[i]:
                col[i] = float(col[i].strip('%'))
            ratings.append(col[i])
    # drop empty entries
    ratings = [value for value in ratings if value != '']
    return ratings
def test_My_Random_Forest_Classifier_predict():
    # Tests with N = 3, M = 2, F = 2 and seed = 1
    rand_forest_test = MyRandomForestClassifier(3, 2, 2, 1)
    table = MyPyTable()
    table.data = interview_table
    table.column_names = interview_header

    # split the interview data into parallel X and y lists
    y_train, X_train = [], []
    for inst in interview_table:
        y_train.append(inst[-1])
        X_train.append(inst[:-1])

    X_test = [["Junior", "Java", "yes", "no"],
              ["Junior", "Java", "yes", "yes"]]

    # Tests on the Interview Dataset
    rand_forest_test.header = interview_header[:-1]
    rand_forest_test.fit(X_train, y_train)
    y_predicted = rand_forest_test.predict(X_test)
    print("y_predicted:", y_predicted)
    assert y_predicted == ['True', 'False']
def get_aus_frequencies(table, col_name):
    """Drops the rows where it did not rain, then returns the resulting table.

    Args:
        table(MyPyTable): table to pull the columns from
        col_name(str): name of the rain indicator column

    Returns:
        table(MyPyTable): table with the non-rainy rows removed"""
    rain_col = table.get_column(col_name)
    row_index_to_drop = []
    for i in range(len(rain_col)):
        if rain_col[i] == "No":
            row_index_to_drop.append(i)
    rows_to_drop = []
    for i in range(len(table.data)):
        if i in row_index_to_drop:
            rows_to_drop.append(table.data[i])
    table = table.drop_rows(rows_to_drop)
    table.pretty_print()
    return table
def test_random_forest_fit():
    interview_header = ["level", "lang", "tweets", "phd", "interviewed_well"]
    interview_table = [
        ["Senior", "Java", "no", "no", "False"],
        ["Senior", "Java", "no", "yes", "False"],
        ["Mid", "Python", "no", "no", "True"],
        ["Junior", "Python", "no", "no", "True"],
        ["Junior", "R", "yes", "no", "True"],
        ["Junior", "R", "yes", "yes", "False"],
        ["Mid", "R", "yes", "yes", "True"],
        ["Senior", "Python", "no", "no", "False"],
        ["Senior", "R", "yes", "no", "True"],
        ["Junior", "Python", "yes", "no", "True"],
        ["Senior", "Python", "yes", "yes", "True"],
        ["Mid", "Python", "no", "yes", "True"],
        ["Mid", "Java", "yes", "no", "True"],
        ["Junior", "Python", "no", "yes", "False"]]
    myutils.prepend_attribute_label(interview_table, interview_header)
    interview_pytable = MyPyTable(column_names=interview_header, data=interview_table)
    y_col = interview_pytable.get_column("interviewed_well", False)
    x_cols = interview_pytable.drop_col("interviewed_well")

    many_trees = MyRandomForestClassifier()
    X_sample, y_sample = myutils.compute_bootstrapped_sample(x_cols, y_col)
    X_train, X_test, y_train, y_test = myutils.train_test_split(X_sample, y_sample, .33)
    many_trees.fit(X_train, y_train, X_test, y_test)
    y_predicted = many_trees.predict(X_test)

    # tally correct vs. incorrect predictions
    numCorrectPredictions = 0
    numWrongPredictions = 0
    for i in range(len(y_test)):
        if y_predicted[i] == y_test[i]:
            numCorrectPredictions += 1
        else:
            numWrongPredictions += 1
    accuracy = np.round(numCorrectPredictions / (numCorrectPredictions + numWrongPredictions), 3)
    error_rate = np.round(numWrongPredictions / (numCorrectPredictions + numWrongPredictions), 3)

    print("-----------------------------------------------------------")
    print("Accuracy and Error Rate")
    print("-----------------------------------------------------------")
    print()
    print("Random Forest: accuracy = {}, error rate = {}".format(accuracy, error_rate))
    print()
    print("Because of the random aspect of this classifier, this will not always pass the tests")
    print()
    print("Predicted table: " + str(y_predicted))
    print("Testing set: " + str(y_test))

    for i in range(len(y_test)):
        assert y_predicted[i] == y_test[i]
def bagging(X, Y, N, M, F):
    # 1. split the dataset into a test set and a "remainder set"
    x_remainder, x_test, y_r, y_test = myevaluation.train_test_split(X, Y)

    # 2. using the remainder set, sample N bootstrap samples and use each one
    #    to build a classifier; for each sample, ~63% of the remainder set is
    #    drawn into the training set and the ~37% left over becomes that
    #    tree's validation set
    forest = []
    accuracies = {}  # {tree index (as str): validation accuracy}
    for i in range(N):
        x_train, y_train = compute_bootstrapped_sample(x_remainder, y_r)
        tree = my_class.MyDecisionTreeClassifier()
        tree.fit(x_train, y_train, True, F)
        # the rows not drawn into x_train form this tree's validation set
        x_v = []
        y_v = []
        for j in range(len(x_remainder)):
            if x_remainder[j] not in x_train:
                x_v.append(x_remainder[j])
                y_v.append(y_r[j])
        pred = tree.predict(x_v)
        accuracies[str(i)] = get_accuracy(y_v, pred)
        forest.append(tree)

    # 3. measure the performance of each tree on its validation set and keep
    #    the best M of the N trees
    best_trees_dict = best_M(M, accuracies)
    best_trees = []
    for key in best_trees_dict:
        best_trees.append(forest[int(key)])

    # 4. using majority voting, make predictions from the M learners for each
    #    instance in the test set
    all_predictions = []  # [[predictions from tree 1], [predictions from tree 2], ...]
    for tree in best_trees:
        all_predictions.append(tree.predict(x_test))
    # think of this like flipping the table: each column holds every tree's
    # prediction for one test instance, and the majority value wins
    pred_header = build_header(all_predictions)
    pred_mypy = MyPyTable(pred_header, all_predictions)
    voted_predictions = []
    for i in range(len(all_predictions[0])):
        pred_col = pred_mypy.get_column(i)
        vals, counts = get_freq_str(pred_col)
        j = counts.index(max(counts))
        voted_predictions.append(vals[j])

    forest_accuracy = get_accuracy(y_test, voted_predictions)
    return best_trees, voted_predictions, forest_accuracy
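# bagging() leans on several helpers that are not defined in this module.
# The versions below are minimal sketches with signatures inferred from the
# calls above; the project's own implementations may differ.
import random

def compute_bootstrapped_sample(X, y):
    # sample len(X) rows with replacement, keeping X and y parallel
    indexes = [random.randrange(len(X)) for _ in range(len(X))]
    return [X[i] for i in indexes], [y[i] for i in indexes]

def get_accuracy(y_true, y_pred):
    # fraction of predictions that match the true labels
    matches = sum(1 for t, p in zip(y_true, y_pred) if t == p)
    return matches / len(y_true)

def best_M(M, accuracies):
    # keep the M tree indexes (dict keys) with the highest validation accuracy
    ranked = sorted(accuracies, key=accuracies.get, reverse=True)
    return {key: accuracies[key] for key in ranked[:M]}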
def test_random_forest_classifier_fit():
    mp_table = MyPyTable(interview_header, interview_table)
    # Formulate X_train and y_train
    y_train = mp_table.get_column('interviewed_well')
    X_train_col_names = ["level", "lang", "tweets", "phd"]
    X_train = mp_table.get_rows(X_train_col_names)

    myRF = MyRandomForestClassifier(N=4, M=2, F=4)
    myRF.fit(X_train, y_train)
    assert len(myRF.M_attr_sets) == myRF.M
def unique_genres(table):
    """Gets the list of unique genres within a table.

    Args:
        table(MyPyTable): MyPyTable instance to search

    Returns:
        values(list): unique genre names"""
    vals, counts = get_frequencies(table, 'Genres')
    # each entry in vals can hold several comma-separated genres,
    # so join them all and split the result back apart
    genre_str = ''
    for v in vals:
        genre_str = genre_str + v + ','
    genre_array = genre_str.split(',')
    values = []
    for value in genre_array:
        if value != '' and value not in values:
            # haven't seen this value before
            values.append(value)
    return values
def test_My_Random_Forest_Classifier_fit():
    # Tests with N = 3, M = 2, F = 2 and seed = 0
    rand_forest_test = MyRandomForestClassifier(3, 2, 2, 0)
    table = MyPyTable()
    table.data = interview_table
    table.column_names = interview_header

    X_train = interview_table
    y_train = table.get_column("interviewed_well")

    # Tests on the Interview Dataset
    rand_forest_test.header = interview_header
    rand_forest_test.fit(X_train, y_train)
    trees = rand_forest_test.trees
    # the original test stopped short of asserting anything; assuming the
    # forest keeps the M = 2 best trees after fit
    assert len(trees) == 2
def table_setUp(file_name):
    """Loads a MyPyTable from a file in the input_data directory.

    Args:
        file_name(str): name of the file to load

    Returns:
        table(MyPyTable): table populated with the file's data
    """
    file_path = os.path.join("input_data", file_name)
    # Inputs data from file into the table
    table = MyPyTable().load_from_file(file_path)
    return table
def random_forest_predict(X_test, trees):
    # using majority voting, make predictions from the M learners for each
    # instance in the test set
    all_predictions = []  # [[predictions from tree 1], [predictions from tree 2], ...]
    for tree in trees:
        all_predictions.append(tree.predict(X_test))
    # think of this like flipping the table: each column holds every tree's
    # prediction for one test instance, and the majority value wins
    pred_header = build_header(all_predictions)
    pred_mypy = MyPyTable(pred_header, all_predictions)
    voted_predictions = []
    for i in range(len(all_predictions[0])):
        pred_col = pred_mypy.get_column(i)
        vals, counts = get_freq_str(pred_col)
        j = counts.index(max(counts))
        voted_predictions.append(vals[j])
    return voted_predictions
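# build_header is referenced above but never defined in this module; a
# plausible minimal version (an assumption, not the project's actual helper)
# names one column per test instance, since each row of all_predictions is
# one tree's full prediction list:
def build_header(all_predictions):
    return ["instance_" + str(i) for i in range(len(all_predictions[0]))]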
def test_random_forest_classifier_predict():
    # X_test rows carry only the four attribute values; the class label
    # ("True" for all three rows) lives in y_test
    X_test = [["Mid", "Python", "no", "no"],
              ["Mid", "R", "yes", "yes"],
              ["Mid", "Python", "no", "yes"]]
    y_test = ["True", "True", "True"]

    mp_table = MyPyTable(interview_header, interview_table)
    # Formulate X_train and y_train
    y_train = mp_table.get_column('interviewed_well')
    X_train_col_names = ["level", "lang", "tweets", "phd"]
    X_train = mp_table.get_rows(X_train_col_names)

    myRF = MyRandomForestClassifier(N=4, M=2, F=4)
    myRF.fit(X_train, y_train)
    predictions = myRF.predict(X_test)
    for i in range(len(predictions)):
        assert predictions[i] == y_test[i]
def get_freq_str(col):
    header = ["y"]
    col_mypy = MyPyTable(header, col)
    dups = col_mypy.ordered_col(header)
    values = []
    counts = []
    for value in dups:
        if value not in values:
            # first time we have seen this value
            values.append(str(value))
            counts.append(1)
        else:
            # we have seen this value before
            counts[-1] += 1  # ok because the list is sorted
    return values, counts
def pie_chart_dataPrep(table, cols_to_plot):
    """Sums each requested column so the totals can feed a pie chart.

    Args:
        table(MyPyTable): table to pull the columns from
        cols_to_plot(list): names of the columns to total

    Returns:
        totals_list(list): one float total per requested column
    """
    totals_list = []
    for col_name in cols_to_plot:
        column_Category = table.get_column(str(col_name))
        total = 0
        for val in column_Category:
            total = total + float(val)
        totals_list.append(total)
    return totals_list
def hist_graph(table, column_name):
    """Creates a histogram with the given data.

    Args:
        table(MyPyTable): table to pull the column from
        column_name(string): name of the column to plot
    """
    col = table.get_column(column_name, False)
    plt.hist(col, bins=10)
    plt.show()
def get_year_counts(table, platform):
    """Gets the years in which a platform's games occur, along with each
    year's frequency.

    Args:
        table(MyPyTable): MyPyTable instance to search
        platform(string): platform to search for in the table

    Returns:
        values, counts (list of string, list of int): years and their frequencies
    """
    plat_col = table.get_column('Platform', True)
    col = table.get_column("Year", True)
    years = []
    for i in range(len(plat_col)):
        if plat_col[i] == platform:
            years.append(col[i])
    # drop the 'N/A' placeholder entries
    years = [value for value in years if value != 'N/A']
    years.sort()
    values = []
    counts = []
    for value in years:
        if value not in values:
            # haven't seen this value before
            values.append(value)
            counts.append(1)
        else:
            index = values.index(value)
            counts[index] += 1
    return values, counts
def convert_attributes(table):
    """Converts IMDb scores to double-digit floats and Rotten Tomatoes
    scores to floats without the % sign.

    Args:
        table(MyPyTable): MyPyTable instance to convert

    Returns:
        imdb_col(list): IMDb scores scaled to double digits
        rotten_col(list): Rotten Tomatoes scores stripped of %"""
    # IMDb conversion: 0-10 scale up to 0-100
    col = table.get_column('IMDb', False)
    rotten_col = table.get_column('Rotten Tomatoes', False)
    imdb_col = []
    for i in col:
        imdb_col.append(i * 10)
    # Rotten Tomatoes conversion: "85%" -> 85.0
    for a, x in enumerate(rotten_col):
        rotten_col[a] = float(x[:-1])
    return imdb_col, rotten_col
def get_mpg_frequencies(table, col_name):
    """Bins a continuous mpg column into the 10 DOE ratings and counts how
    many values fall in each bin.

    Args:
        table(MyPyTable): table to pull the column from
        col_name(str): name of the mpg column

    Returns:
        values, counts (list of int, list of int): DOE ratings and their frequencies"""
    col = table.get_column(col_name)
    values = []
    counts = []
    for value in col:
        # map the mpg value onto the DOE rating scale (1-10)
        if value >= 13 and value < 14:
            values, counts = mpg_val_check(1, values, counts, value)
        elif value == 14:
            values, counts = mpg_val_check(2, values, counts, value)
        elif value > 14 and value <= 16:
            values, counts = mpg_val_check(3, values, counts, value)
        elif value > 16 and value <= 19:
            values, counts = mpg_val_check(4, values, counts, value)
        elif value > 19 and value <= 23:
            values, counts = mpg_val_check(5, values, counts, value)
        elif value > 23 and value <= 26:
            values, counts = mpg_val_check(6, values, counts, value)
        elif value > 26 and value <= 30:
            values, counts = mpg_val_check(7, values, counts, value)
        elif value > 30 and value <= 36:
            values, counts = mpg_val_check(8, values, counts, value)
        elif value > 36 and value <= 44:
            values, counts = mpg_val_check(9, values, counts, value)
        elif value >= 45:
            values, counts = mpg_val_check(10, values, counts, value)
    # re-order the parallel lists so the ratings come back sorted
    pairs = sorted(zip(values, counts))
    values = [pair[0] for pair in pairs]
    counts = [pair[1] for pair in pairs]
    return values, counts
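# mpg_val_check is not defined in this module; a minimal sketch consistent
# with how it is called above (an assumption, not the project's helper):
# record the rating the first time it appears, otherwise bump its count.
def mpg_val_check(rating, values, counts, value):
    if rating not in values:
        values.append(rating)
        counts.append(1)
    else:
        counts[values.index(rating)] += 1
    return values, counts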
def compute_entropy(instances, available_attributes, index):
    """Computes E_new, the weighted entropy of partitioning instances on
    the attribute at the given index."""
    mypy = MyPyTable(available_attributes, instances)
    classes = mypy.get_column(-1)
    attributes = mypy.get_column(index)
    temp = set(attributes)
    # group the attribute values by class label
    __, tables = group_by(attributes, classes)
    totals = []
    sub_entropies = []
    for jj, element in enumerate(temp):
        # get the class counts here
        totals.append(attributes.count(element))
        # parallel array: count of this attribute value within each class
        arr = []
        for table in tables:
            arr.append(table.count(element))
        su = 0
        for kk in arr:
            if kk > 0:
                su -= kk / totals[jj] * math.log2(kk / totals[jj])
        # weight the partition's entropy by its share of the instances
        su *= totals[jj] / len(attributes)
        sub_entropies.append(su)
    return sum(sub_entropies)
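# A quick sanity check of compute_entropy on a toy table (made up for the
# example, and assuming group_by partitions the attribute values by class):
# splitting ["a", "a", "b", "b"] against classes ["yes", "no", "yes", "yes"]
# gives partition "a" entropy 1.0 and partition "b" entropy 0.0, each
# weighted by 2/4, so E_new = 0.5.
def _demo_compute_entropy():
    instances = [["a", "yes"], ["a", "no"], ["b", "yes"], ["b", "yes"]]
    print(compute_entropy(instances, ["att0", "class"], 0))  # expect 0.5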
def percent_hist_graph(table, column_name):
    """Creates a histogram with the given data after stripping the percent
    sign from the column's values.

    Args:
        table(MyPyTable): table to pull the column from
        column_name(string): name of the column to plot
    """
    col = table.get_column(column_name, False)
    for i, x in enumerate(col):
        col[i] = float(x[:-1])  # "85%" -> 85.0
    plt.hist(col, bins=10)
    plt.show()
def combine_two_columns(column_names, col1, col2):
    """Creates a MyPyTable from two columns and their column names.

    Args:
        column_names(list): list of the two string column names
        col1(list): values from the first column
        col2(list): values from the second column

    Returns:
        table(MyPyTable): MyPyTable with the two columns as parallel fields"""
    data = []
    for i in range(len(col1)):
        data.append([col1[i], col2[i]])
    table = MyPyTable(column_names, data)
    return table
def percentages_columns(table, column_names):
    """Gives the percentage of each column's frequency relative to the
    total column length.

    Args:
        table(MyPyTable): MyPyTable instance to search
        column_names(list): list of string column names

    Returns:
        percentages(list): percentages, index-aligned with column_names"""
    counts = get_occurences_given_columns(table, column_names)
    percentages = []
    col = table.get_column(column_names[0], False)
    length = len(col)
    for count in counts:
        percentages.append(round((count / length) * 100, 0))
    return percentages
def get_occurences_given_columns(table, column_names):
    """Gets the number of occurrences (values equal to 1.0) in each of the
    given columns.

    Args:
        table(MyPyTable): MyPyTable instance to search
        column_names(list): list of string column names

    Returns:
        count(list): frequencies, index-aligned with column_names"""
    column = []
    count = [0] * len(column_names)
    for col in column_names:
        column.append(table.get_column(col, False))
    for i in range(len(column)):
        for j in column[i]:
            if j == 1.0:
                count[i] += 1
    return count
def get_frequencies(table, col_name):
    """Gets the values and counts of a column by name.

    Args:
        table(MyPyTable): table to pull the column from
        col_name(str): name of the column

    Returns:
        values, counts (list of string, list of int): each value and its frequency"""
    col = table.get_column(col_name)
    values = []
    counts = []
    for value in col:
        if value not in values:
            # haven't seen this value before
            values.append(value)
            counts.append(1)
        else:
            index = values.index(value)
            counts[index] += 1
    return values, counts
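# A small illustration of get_frequencies (values come back in first-seen
# order); the toy table is made up for the example and assumes get_column
# returns the raw column values.
def _demo_get_frequencies():
    toy = MyPyTable(["Platform"], [["Wii"], ["DS"], ["Wii"]])
    values, counts = get_frequencies(toy, "Platform")
    assert values == ["Wii", "DS"] and counts == [2, 1]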
# Build one row of audio-feature data per playlist track. The original
# fragment begins mid-loop, so the loop header below is a reconstruction
# (playlist_tracks is assumed; sp is a spotipy.Spotify client).
for track in playlist_tracks:
    track_data = []
    popularity = track["track"]["popularity"]
    if popularity == 0:
        # skip any track with 0 popularity, because I'm unsure if this is
        # a default value
        continue
    else:
        # name = track["track"]["name"]  # would make each track_data_obj identifiable
        # sp.audio_features returns a list holding one features dictionary
        features_dict = sp.audio_features(track["track"]["id"])
        # loop through and keep only the attributes we want
        skip_keys = ("type", "id", "uri", "track_href", "analysis_url",
                     "time_signature", "mode", "key", "loudness")
        for key in features_dict[0]:
            if key not in skip_keys:
                val = features_dict[0][key]
                if key != "tempo" and key != "duration_ms":
                    val = myutils.percent_to_rating(val)
                track_data.append(val)
        pop_class = myutils.pop_rating(popularity)
        track_data.append(pop_class)  # popularity will be the y_train
        track_data_objs.append(track_data)

# now we can turn this into an X_train and y_train, or keep it stitched
# together and split the label column off when dealing with the data
print(len(track_data_objs))
header = ['danceability', 'energy', 'speechiness', 'acousticness',
          'instrumentalness', 'liveness', 'valence', 'tempo',
          'duration_ms', 'popularity']
tracks_mypy = MyPyTable(header, track_data_objs)
tracks_mypy.save_to_file("tracks_data.txt")
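# Hypothetical sketches of the two myutils helpers used above; the real
# implementations may bin differently. percent_to_rating maps a 0.0-1.0
# audio feature onto a 1-10 scale and pop_rating bins 0-100 popularity into
# a class label (both names taken from the calls above, bins invented here).
def percent_to_rating(val):
    # 0.0 -> 1, 0.95 -> 10 (clamped so 1.0 stays in range)
    return min(int(val * 10) + 1, 10)

def pop_rating(popularity):
    if popularity <= 25:
        return "low"
    elif popularity <= 50:
        return "medium"
    elif popularity <= 75:
        return "high"
    return "very high"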
import pickle  # standard python library
from mysklearn.mypytable import MyPyTable
from mysklearn.myclassifiers import MyDecisionTreeClassifier, MyNaiveBayesClassifier
import mysklearn.myevaluation as myevaluation
import mysklearn.myutils as myutils
import os

# "pickle" an object (AKA object serialization):
#   save a Python object to a binary file
# "unpickle" an object (AKA object de-serialization):
#   load a Python object from a binary file (back into memory)

# Get data from csv file
table = MyPyTable().load_from_file(
    os.path.join("input_files", "winequality-red.csv"))
y_col = table.get_column("quality", False)
x_cols = table.drop_col("quality")

# Use Naive Bayes to classify
testcase = MyNaiveBayesClassifier()

# stratified_kfold_cross_validation returns fold INDEXES, not instances
X_train, X_test = myevaluation.stratified_kfold_cross_validation(
    x_cols, y_col, n_splits=10)
X_train, X_test, y_train, y_test = myutils.getInstances(
    X_train, X_test, x_cols, y_col)

for i, fold in enumerate(X_train):
    train, test = myutils.normalize_values(X_train[i], X_test[i])
from mysklearn.myclassifiers import MyNaiveBayesClassifier
import os
from mysklearn.mypytable import MyPyTable
import mysklearn.myevaluation as myevaluation
import mysklearn.myutils as myutils
import pickle

fname = os.path.join("input_data", "collisions.csv")
collisions_data = MyPyTable().load_from_file(fname)

weather = collisions_data.get_column('WEATHER')
road_condition = collisions_data.get_column('ROADCOND')
light_condition = collisions_data.get_column('LIGHTCOND')
junction_type = collisions_data.get_column('JUNCTIONTYPE')
severity = collisions_data.get_column('SEVERITYDESC')

X_train = [[weather[i], road_condition[i], light_condition[i],
            junction_type[i], severity[i]] for i in range(len(weather))]
y_train = collisions_data.get_column('COLLISIONTYPE')

# drop instances with an 'Unknown' label; iterate in reverse so deleting
# does not shift the indexes still to be visited
for i in range(len(y_train) - 1, -1, -1):
    if y_train[i] == 'Unknown':
        del y_train[i]
        del X_train[i]

strattrain_folds, strattest_folds = myevaluation.stratified_kfold_cross_validation(
    X_train, y_train, 10)
strat_xtrain, strat_ytrain, strat_xtest, strat_ytest = myutils.get_from_folds(
    X_train, y_train, strattrain_folds, strattest_folds)