def dimensionality_KPCA(instruction, dataset, target="", y=""):
    global currLog
    global counter

    dataReader = DataReader("./data/" + get_last_file()[0])

    if target == "":
        data = dataReader.data_generator()
        data.fillna(0, inplace=True)
        remove = get_similar_column(get_value_instruction(instruction), data)
        y = data[remove]
        del data[remove]
        le = preprocessing.LabelEncoder()
        y = le.fit_transform(y)

    kpca = KernelPCA(n_components=len(dataset.columns), kernel="rbf")
    data_modified = kpca.fit_transform(dataset)

    X_train, X_test, y_train, y_test = train_test_split(
        dataset, y, test_size=0.2, random_state=49)
    X_train_mod, X_test_mod, y_train_mod, y_test_mod = train_test_split(
        data_modified, y, test_size=0.2, random_state=49)

    # baseline tree on the untransformed features
    clf = tree.DecisionTreeClassifier()
    clf.fit(X_train, y_train)

    # tree on the KPCA-transformed features
    clf_mod = tree.DecisionTreeClassifier()
    clf_mod.fit(X_train_mod, y_train_mod)

    acc = [accuracy_score(clf_mod.predict(X_test_mod), y_test_mod)]
    for i, j in product(range(3, 10), ["entropy", "gini"]):
        model = tree.DecisionTreeClassifier(criterion=j, max_depth=i)
        model = model.fit(X_train_mod, y_train_mod)
        # score against the matching modified test split (was y_test, which
        # belongs to the unmodified split)
        acc.append(accuracy_score(model.predict(X_test_mod), y_test_mod))
    del i, j

    data_modified = pd.DataFrame(data_modified)
    # rows of data_modified are still in the original order, so attach the
    # full target directly (np.r_[y_train, y_test] would be in shuffled order)
    data_modified[target] = y
    # data_modified.to_csv("./data/housingPCA.csv")

    return data_modified, accuracy_score(
        clf.predict(X_test), y_test), max(acc), (
        len(dataset.columns) - len(data_modified.columns))


def booster(dataset, y, obj):
    # obj = ["reg:linear", "multi:softmax"]
    # y is now an explicit parameter; the original referenced an undefined
    # global
    X_train, X_test, y_train, y_test = train_test_split(
        dataset, y, test_size=0.2, random_state=49)
    clf = XGBClassifier(objective=obj, learning_rate=0.1, silent=1, alpha=10)
    clf.fit(X_train, y_train)
    # score on the split produced above (was X_test_mod / y_test_mod, which
    # are undefined in this scope)
    return accuracy_score(clf.predict(X_test), y_test)
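
# A minimal, self-contained sketch of the comparison dimensionality_KPCA
# performs, using scikit-learn on synthetic data (the sizes and names below
# are illustrative, not part of this module's API):
def _example_kpca_vs_baseline():
    from sklearn.datasets import make_classification
    from sklearn.decomposition import KernelPCA
    from sklearn.metrics import accuracy_score
    from sklearn.model_selection import train_test_split
    from sklearn.tree import DecisionTreeClassifier

    X, y = make_classification(n_samples=300, n_features=10, random_state=49)
    X_kpca = KernelPCA(n_components=5, kernel="rbf").fit_transform(X)

    # fit the same tree on raw and KPCA-transformed features and compare
    for name, features in [("raw", X), ("kpca", X_kpca)]:
        X_tr, X_te, y_tr, y_te = train_test_split(
            features, y, test_size=0.2, random_state=49)
        clf = DecisionTreeClassifier().fit(X_tr, y_tr)
        print(name, accuracy_score(clf.predict(X_te), y_te))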
def decision_tree(instruction,
                  dataset=None,
                  preprocess=True,
                  mca_threshold=None,
                  test_size=0.2,
                  drop=None):
    logger("Reading in dataset....")

    dataReader = DataReader(dataset)
    data = dataReader.data_generator()

    if drop is not None:
        data.drop(drop, axis=1, inplace=True)

    data, y, remove, full_pipeline = initial_preprocesser(
        data, instruction, preprocess, mca_threshold)
    logger("->", "Target Column Found: {}".format(remove))

    X_train = data['train']
    y_train = y['train']
    X_test = data['test']
    y_test = y['test']

    # Needed to make a custom label encoder due to train test split changes.
    # Can still be inverse transformed, just a bit of extra work.
    y_vals = np.unique(pd.concat([y['train'], y['test']], axis=0))
    label_mappings = {}
    for i in range(len(y_vals)):
        label_mappings[y_vals[i]] = i

    y_train = y_train.apply(lambda x: label_mappings[x]).values
    y_test = y_test.apply(lambda x: label_mappings[x]).values

    # fitting and storing
    logger("Fitting Decision Tree...")
    clf = tree.DecisionTreeClassifier()
    clf = clf.fit(X_train, y_train)

    score = accuracy_score(clf.predict(X_test), y_test)
    logger("->", "Score found on testing set: {}".format(score))
    print("")
    logger("Stored model under 'decision_tree' key")
    clearLog()

    return {
        'id': generate_id(),
        "model": clf,
        "target": remove,
        "accuracy_score": score,
        "preprocesser": full_pipeline,
        "interpreter": label_mappings,
        "cross_val_score": cross_val_score(clf, X_train, y_train, cv=3)
    }
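
# Example usage of decision_tree (a sketch -- the file path and instruction
# string are hypothetical; any tabular dataset with a categorical target
# should work):
def _example_decision_tree():
    result = decision_tree("predict the species", dataset="./data/iris.csv")
    print(result["accuracy_score"], result["cross_val_score"].mean())
    # 'interpreter' maps label -> integer; invert it to decode predictions
    decode = {v: k for k, v in result["interpreter"].items()}
    return decode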
def dimensionality_RF(instruction, dataset, target="", y="", n_features=10):
    global currLog
    global counter

    dataReader = DataReader("./data/" + get_last_file()[0])

    if target == "":
        data = dataReader.data_generator()
        data.fillna(0, inplace=True)
        remove = get_similar_column(get_value_instruction(instruction), data)
        data = structured_preprocesser(data)
        y = data[remove]
        del data[remove]
        le = preprocessing.LabelEncoder()
        y = le.fit_transform(y)

    X_train, X_test, y_train, y_test = train_test_split(
        dataset, y, test_size=0.2, random_state=49)

    # baseline tree on the full feature set
    first_classifier = tree.DecisionTreeClassifier()
    first_classifier.fit(X_train, y_train)
    first_classifier_acc = accuracy_score(
        first_classifier.predict(X_test), y_test)

    accuracy_scores = [first_classifier_acc]
    columns = [[]]
    datas = [dataset]

    # sweep forest depth and number of retained features, keeping the
    # highest-importance columns at each step
    for i, x in product(range(3, 10), range(4, len(dataset.columns))):
        feature_model = RandomForestRegressor(random_state=1, max_depth=i)
        feature_model.fit(X_train, y_train)

        importances = feature_model.feature_importances_
        indices = np.argsort(importances)[-x:]
        columns.append(dataset.columns[indices])

        X_temp_train = X_train[dataset.columns[indices]]
        X_temp_test = X_test[dataset.columns[indices]]

        val = pd.DataFrame(np.r_[X_temp_train, X_temp_test])
        val[target] = np.r_[y_train, y_test]
        datas.append(val)

        vr = tree.DecisionTreeClassifier()
        vr.fit(X_temp_train, y_train)
        accuracy_scores.append(accuracy_score(vr.predict(X_temp_test), y_test))

    the_index = accuracy_scores.index(max(accuracy_scores))

    return datas[the_index], accuracy_scores[0], max(accuracy_scores), list(
        columns[the_index])
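
# Self-contained sketch of the importance-based column selection loop above
# (synthetic data, scikit-learn only; note the module uses a
# RandomForestRegressor for importances, while this sketch uses a classifier
# since its target is categorical):
def _example_importance_selection():
    import numpy as np
    from sklearn.datasets import make_classification
    from sklearn.ensemble import RandomForestClassifier
    from sklearn.metrics import accuracy_score
    from sklearn.model_selection import train_test_split
    from sklearn.tree import DecisionTreeClassifier

    X, y = make_classification(n_samples=400, n_features=12, random_state=1)
    X_tr, X_te, y_tr, y_te = train_test_split(
        X, y, test_size=0.2, random_state=49)

    forest = RandomForestClassifier(random_state=1, max_depth=5).fit(X_tr, y_tr)
    top = np.argsort(forest.feature_importances_)[-6:]  # keep the 6 best features
    clf = DecisionTreeClassifier().fit(X_tr[:, top], y_tr)
    print(accuracy_score(clf.predict(X_te[:, top]), y_te))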
def nearest_neighbors(instruction=None,
                      dataset=None,
                      mca_threshold=None,
                      preprocess=True,
                      drop=None,
                      min_neighbors=3,
                      max_neighbors=10):
    logger("Reading in dataset....")

    dataReader = DataReader(dataset)
    data = dataReader.data_generator()

    if drop is not None:
        data.drop(drop, axis=1, inplace=True)

    data, y, remove, full_pipeline = initial_preprocesser(
        data, instruction, preprocess, mca_threshold)
    logger("->", "Target Column Found: {}".format(remove))

    X_train = data['train']
    y_train = y['train']
    X_test = data['test']
    y_test = y['test']

    num_classes = len(np.unique(y))

    # encodes the labels into integers with a custom mapping so they can
    # still be inverse transformed later
    y_vals = np.unique(pd.concat([y['train'], y['test']], axis=0))
    label_mappings = {}
    for i in range(len(y_vals)):
        label_mappings[y_vals[i]] = i

    y_train = y_train.apply(lambda x: label_mappings[x]).values
    y_test = y_test.apply(lambda x: label_mappings[x]).values

    models = []
    scores = []
    logger("Fitting Nearest Neighbor...")
    logger("Identifying optimal number of neighbors...")

    # tries every neighbor count, based on either defaults or user-specified
    # values
    for x in range(min_neighbors, max_neighbors):
        knn = KNeighborsClassifier(n_neighbors=x)
        knn.fit(X_train, y_train)
        models.append(knn)
        scores.append(accuracy_score(knn.predict(X_test), y_test))

    logger("Stored model under 'nearest_neighbors' key")
    # the best model is the one with the highest accuracy (was min(scores),
    # which picked the worst model and returned its index instead of its score)
    knn = models[scores.index(max(scores))]
    clearLog()

    return {
        'id': generate_id(),
        "model": knn,
        "accuracy_score": max(scores),
        "preprocesser": full_pipeline,
        "interpreter": label_mappings,
        "target": remove,
        "cross_val_score": cross_val_score(knn, X_train, y_train, cv=3)
    }
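
# Minimal sketch of the neighbor sweep nearest_neighbors runs, on a toy
# dataset (scikit-learn only; the 3..10 range mirrors the defaults above):
def _example_neighbor_sweep():
    from sklearn.datasets import load_iris
    from sklearn.metrics import accuracy_score
    from sklearn.model_selection import train_test_split
    from sklearn.neighbors import KNeighborsClassifier

    X, y = load_iris(return_X_y=True)
    X_tr, X_te, y_tr, y_te = train_test_split(
        X, y, test_size=0.2, random_state=49)
    scores = {}
    for k in range(3, 10):
        knn = KNeighborsClassifier(n_neighbors=k).fit(X_tr, y_tr)
        scores[k] = accuracy_score(knn.predict(X_te), y_te)
    best_k = max(scores, key=scores.get)  # highest accuracy wins
    print(best_k, scores[best_k])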
def regression_ann(
        instruction,
        mca_threshold=None,
        dataset=None,
        drop=None,
        preprocess=True,
        test_size=0.2,
        random_state=49,
        epochs=50,
        generate_plots=True,
        callback_mode='min',
        maximizer="val_loss",
        save_model=True,
        save_path=os.getcwd()):
    global currLog
    logger("reading in dataset...")

    dataReader = DataReader(dataset)
    data = dataReader.data_generator()

    if drop is not None:
        data.drop(drop, axis=1, inplace=True)

    data, y, target, full_pipeline = initial_preprocesser(
        data, instruction, preprocess, mca_threshold)
    logger("->", "Target Column Found: {}".format(target))

    X_train = data['train']
    X_test = data['test']

    # target scaling
    target_scaler = StandardScaler()
    y_train = target_scaler.fit_transform(np.array(y['train']).reshape(-1, 1))
    y_test = target_scaler.transform(np.array(y['test']).reshape(-1, 1))

    logger("establishing callback function...")

    models = []
    losses = []
    model_data = []

    # callback function to store the lowest loss value
    es = EarlyStopping(
        monitor=maximizer,
        mode=callback_mode,
        verbose=0,
        patience=5)

    i = 0

    # get the first 3-layer model
    model = get_keras_model_reg(data, i)

    logger("training initial model...")
    history = model.fit(
        X_train,
        y_train,
        epochs=epochs,
        validation_data=(X_test, y_test),
        callbacks=[es],
        verbose=0)
    models.append(history)
    model_data.append(model)

    logger("->", "Initial number of layers " + str(len(model.layers)))
    logger("->", "Training Loss: " + str(history.history['loss'][-1]), '|')
    logger("->", "Test Loss: " + str(history.history['val_loss'][-1]), '|')
    print("")

    losses.append(history.history[maximizer][-1])
    i += 1  # the initial model already used i == 0

    # keeps building deeper models until the validation loss stops decreasing
    logger("testing number of layers...")
    print(currLog)
    while all(x > y for x, y in zip(losses, losses[1:])):
        model = get_keras_model_reg(data, i)
        history = model.fit(
            X_train,
            y_train,
            epochs=epochs,
            validation_data=(X_test, y_test),
            callbacks=[es],  # was missing here, unlike the initial fit
            verbose=0)
        model_data.append(model)
        models.append(history)

        logger("->", "Current number of layers: " + str(len(model.layers)))
        logger("->", "Training Loss: " + str(history.history['loss'][-1]), '|')
        logger("->", "Test Loss: " + str(history.history['val_loss'][-1]), '|')
        print("")

        losses.append(history.history[maximizer][-1])
        i += 1

    final_model = model_data[losses.index(min(losses))]
    final_hist = models[losses.index(min(losses))]
    logger('->', "Best number of layers found: " +
           str(len(final_model.layers)))
    logger('->', "Training Loss: " + str(final_hist.history['loss'][-1]))
    logger('->', "Test Loss: " + str(final_hist.history['val_loss'][-1]))

    # calls function to generate plots in plot generation
    plots = None
    if generate_plots:
        init_plots, plot_names = generate_regression_plots(
            models[-1], data, y)
        plots = {}
        for x in range(len(plot_names)):
            plots[str(plot_names[x])] = init_plots[x]

    if save_model:
        save(final_model, save_model)

    # stores values in the client object's models dictionary field
    print("")
    logger("Stored model under 'regression_ANN' key")
    return {
        'id': generate_id(),
        'model': final_model,
        "target": target,
        "plots": plots,
        "preprocesser": full_pipeline,
        "interpreter": target_scaler,
        'losses': {
            'training_loss': final_hist.history['loss'],
            'val_loss': final_hist.history['val_loss']}}
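
# Hedged sketch of the "grow until val_loss stops improving" loop above, with
# a plain Keras model standing in for get_keras_model_reg (layer widths,
# epochs, and data are illustrative):
def _example_layer_growth():
    import numpy as np
    from tensorflow import keras

    def build_reg(n_hidden, n_features):
        model = keras.Sequential()
        model.add(keras.Input(shape=(n_features,)))
        for _ in range(n_hidden):
            model.add(keras.layers.Dense(64, activation="relu"))
        model.add(keras.layers.Dense(1))
        model.compile(optimizer="adam", loss="mse")
        return model

    rng = np.random.default_rng(49)
    X = rng.normal(size=(200, 8))
    y = X @ rng.normal(size=8)  # synthetic linear target

    losses = []
    for depth in range(1, 5):
        hist = build_reg(depth, 8).fit(
            X, y, validation_split=0.2, epochs=10, verbose=0)
        losses.append(hist.history["val_loss"][-1])
        if len(losses) > 1 and losses[-1] >= losses[-2]:
            break  # validation loss stopped improving
    print(losses)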
def classification_ann(instruction,
                       dataset=None,
                       mca_threshold=None,
                       preprocess=True,
                       callback_mode='min',
                       drop=None,
                       random_state=49,
                       test_size=0.2,
                       epochs=50,
                       generate_plots=True,
                       maximizer="val_loss",
                       save_model=True,
                       save_path=os.getcwd()):
    global currLog
    logger("reading in dataset...")

    dataReader = DataReader(dataset)
    data = dataReader.data_generator()

    if drop is not None:
        data.drop(drop, axis=1, inplace=True)

    data, y, remove, full_pipeline = initial_preprocesser(
        data, instruction, preprocess, mca_threshold)
    logger("->", "Target Column Found: {}".format(remove))

    # Needed to make a custom label encoder due to train test split changes.
    # Can still be inverse transformed, just a bit of extra work.
    y = pd.concat([y['train'], y['test']], axis=0)

    num_classes = len(np.unique(y))

    X_train = data['train']
    X_test = data['test']

    # the ANN needs a one-hot encoded target for classification
    one_hot_encoder = OneHotEncoder()
    y = pd.DataFrame(
        one_hot_encoder.fit_transform(
            np.reshape(y.values, (-1, 1))).toarray(),
        columns=one_hot_encoder.get_feature_names())

    y_train = y.iloc[:len(X_train)]
    y_test = y.iloc[len(X_train):]

    models = []
    losses = []
    accuracies = []
    model_data = []

    logger("establishing callback function...")
    # early stopping callback (now honors the callback_mode argument; this
    # was hard-coded to 'min' before)
    es = EarlyStopping(
        monitor=maximizer,
        mode=callback_mode,
        verbose=0,
        patience=5)

    i = 0
    model = get_keras_model_class(data, i, num_classes)

    logger("training initial model...")
    history = model.fit(
        X_train,
        y_train,
        epochs=epochs,
        validation_data=(X_test, y_test),
        callbacks=[es],
        verbose=0)
    model_data.append(model)
    models.append(history)

    logger("->", "Initial number of layers " + str(len(model.layers)))
    logger("->", "Training Loss: " + str(history.history['loss'][-1]), '|')
    logger("->", "Test Loss: " + str(history.history['val_loss'][-1]), '|')
    print("")

    losses.append(history.history[maximizer][-1])
    accuracies.append(history.history['val_accuracy'][-1])
    i += 1  # the initial model already used i == 0

    # keeps building deeper models until the validation loss stops decreasing
    logger("testing number of layers...")
    while all(x > y for x, y in zip(losses, losses[1:])):
        model = get_keras_model_class(data, i, num_classes)
        history = model.fit(
            X_train,
            y_train,
            epochs=epochs,
            validation_data=(X_test, y_test),
            callbacks=[es],
            verbose=0)
        model_data.append(model)
        models.append(history)

        logger("->", "Current number of layers: " + str(len(model.layers)))
        logger("->", "Training Loss: " + str(history.history['loss'][-1]), '|')
        logger("->", "Test Loss: " + str(history.history['val_loss'][-1]), '|')
        print("")

        losses.append(history.history[maximizer][-1])
        accuracies.append(history.history['val_accuracy'][-1])
        i += 1

    final_model = model_data[losses.index(min(losses))]
    final_hist = models[losses.index(min(losses))]
    logger('->', "Best number of layers found: " +
           str(len(final_model.layers)))
    logger('->', "Training Accuracy: " +
           str(final_hist.history['accuracy'][-1]))
    logger('->', "Test Accuracy: " +
           str(final_hist.history['val_accuracy'][-1]))

    # generates appropriate classification plots by feeding all information
    plots = generate_classification_plots(
        models[-1], data, y, model, X_test, y_test)

    if save_model:
        save(final_model, save_model)

    print("")
    logger("Stored model under 'classification_ANN' key")

    # stores the values and plots into the object dictionary
    return {
        'id': generate_id(),
        "model": final_model,
        'num_classes': num_classes,
        "plots": plots,
        "target": remove,
        "preprocesser": full_pipeline,
        "interpreter": one_hot_encoder,
        'losses': {
            'training_loss': final_hist.history['loss'],
            'val_loss': final_hist.history['val_loss']},
        'accuracy': {
            'training_accuracy': final_hist.history['accuracy'],
            'validation_accuracy': final_hist.history['val_accuracy']}}
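
# Sketch of the one-hot target preparation classification_ann uses
# (get_feature_names matches older scikit-learn releases, as used in this
# module; newer releases renamed it to get_feature_names_out):
def _example_one_hot_target():
    import numpy as np
    import pandas as pd
    from sklearn.preprocessing import OneHotEncoder

    y = pd.Series(["cat", "dog", "cat", "bird"])
    enc = OneHotEncoder()
    onehot = pd.DataFrame(
        enc.fit_transform(np.reshape(y.values, (-1, 1))).toarray(),
        columns=enc.get_feature_names())
    # enc.inverse_transform(onehot) recovers the original string labels
    return onehot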
def k_means_clustering(dataset=None,
                       preprocess=True,
                       generate_plots=True,
                       drop=None,
                       base_clusters=1):
    logger("Reading dataset...")

    dataReader = DataReader(dataset)
    data = dataReader.data_generator()

    if drop is not None:
        data.drop(drop, axis=1, inplace=True)

    dataPandas = data.copy()

    full_pipeline = None
    if preprocess:
        logger("Preprocessing data...")
        data, full_pipeline = clustering_preprocessor(data)
        data = np.array(data)

    modelStorage = []
    inertiaStor = []

    # runs KMeans on the base cluster count as a baseline
    i = base_clusters
    logger("Creating unsupervised clustering task...")
    kmeans = KMeans(n_clusters=i, random_state=0).fit(data)
    modelStorage.append(kmeans)

    # stores SSE (inertia) values in an array for later comparison
    inertiaStor.append(kmeans.inertia_)
    i += 1

    logger("Identifying best centroid count and optimizing accuracy")
    # continues to increase the cluster count until the SSE stops decreasing
    # by at least 1000 -- a threshold chosen based on precedence
    while all(earlier >= later
              for earlier, later in zip(inertiaStor, inertiaStor[1:])):
        kmeans = KMeans(n_clusters=i, random_state=0).fit(data)
        modelStorage.append(kmeans)
        inertiaStor.append(kmeans.inertia_)
        i += 1

        # checks whether the SSE improvement has fallen below the threshold;
        # this stopping rule could be improved
        if i > 3 and inertiaStor[-2] - 1000 <= inertiaStor[-1]:
            break

    # i has been incremented past the last fit, so the chosen count is i - 1
    logger("->", "Optimal number of clusters found: {}".format(i - 1))

    # generates the clustering plots appropriately
    plots = None
    if generate_plots:
        logger("Generating plots and storing in model")
        init_plots, plot_names = generate_clustering_plots(
            modelStorage[-1], dataPandas, data)

        plots = {}
        for x in range(len(plot_names)):
            plots[str(plot_names[x])] = init_plots[x]

    print("")
    logger("Stored model under 'k_means_clustering' key")
    clearLog()

    # stores plots and information in the client model dictionary
    return {
        'id': generate_id(),
        "model": modelStorage[-1],
        "preprocesser": full_pipeline,
        "plots": plots
    }
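
# Self-contained sketch of the inertia-based stopping rule above, on synthetic
# blobs (the 1000 threshold mirrors the heuristic in k_means_clustering; the
# data and cluster counts are illustrative):
def _example_inertia_elbow():
    from sklearn.cluster import KMeans
    from sklearn.datasets import make_blobs

    data, _ = make_blobs(n_samples=500, centers=4, random_state=0)
    inertias = []
    k = 1
    while True:
        inertias.append(KMeans(n_clusters=k, random_state=0).fit(data).inertia_)
        if len(inertias) > 2 and inertias[-2] - 1000 <= inertias[-1]:
            break
        k += 1
    print("chosen k:", k - 1)  # the last fit no longer improved enough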
def train_svm(instruction,
              dataset=None,
              test_size=0.2,
              kernel='linear',
              preprocess=True,
              mca_threshold=None,
              drop=None,
              cross_val_size=0.3):
    logger("Reading in dataset....")

    dataReader = DataReader(dataset)
    data = dataReader.data_generator()

    if drop is not None:
        data.drop(drop, axis=1, inplace=True)

    data, y, target, full_pipeline = initial_preprocesser(
        data, instruction, preprocess, mca_threshold)
    logger("->", "Target Column Found: {}".format(target))

    X_train = data['train']
    y_train = y['train']
    X_test = data['test']
    y_test = y['test']

    num_classes = len(np.unique(y))

    # Needed to make a custom label encoder due to train test split changes.
    # Can still be inverse transformed, just a bit of extra work.
    y_vals = np.unique(pd.concat([y['train'], y['test']], axis=0))
    label_mappings = {}
    for i in range(len(y_vals)):
        label_mappings[y_vals[i]] = i

    y_train = y_train.apply(lambda x: label_mappings[x]).values
    y_test = y_test.apply(lambda x: label_mappings[x]).values

    # fitting the SVM and storing it in the model dictionary
    logger("Fitting Support Vector Machine")
    clf = svm.SVC(kernel=kernel)
    clf.fit(X_train, y_train)

    score = accuracy_score(clf.predict(X_test), y_test)
    logger("->", "Accuracy found on testing set: {}".format(score))
    logger("Stored model under 'svm' key")
    clearLog()

    return {
        'id': generate_id(),
        "model": clf,
        "accuracy_score": score,
        "target": target,
        "preprocesser": full_pipeline,
        "interpreter": label_mappings,
        "cross_val_score": cross_val_score(clf, X_train, y_train)
    }
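
# Example usage of train_svm (a sketch -- the path and instruction string are
# hypothetical; kernel='rbf' is a common choice when the classes are not
# linearly separable):
def _example_train_svm():
    result = train_svm("predict the species",
                       dataset="./data/iris.csv",
                       kernel="rbf")
    print(result["accuracy_score"], result["cross_val_score"].mean())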
def dimensionality_reduc(instruction,
                         dataset,
                         arr=["RF", "PCA", "KPCA", "ICA"],
                         inplace=False):
    global currLog
    global counter

    dataReader = DataReader(dataset)

    logger("loading dataset...")
    data = dataReader.data_generator()
    data.fillna(0, inplace=True)

    logger("getting most similar column from instruction...")
    target = get_similar_column(get_value_instruction(instruction), data)

    y = data[target]
    del data[target]
    le = preprocessing.LabelEncoder()
    y = le.fit_transform(y)

    data = structured_preprocesser(data)

    perms = []
    overall_storage = []
    finals = []

    logger("generating dimensionality permutations...")
    for i in range(1, len(arr) + 1):
        for elem in list(permutations(arr, i)):
            perms.append(elem)

    logger("running each possible permutation...")
    logger("realigning tensors...")
    for path in perms:
        currSet = data
        for element in path:
            if element == "RF":
                data_mod, beg_acc, final_acc, col_removed = dimensionality_RF(
                    instruction, currSet, target, y)
            elif element == "PCA":
                data_mod, beg_acc, final_acc, col_removed = dimensionality_PCA(
                    instruction, currSet, target, y)
            elif element == "KPCA":
                data_mod, beg_acc, final_acc, col_removed = dimensionality_KPCA(
                    instruction, currSet, target, y)
            elif element == "ICA":
                data_mod, beg_acc, final_acc, col_removed = dimensionality_ICA(
                    instruction, currSet, target, y)
            overall_storage.append(
                list([data_mod, beg_acc, final_acc, col_removed]))
            currSet = data_mod
        finals.append(overall_storage[-1])

    logger("Fetching Best Accuracies...")
    accs = []
    print("")
    print("Baseline Accuracy: " + str(finals[0][1]))
    print("----------------------------")
    # pair each permutation with its own result (was product(), which crossed
    # every index with every result)
    for i, element in enumerate(finals):
        print("Permutation --> " +
              str(perms[i]) +
              " | Final Accuracy --> " +
              str(element[2]))
        if finals[0][1] < element[2]:
            accs.append(list([
                "Permutation --> " +
                str(perms[i]) +
                " | Final Accuracy --> " +
                str(element[2])
            ]))

    print("")
    print("Best Accuracies")
    print("----------------------------")
    for element in accs:
        print(element)

    if inplace:
        data.to_csv(dataset)
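
# Sketch of the permutation generation dimensionality_reduc sweeps over:
def _example_reduction_permutations():
    from itertools import permutations

    arr = ["RF", "PCA", "KPCA", "ICA"]
    perms = [p for i in range(1, len(arr) + 1) for p in permutations(arr, i)]
    # 4 + 12 + 24 + 24 = 64 orderings, each applied as a reduction pipeline
    print(len(perms))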
def tune_helper(model_to_tune=None,
                dataset=None,
                models=None,
                max_layers=10,
                min_layers=2,
                min_dense=32,
                max_dense=512,
                executions_per_trial=3,
                max_trials=1,
                activation='relu',
                loss='categorical_crossentropy',
                metrics='accuracy'):
    logger("Getting target model for tuning...")

    # checks which requested model is present in self.models

    # processing for the regression feed-forward NN
    if model_to_tune == 'regression_ANN':
        logger("Tuning model hyperparameters")
        dataReader = DataReader(dataset)
        data = dataReader.data_generator()
        target = models['regression_ANN']['target']
        target_column = data[models['regression_ANN']['target']]
        data = models['regression_ANN']['preprocesser'].transform(
            data.drop(target, axis=1))
        returned_model = tuneReg(
            data,
            target_column,
            max_layers=max_layers,
            min_layers=min_layers,
            min_dense=min_dense,
            max_dense=max_dense,
            executions_per_trial=executions_per_trial,
            max_trials=max_trials)
        models['regression_ANN'] = {'model': returned_model}
        return returned_model

    # processing for the classification feed-forward NN
    if model_to_tune == "classification_ANN":
        logger("Tuning model hyperparameters")
        dataReader = DataReader(dataset)
        data = dataReader.data_generator()
        target = models['classification_ANN']['target']
        target_column = data[models['classification_ANN']['target']]
        data = models['classification_ANN']['preprocesser'].transform(
            data.drop(target, axis=1))
        returned_model = tuneClass(
            data,
            target_column,
            models['classification_ANN']['num_classes'],
            max_layers=max_layers,
            min_layers=min_layers,
            min_dense=min_dense,
            max_dense=max_dense,
            executions_per_trial=executions_per_trial,
            max_trials=max_trials,
            activation=activation,
            loss=loss,
            metrics=metrics)
        models['classification_ANN'] = {'model': returned_model}
        return returned_model

    # processing for the convolutional NN
    if model_to_tune == "convolutional_NN":
        logger("Tuning model hyperparameters")
        X = models['convolutional_NN']["X"]
        y = models['convolutional_NN']["y"]
        model = tuneCNN(
            np.asarray(X),
            np.asarray(y),
            models["convolutional_NN"]["num_classes"])
        models["convolutional_NN"]["model"] = model

    return models
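
# Example usage of tune_helper (a sketch -- the path is hypothetical, and
# 'models' stands in for whatever dictionary holds a previously trained
# 'regression_ANN' entry):
def _example_tune_regression(models):
    tuned = tune_helper(model_to_tune="regression_ANN",
                        dataset="./data/housing.csv",
                        models=models,
                        max_trials=3)
    return tuned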