def dimensionality_RF(instruction, dataset, target="", y="", n_features=10):
    global counter

    dataReader = DataReader(dataset)

    if target == "":
        data = dataReader.data_generator()
        data.fillna(0, inplace=True)
        remove = get_similar_column(get_value_instruction(instruction), data)
        data, y, target, full_pipeline = initial_preprocesser(
            data, instruction, True, 0.2, [], 0.2, random_state=49)

    le = preprocessing.LabelEncoder()

    X_train = data['train']
    y_train = y['train']
    X_test = data['test']
    y_test = y['test']

    y_train = le.fit_transform(y_train)
    # transform (not fit_transform) so train and test share one label mapping
    y_test = le.transform(y_test)

    first_classifier = tree.DecisionTreeClassifier()
    first_classifier.fit(X_train, y_train)

    first_classifier_acc = accuracy_score(
        first_classifier.predict(X_test), y_test)

    accuracy_scores = [first_classifier_acc]
    columns = []
    datas = []
    datas.append(dataset)
    columns.append([])

    for i, x in product(range(3, 10), range(4, len(X_train.columns))):
        feature_model = RandomForestRegressor(random_state=1, max_depth=x)
        feature_model.fit(X_train, y_train)

        importances = feature_model.feature_importances_
        indices = np.argsort(importances)[-x:]
        columns.append(X_train.columns[indices])

        X_temp_train = X_train[X_train.columns[indices]]
        X_temp_test = X_test[X_train.columns[indices]]

        val = pd.DataFrame(np.r_[X_temp_train, X_temp_test])
        val[target] = np.r_[y_train, y_test]
        datas.append(val)

        vr = tree.DecisionTreeClassifier()
        vr.fit(X_temp_train, y_train)

        accuracy_scores.append(
            accuracy_score(vr.predict(X_temp_test), y_test))

    the_index = accuracy_scores.index(max(accuracy_scores))
    print(accuracy_scores)

    return datas[the_index], accuracy_scores[0], max(
        accuracy_scores), list(columns[the_index])

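# A minimal usage sketch (not from the library's docs; the CSV path and
# instruction are hypothetical). dimensionality_RF returns the best reduced
# dataset, the baseline accuracy, the best accuracy, and the chosen columns:
#
#   reduced_data, base_acc, best_acc, best_cols = dimensionality_RF(
#       "predict outcome", "patients.csv")
#   print(best_cols, best_acc - base_acc)
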
def dimensionality_KPCA(instruction, dataset, target="", y=""):
    '''
    function to reduce dimensionality in the dataset via kernel principal component analysis
    :param instruction: command sent to client instance in written query.
    :param dataset: data instantiated in client instance passed to the algorithm
    :param target: column name of response variable/feature
    :param y: dictionary of train/test data values associated with the response variable/feature
    '''

    pca = KernelPCA(kernel='rbf')

    dataReader = DataReader(dataset)
    dataset = dataReader.data_generator()

    data, y, target, full_pipeline = initial_preprocesser(
        dataset, instruction, True, 0.2, [], 0.2, random_state=49)

    X_train = data['train']
    X_test = data['test']

    y_train = y['train']
    y_test = y['test']

    X_train_mod = pca.fit_transform(X_train)
    X_test_mod = pca.transform(X_test)

    clf = tree.DecisionTreeClassifier()
    clf_mod = tree.DecisionTreeClassifier()

    clf.fit(X_train, y_train)
    clf_mod.fit(X_train_mod, y_train)

    acc = []
    acc.append(accuracy_score(clf_mod.predict(X_test_mod), y_test))
    for i, j in product(range(3, 10), ["entropy", "gini"]):
        model = tree.DecisionTreeClassifier(criterion=j, max_depth=i)
        model = model.fit(X_train_mod, y_train)
        acc.append(accuracy_score(model.predict(X_test_mod), y_test))
    del i, j

    data_modified = pd.concat(
        [pd.DataFrame(X_train_mod), pd.DataFrame(X_test_mod)], axis=0)

    y_combined = np.r_[y_train, y_test]
    data_modified[target] = y_combined
    # data_modified.to_csv("./data/housingPCA.csv")

    return data_modified, accuracy_score(
        clf.predict(X_test), y_test), max(acc), (
        len(dataset.columns) - len(data_modified.columns))

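# Usage sketch (hypothetical path and instruction): the last element of the
# returned tuple is the number of columns removed by the kernel PCA step.
#
#   reduced_data, base_acc, best_acc, n_removed = dimensionality_KPCA(
#       "predict outcome", "patients.csv")
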
def dimensionality_PCA(instruction, dataset, ca_threshold=None):
    '''
    function to reduce dimensionality in the dataset via principal component analysis
    :param instruction: command sent to client instance in written query.
    :param dataset: data instantiated in client instance passed to the algorithm
    :param ca_threshold: percentage of the dataset to be preprocessed using morphological component analysis
    '''

    global counter

    pca = PCA(0.92)

    dataReader = DataReader(dataset)
    dataset = dataReader.data_generator()

    data, y, target, full_pipeline = initial_preprocesser(
        dataset, instruction, True, 0.2, [], 0.2, random_state=49)

    X_train = data['train']
    X_test = data['test']

    y_train = y['train']
    y_test = y['test']

    X_train_mod = pca.fit_transform(X_train)
    X_test_mod = pca.transform(X_test)

    clf = tree.DecisionTreeClassifier()
    clf_mod = tree.DecisionTreeClassifier()

    clf.fit(X_train, y_train)
    clf_mod.fit(X_train_mod, y_train)

    acc = []
    acc.append(accuracy_score(clf_mod.predict(X_test_mod), y_test))
    for i, j in product(range(3, 10), ["entropy", "gini"]):
        model = tree.DecisionTreeClassifier(criterion=j, max_depth=i)
        model = model.fit(X_train_mod, y_train)
        acc.append(accuracy_score(model.predict(X_test_mod), y_test))
    del i, j

    data_modified = pd.concat(
        [pd.DataFrame(X_train_mod), pd.DataFrame(X_test_mod)], axis=0)

    y_combined = np.r_[y_train, y_test]
    data_modified[target] = y_combined
    # data_modified.to_csv("./data/housingPCA.csv")

    return data_modified, accuracy_score(
        clf.predict(X_test), y_test), max(acc), (
        len(dataset.columns) - len(data_modified.columns))

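# Usage sketch (hypothetical inputs): PCA(0.92) keeps as many components as
# are needed to explain 92% of the variance, so the column delta in the
# returned tuple varies with the dataset.
#
#   reduced_data, base_acc, best_acc, n_removed = dimensionality_PCA(
#       "predict outcome", "patients.csv")
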
def train_xgboost(instruction,
                  dataset=None,
                  learning_rate=0.1,
                  n_estimators=1000,
                  ca_threshold=None,
                  max_depth=6,
                  min_child_weight=1,
                  gamma=0,
                  subsample=0.8,
                  colsample_bytree=0.8,
                  objective='binary:logistic',
                  random_state=27,
                  test_size=0.2,
                  text=[],
                  preprocess=True,
                  verbosity=0,
                  drop=None):
    '''
    function to train an XGBoost classification algorithm
    :param many params: used to hyperparameterize the function.
    :return a dictionary object with all of the information for the algorithm.
    '''

    logger("Reading in dataset")

    dataReader = DataReader(dataset)
    data = dataReader.data_generator()

    if drop is not None:
        data.drop(drop, axis=1, inplace=True)

    logger("Preprocessing data")
    data, y, target, full_pipeline = initial_preprocesser(
        data, instruction, preprocess, ca_threshold, text,
        test_size=test_size, random_state=random_state)
    logger("->", "Target column found: {}".format(target))

    X_train = data['train']
    y_train = y['train']
    X_test = data['test']
    y_test = y['test']

    # classification_column = get_similar_column(getLabelwithInstruction(instruction), data)
    num_classes = len(np.unique(y))
    # the binary objective only handles two classes; switch to softmax for
    # multiclass targets
    if num_classes > 2:
        objective = 'multi:softmax'

    # Needed to make a custom label encoder due to train test split changes
    # Can still be inverse transformed, just a bit of extra work
    y_vals = np.unique(pd.concat([y['train'], y['test']], axis=0))
    label_mappings = sklearn.preprocessing.LabelEncoder()
    label_mappings.fit(y_vals)

    y_train = label_mappings.transform(y_train)
    y_test = label_mappings.transform(y_test)

    # Fitting the XGBoost classifier and storing it in the model dictionary
    logger("Fitting XGBoost")
    clf = XGBClassifier(
        learning_rate=learning_rate,
        n_estimators=n_estimators,
        max_depth=max_depth,
        min_child_weight=min_child_weight,
        gamma=gamma,
        subsample=subsample,
        colsample_bytree=colsample_bytree,
        objective=objective,
        verbosity=verbosity,
        random_state=random_state)
    clf.fit(X_train, y_train)

    score = accuracy_score(clf.predict(X_test), y_test)

    logger("->", "Accuracy found on testing set: {}".format(score))
    logger('->', "Stored model under 'xgboost' key")

    clearLog()

    return {
        'id': generate_id(),
        "model": clf,
        "target": target,
        'num_classes': num_classes,
        "accuracy": {
            'cross_val_score': cross_val_score(clf, X_train, y_train),
            'accuracy_score': score
        },
        "accuracy_score": score,
        "preprocesser": full_pipeline,
        "interpreter": label_mappings,
        'test_data': {
            'X': X_test,
            'y': y_test
        }
    }

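# Usage sketch (hypothetical CSV and instruction). Because the objective is
# switched to 'multi:softmax' automatically, the same call works for binary
# and multiclass targets:
#
#   result = train_xgboost("predict quality", dataset="wine.csv",
#                          n_estimators=200, max_depth=4)
#   print(result['accuracy']['accuracy_score'])
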
def decision_tree(instruction,
                  dataset=None,
                  preprocess=True,
                  ca_threshold=None,
                  text=[],
                  test_size=0.2,
                  drop=None,
                  criterion='gini',
                  splitter='best',
                  max_depth=None,
                  min_samples_split=2,
                  min_samples_leaf=1,
                  min_weight_fraction_leaf=0.0,
                  max_leaf_nodes=None,
                  min_impurity_decrease=0.0,
                  ccp_alpha=0.0):
    '''
    function to train a decision tree algorithm.
    :param many params: used to hyperparameterize the function.
    :return a dictionary object with all of the information for the algorithm.
    '''

    logger("Reading in dataset")

    dataReader = DataReader(dataset)
    data = dataReader.data_generator()

    logger("Preprocessing data")
    if drop is not None:
        data.drop(drop, axis=1, inplace=True)

    data, y, remove, full_pipeline = initial_preprocesser(
        data, instruction, preprocess, ca_threshold, text)
    logger("->", "Target column found: {}".format(remove))

    X_train = data['train']
    y_train = y['train']
    X_test = data['test']
    y_test = y['test']

    # classification_column = get_similar_column(getLabelwithInstruction(instruction), data)

    # Needed to make a custom label encoder due to train test split changes
    # Can still be inverse transformed, just a bit of extra work
    y_vals = np.unique(pd.concat([y['train'], y['test']], axis=0))
    label_mappings = sklearn.preprocessing.LabelEncoder()
    label_mappings.fit(y_vals)

    y_train = label_mappings.transform(y_train)
    y_test = label_mappings.transform(y_test)
    logger("Labels being mapped to appropriate classes")

    num_classes = len(np.unique(y))

    # fitting and storing
    logger("Fitting Decision Tree")

    clf = tree.DecisionTreeClassifier(
        criterion=criterion,
        splitter=splitter,
        max_depth=max_depth,
        min_samples_split=min_samples_split,
        min_samples_leaf=min_samples_leaf,
        min_weight_fraction_leaf=min_weight_fraction_leaf,
        max_leaf_nodes=max_leaf_nodes,
        min_impurity_decrease=min_impurity_decrease,
        ccp_alpha=ccp_alpha)
    clf = clf.fit(X_train, y_train)

    score = accuracy_score(clf.predict(X_test), y_test)

    logger("->", "Score found on testing set: {}".format(score))
    logger("Stored model under 'decision_tree' key")

    clearLog()

    return {
        'id': generate_id(),
        "model": clf,
        "target": remove,
        'num_classes': num_classes,
        "accuracy": {
            'cross_val_score': cross_val_score(clf, X_train, y_train, cv=3),
            'accuracy_score': score
        },
        "accuracy_score": score,
        "preprocesser": full_pipeline,
        "interpreter": label_mappings,
        'test_data': {
            'X': X_test,
            'y': y_test
        }
    }

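# Usage sketch (hypothetical inputs). The 'interpreter' key holds the fitted
# LabelEncoder, so predictions can be mapped back to the original labels:
#
#   result = decision_tree("predict survived", dataset="titanic.csv",
#                          max_depth=5)
#   preds = result['model'].predict(result['test_data']['X'])
#   labels = result['interpreter'].inverse_transform(preds)
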
def nearest_neighbors(instruction=None,
                      dataset=None,
                      ca_threshold=None,
                      preprocess=True,
                      drop=None,
                      min_neighbors=3,
                      max_neighbors=10,
                      leaf_size=30,
                      p=2,
                      test_size=0.2,
                      random_state=49,
                      algorithm='auto',
                      text=[]):
    '''
    function to train a nearest neighbors algorithm
    :param many params: used to hyperparameterize the function.
    :return a dictionary object with all of the information for the algorithm.
    '''

    logger("Reading in dataset")

    # Reads in dataset
    # data = pd.read_csv(self.dataset)
    dataReader = DataReader(dataset)
    data = dataReader.data_generator()

    if drop is not None:
        data.drop(drop, axis=1, inplace=True)

    logger("Preprocessing data")
    data, y, remove, full_pipeline = initial_preprocesser(
        data, instruction, preprocess, ca_threshold, text,
        test_size=test_size, random_state=random_state)
    logger("->", "Target column found: {}".format(remove))

    X_train = data['train']
    y_train = y['train']
    X_test = data['test']
    y_test = y['test']

    # classification_column = get_similar_column(getLabelwithInstruction(instruction), data)
    num_classes = len(np.unique(y))

    # encodes the label dataset into numeric classes
    y_vals = np.unique(pd.concat([y['train'], y['test']], axis=0))
    label_mappings = sklearn.preprocessing.LabelEncoder()
    label_mappings.fit(y_vals)

    y_train = label_mappings.transform(y_train)
    y_test = label_mappings.transform(y_test)
    logger("Labels being mapped to appropriate classes")

    models = []
    scores = []
    logger("Fitting nearest neighbors model")
    logger("Identifying optimal number of neighbors")

    # Tries all neighbor possibilities, based on either defaults or user
    # specified values
    num_neighbors = []
    for x in range(min_neighbors, max_neighbors):
        knn = KNeighborsClassifier(
            n_neighbors=x, leaf_size=leaf_size, p=p, algorithm=algorithm)
        knn.fit(X_train, y_train)
        models.append(knn)
        scores.append(accuracy_score(knn.predict(X_test), y_test))
        num_neighbors.append(x)

    logger(
        "->",
        "Optimal number of neighbors found: {}".format(
            num_neighbors[scores.index(max(scores))]))
    logger("->", "Accuracy found on testing set: {}".format(max(scores)))
    logger("Stored model under 'nearest_neighbors' key")

    # select the model with the highest accuracy (indexing by min(scores)
    # here was a bug)
    knn = models[scores.index(max(scores))]

    clearLog()

    return {
        'id': generate_id(),
        "model": knn,
        'num_classes': num_classes,
        "accuracy": {
            'accuracy_score': max(scores),
            'cross_val_score': cross_val_score(knn, X_train, y_train, cv=3)
        },
        "preprocesser": full_pipeline,
        "interpreter": label_mappings,
        'test_data': {
            'X': X_test,
            'y': y_test
        },
        "target": remove
    }

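# Usage sketch (hypothetical inputs). One KNeighborsClassifier is fit per
# value in [min_neighbors, max_neighbors), so narrowing that range is the
# simplest way to cut the search time:
#
#   result = nearest_neighbors(instruction="predict species",
#                              dataset="iris.csv",
#                              min_neighbors=2, max_neighbors=15)
#   print(result['accuracy']['accuracy_score'])
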
def train_svm(instruction,
              dataset=None,
              test_size=0.2,
              kernel='linear',
              text=[],
              preprocess=True,
              ca_threshold=None,
              drop=None,
              cross_val_size=0.3,
              degree=3,
              gamma='scale',
              coef0=0.0,
              max_iter=-1,
              random_state=49):
    '''
    function to train a support vector machine classification algorithm
    :param many params: used to hyperparameterize the function.
    :return a dictionary object with all of the information for the algorithm.
    '''

    logger("Reading in dataset")

    dataReader = DataReader(dataset)
    data = dataReader.data_generator()

    if drop is not None:
        data.drop(drop, axis=1, inplace=True)

    logger("Preprocessing data")
    data, y, target, full_pipeline = initial_preprocesser(
        data, instruction, preprocess, ca_threshold, text,
        test_size=test_size, random_state=random_state)
    logger("->", "Target column found: {}".format(target))

    X_train = data['train']
    y_train = y['train']
    X_test = data['test']
    y_test = y['test']

    # classification_column = get_similar_column(getLabelwithInstruction(instruction), data)
    num_classes = len(np.unique(y))

    # Needed to make a custom label encoder due to train test split changes
    # Can still be inverse transformed, just a bit of extra work
    y_vals = np.unique(pd.concat([y['train'], y['test']], axis=0))
    label_mappings = sklearn.preprocessing.LabelEncoder()
    label_mappings.fit(y_vals)

    y_train = label_mappings.transform(y_train)
    y_test = label_mappings.transform(y_test)

    # Fitting the SVM and storing it in the model dictionary
    logger("Fitting Support Vector Machine")
    clf = svm.SVC(
        kernel=kernel,
        degree=degree,
        gamma=gamma,
        coef0=coef0,
        max_iter=max_iter)
    clf.fit(X_train, y_train)

    score = accuracy_score(clf.predict(X_test), y_test)

    logger("->", "Accuracy found on testing set: {}".format(score))
    logger('->', "Stored model under 'svm' key")

    clearLog()

    return {
        'id': generate_id(),
        "model": clf,
        'num_classes': num_classes,
        "accuracy": {
            'cross_val_score': cross_val_score(clf, X_train, y_train),
            'accuracy_score': score
        },
        "target": target,
        "preprocesser": full_pipeline,
        "interpreter": label_mappings,
        'test_data': {
            'X': X_test,
            'y': y_test
        }
    }

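# Usage sketch (hypothetical inputs). kernel, degree, gamma, and coef0 are
# passed straight through to sklearn's svm.SVC:
#
#   result = train_svm("predict churn", dataset="churn.csv", kernel='rbf')
#   print(result['accuracy']['accuracy_score'])
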
def regression_ann(instruction,
                   callback=False,
                   ca_threshold=None,
                   text=[],
                   dataset=None,
                   drop=None,
                   preprocess=True,
                   test_size=0.2,
                   random_state=49,
                   epochs=50,
                   generate_plots=True,
                   callback_mode='min',
                   maximizer="val_loss",
                   save_model=False,
                   save_path=os.getcwd()):
    '''
    Body of the regression function that is called in the neural network
    query if the data is numerical.
    :param many parameters: used for preprocessing, tuning, plot generation,
        and parameterizing the neural network trained.
    :return dictionary that holds all the information for the finished model.
    '''

    logger("Reading in dataset")

    dataReader = DataReader(dataset)
    data = dataReader.data_generator()
    # data = pd.read_csv(self.dataset)

    if drop is not None:
        data.drop(drop, axis=1, inplace=True)

    data, y, target, full_pipeline = initial_preprocesser(
        data, instruction, preprocess, ca_threshold, text,
        test_size=test_size, random_state=random_state)
    logger("->", "Target column found: {}".format(target))

    X_train = data['train']
    X_test = data['test']

    # Target scaling
    target_scaler = StandardScaler()

    y_train = target_scaler.fit_transform(np.array(y['train']).reshape(-1, 1))
    y_test = target_scaler.transform(np.array(y['test']).reshape(-1, 1))

    logger("Establishing callback function")

    models = []
    losses = []
    model_data = []

    # callback function to store lowest loss value
    es = EarlyStopping(
        monitor=maximizer, mode=callback_mode, verbose=0, patience=5)

    callback_value = None
    if callback is not False:
        callback_value = [es]

    i = 0

    # get the first 3 layer model
    model = get_keras_model_reg(data, i)

    logger("Training initial model")
    history = model.fit(
        X_train,
        y_train,
        epochs=epochs,
        validation_data=(X_test, y_test),
        callbacks=callback_value,
        verbose=0)
    models.append(history)
    model_data.append(model)

    col_name = [[
        "Initial number of layers ", "| Training Loss ", "| Test Loss "
    ]]
    col_width = max(len(word) for row in col_name for word in row) + 2
    for row in col_name:
        print((" " * 2 * counter) + "| " +
              ("".join(word.ljust(col_width) for word in row)) + " |")
    values = []
    values.append(str(len(model.layers)))
    values.append("| " + str(history.history['loss'][-1]))
    values.append("| " + str(history.history['val_loss'][-1]))
    datax = []
    datax.append(values)
    for row in datax:
        print((" " * 2 * counter) + "| " +
              ("".join(word.ljust(col_width) for word in row)) + " |")

    losses.append(history.history[maximizer][-1])

    # keeps running model and fit functions until the validation loss stops
    # decreasing
    logger("Testing number of layers")
    col_name = [[
        "Current number of layers", "| Training Loss", "| Test Loss"
    ]]
    col_width = max(len(word) for row in col_name for word in row) + 2
    for row in col_name:
        print((" " * 2 * counter) + "| " +
              ("".join(word.ljust(col_width) for word in row)) + " |")
    datax = []
    # while all(x > y for x, y in zip(losses, losses[1:])):
    while len(losses) <= 2 or losses[-1] < losses[-2]:
        model = get_keras_model_reg(data, i)
        history = model.fit(
            X_train,
            y_train,
            callbacks=callback_value,
            epochs=epochs,
            validation_data=(X_test, y_test),
            verbose=0)
        model_data.append(model)
        models.append(history)

        values = []
        datax = []
        values.append(str(len(model.layers)))
        values.append("| " + str(history.history['loss'][-1]))
        values.append("| " + str(history.history['val_loss'][-1]))
        datax.append(values)
        for row in datax:
            print((" " * 2 * counter) + "| " +
                  ("".join(word.ljust(col_width) for word in row)) + " |")
        del values, datax
        losses.append(history.history[maximizer][-1])
        i += 1

    # print((" " * 2 * counter) + tabulate(datax, headers=col_name, tablefmt='orgtbl'))

    final_model = model_data[losses.index(min(losses))]
    final_hist = models[losses.index(min(losses))]
    print("")
    logger('->',
           "Best number of layers found: " + str(len(final_model.layers)))
    logger('->', "Training Loss: " + str(final_hist.history['loss'][-1]))
    logger('->', "Test Loss: " + str(final_hist.history['val_loss'][-1]))

    # calls function to generate plots in plot generation
    plots = {}
    if generate_plots:
        init_plots, plot_names = generate_regression_plots(
            models[len(models) - 1], data, y)
        for x in range(len(plot_names)):
            plots[str(plot_names[x])] = init_plots[x]

    if save_model:
        save(final_model, save_model)

    # stores values in the client object models dictionary field
    print("")
    logger("Stored model under 'regression_ANN' key")

    clearLog()

    return {
        'id': generate_id(),
        'model': final_model,
        "target": target,
        "num_classes": 1,
        "plots": plots,
        "preprocesser": full_pipeline,
        "interpreter": target_scaler,
        'test_data': {
            'X': X_test,
            'y': y_test
        },
        'losses': {
            'training_loss': final_hist.history['loss'],
            'val_loss': final_hist.history['val_loss']
        }
    }

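# Usage sketch (hypothetical inputs). The 'interpreter' key holds the fitted
# StandardScaler for the target, so predictions come back in the original
# units via inverse_transform:
#
#   result = regression_ann("predict price", dataset="housing.csv", epochs=30)
#   scaled = result['model'].predict(result['test_data']['X'])
#   prices = result['interpreter'].inverse_transform(scaled)
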
def classification_ann(instruction,
                       callback=False,
                       dataset=None,
                       text=[],
                       ca_threshold=None,
                       preprocess=True,
                       callback_mode='min',
                       drop=None,
                       random_state=49,
                       test_size=0.2,
                       epochs=50,
                       generate_plots=True,
                       maximizer="val_accuracy",
                       save_model=False,
                       save_path=os.getcwd()):
    '''
    Body of the classification function that is called in the neural network
    query if the data is categorical.
    :param many parameters: used for preprocessing, tuning, plot generation,
        and parameterizing the neural network trained.
    :return dictionary that holds all the information for the finished model.
    '''

    logger("Reading in dataset")

    dataReader = DataReader(dataset)
    data = dataReader.data_generator()

    if drop is not None:
        data.drop(drop, axis=1, inplace=True)

    data, y, remove, full_pipeline = initial_preprocesser(
        data, instruction, preprocess, ca_threshold, text,
        test_size=test_size, random_state=random_state)
    logger("->", "Target column found: {}".format(remove))

    # Needed to make a custom label encoder due to train test split changes
    # Can still be inverse transformed, just a bit of extra work
    y = pd.concat([y['train'], y['test']], axis=0)

    num_classes = len(np.unique(y))

    X_train = data['train']
    X_test = data['test']

    one_hot_encoder = None  # guard so the return below never hits a NameError
    if num_classes > 2:
        # ANN needs target one hot encoded for classification
        one_hot_encoder = OneHotEncoder()
        y = pd.DataFrame(
            one_hot_encoder.fit_transform(
                np.reshape(y.values, (-1, 1))).toarray(),
            columns=one_hot_encoder.get_feature_names())

    y_train = y.iloc[:len(X_train)]
    y_test = y.iloc[len(X_train):]

    models = []
    losses = []
    accuracies = []
    model_data = []

    logger("Establishing callback function")

    # early stopping callback
    es = EarlyStopping(monitor=maximizer, mode='max', verbose=0, patience=5)

    callback_value = None
    if callback is not False:
        callback_value = [es]

    i = 0
    model = get_keras_model_class(data, i, num_classes)
    logger("Training initial model")

    history = model.fit(
        X_train,
        y_train,
        callbacks=callback_value,
        epochs=epochs,
        validation_data=(X_test, y_test),
        verbose=0)

    model_data.append(model)
    models.append(history)

    col_name = [[
        "Initial number of layers ", "| Training Accuracy ",
        "| Test Accuracy "
    ]]
    col_width = max(len(word) for row in col_name for word in row) + 2
    for row in col_name:
        print((" " * 2 * counter) + "| " +
              ("".join(word.ljust(col_width) for word in row)) + " |")
    values = []
    values.append(str(len(model.layers)))
    values.append("| " + str(history.history['accuracy'][-1]))
    values.append("| " + str(history.history['val_accuracy'][-1]))
    datax = []
    datax.append(values)
    for row in datax:
        print((" " * 2 * counter) + "| " +
              ("".join(word.ljust(col_width) for word in row)) + " |")
    # print((" " * 2 * counter) + tabulate(datax, headers=col_name, tablefmt='orgtbl'))

    losses.append(history.history[maximizer][-1])
    accuracies.append(history.history['val_accuracy'][-1])

    # keeps running model and fit functions until the validation accuracy
    # stops improving
    logger("Testing number of layers")
    col_name = [[
        "Current number of layers", "| Training Accuracy", "| Test Accuracy"
    ]]
    col_width = max(len(word) for row in col_name for word in row) + 2
    for row in col_name:
        print((" " * 2 * counter) + "| " +
              ("".join(word.ljust(col_width) for word in row)) + " |")
    datax = []
    # while all(x < y for x, y in zip(accuracies, accuracies[1:])):
    while len(accuracies) <= 2 or accuracies[-1] > accuracies[-2]:
        model = get_keras_model_class(data, i, num_classes)
        history = model.fit(
            X_train,
            y_train,
            callbacks=callback_value,
            epochs=epochs,
            validation_data=(X_test, y_test),
            verbose=0)

        values = []
        datax = []
        values.append(str(len(model.layers)))
        values.append("| " + str(history.history['accuracy'][-1]))
        values.append("| " + str(history.history['val_accuracy'][-1]))
        datax.append(values)
        for row in datax:
            print((" " * 2 * counter) + "| " +
                  ("".join(word.ljust(col_width) for word in row)) + " |")
        del values, datax

        losses.append(history.history[maximizer][-1])
        accuracies.append(history.history['val_accuracy'][-1])
        models.append(history)
        model_data.append(model)
        i += 1

    # print((" " * 2 * counter) + tabulate(datax, headers=col_name, tablefmt='orgtbl'))

    final_model = model_data[accuracies.index(max(accuracies))]
    final_hist = models[accuracies.index(max(accuracies))]

    print("")
    logger('->',
           "Best number of layers found: " + str(len(final_model.layers)))
    logger('->',
           "Training Accuracy: " + str(final_hist.history['accuracy'][-1]))
    logger('->',
           "Test Accuracy: " + str(final_hist.history['val_accuracy'][-1]))

    # generates appropriate classification plots by feeding all information
    plots = {}
    if generate_plots:
        plots = generate_classification_plots(
            models[len(models) - 1], data, y, model, X_test, y_test)

    if save_model:
        save(final_model, save_model)

    print("")
    logger("Stored model under 'classification_ANN' key")

    clearLog()

    # stores the values and plots into the object dictionary
    return {
        'id': generate_id(),
        "model": final_model,
        'num_classes': num_classes,
        "plots": plots,
        "target": remove,
        "preprocesser": full_pipeline,
        "interpreter": one_hot_encoder,
        'test_data': {
            'X': X_test,
            'y': y_test
        },
        'losses': {
            'training_loss': final_hist.history['loss'],
            'val_loss': final_hist.history['val_loss']
        },
        'accuracy': {
            'training_accuracy': final_hist.history['accuracy'],
            'validation_accuracy': final_hist.history['val_accuracy']
        }
    }

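# Usage sketch (hypothetical inputs). For more than two classes the
# 'interpreter' key holds the fitted OneHotEncoder (None for binary targets,
# per the guard above):
#
#   result = classification_ann("predict category", dataset="reviews.csv",
#                               epochs=25, callback=True)
#   print(result['accuracy']['validation_accuracy'][-1])
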
def dimensionality_ICA(instruction, dataset, target="", y=""):
    global counter

    dataReader = DataReader(dataset)

    if target == "":
        data = dataReader.data_generator()
        data.fillna(0, inplace=True)
        remove = get_similar_column(get_value_instruction(instruction), data)
        data, y, target, full_pipeline = initial_preprocesser(
            data, instruction, True, 0.2, [], 0.2, random_state=49)

    X_train = data['train']
    X_test = data['test']

    y_train = y['train']
    y_test = y['test']

    ica = FastICA(n_components=len(X_train.columns))
    X_train_mod = ica.fit_transform(X_train)
    # transform (not fit_transform) so the test set uses the fitted components
    X_test_mod = ica.transform(X_test)

    clf = tree.DecisionTreeClassifier()
    clf.fit(X_train, y_train)

    clf_mod = tree.DecisionTreeClassifier()
    clf_mod.fit(X_train_mod, y_train)

    acc = []
    sets = []
    acc.append(accuracy_score(clf_mod.predict(X_test_mod), y_test))

    # pd.concat replaces the deprecated DataFrame.append
    frame = pd.concat(
        [pd.DataFrame(X_train_mod), pd.DataFrame(X_test_mod)], axis=0)
    frame[target] = np.r_[y_train, y_test]
    sets.append(frame)

    for i in range(2, len(X_train.columns)):
        ica = FastICA(n_components=i)
        X_train_mod = ica.fit_transform(X_train)
        X_test_mod = ica.transform(X_test)

        frame = pd.concat(
            [pd.DataFrame(X_train_mod), pd.DataFrame(X_test_mod)], axis=0)
        frame[target] = np.r_[y_train, y_test]
        sets.append(frame)

        clf_mod = tree.DecisionTreeClassifier()
        clf_mod.fit(X_train_mod, y_train)
        acc.append(accuracy_score(clf_mod.predict(X_test_mod), y_test))
    del i

    data_modified = sets[acc.index(max(acc))]

    score = max(acc)

    return data_modified, score, (
        (len(X_train.columns) + 1) - len(data_modified.columns))

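# Usage sketch (hypothetical inputs). dimensionality_ICA returns the best
# reduced dataset, its decision tree accuracy, and the number of columns
# removed:
#
#   reduced_data, best_acc, n_removed = dimensionality_ICA(
#       "predict outcome", "patients.csv")
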