def get_target_values(data, instruction, yLabel):
    # Get the target column named in the instruction and split it off
    target = get_similar_column(get_value_instruction(instruction), data)
    X = data[target]
    del data[target]

    # Labels come from the column that best matches yLabel
    Y = data[get_similar_column(get_value_instruction(yLabel), data)]
    return X, Y
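# Usage sketch (hypothetical data; assumes get_similar_column /
# get_value_instruction resolve fuzzy natural-language column references,
# as elsewhere in this module):
#
#   df = pd.DataFrame({"price": [1.0, 2.0], "city": ["a", "b"], "sqft": [700, 900]})
#   X, Y = get_target_values(df, "predict price", "city")
#   # X -> the "price" column; Y -> the "city" column; "price" is dropped from df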
def dimensionality_KPCA(instruction, dataset, target="", y=""): global currLog global counter dataReader = DataReader("./data/" + get_last_file()[0]) if target == "": data = dataReader.data_generator() data.fillna(0, inplace=True) remove = get_similar_column(get_value_instruction(instruction), data) y = data[remove] del data[remove] le = preprocessing.LabelEncoder() y = le.fit_transform(y) kpca = KernelPCA(n_components=len(dataset.columns), kernel="rbf") data_modified = kpca.fit_transform(dataset) X_train, X_test, y_train, y_test = train_test_split(dataset, y, test_size=0.2, random_state=49) X_train_mod, X_test_mod, y_train_mod, y_test_mod = train_test_split( data_modified, y, test_size=0.2, random_state=49) clf = tree.DecisionTreeClassifier() clf.fit(X_train, y_train) clf_mod = tree.DecisionTreeClassifier() clf_mod.fit(X_train_mod, y_train_mod) acc = [] acc.append(accuracy_score(clf_mod.predict(X_test_mod), y_test_mod)) for i, j in product(range(3, 10), ["entropy", "gini"]): model = tree.DecisionTreeClassifier(criterion=j, max_depth=i) model = model.fit(X_train_mod, y_train_mod) acc.append(accuracy_score(model.predict(X_test_mod), y_test)) del i, j data_modified = pd.DataFrame(data_modified) data_modified[target] = np.r_[y_train, y_test] # data_modified.to_csv("./data/housingPCA.csv") return data_modified, accuracy_score( clf.predict(X_test), y_test), max(acc), (len(dataset.columns) - len(data_modified.columns)) def booster(dataset, obj): #obj=["reg:linear","multi:softmax "] X_train, X_test, y_train, y_test = train_test_split(dataset, y, test_size=0.2, random_state=49) clf = XGBClassifier(objective=obj, learning_rate=0.1, silent=1, alpha=10) clf.fit(X_train, y_train) return accuracy_score(clf.predict(X_test_mod), y_test_mod)
def dimensionality_RF(instruction, dataset, target="", y="", n_features=10): global currLog global counter dataReader = DataReader("./data/" + get_last_file()[0]) if target == "": data = dataReader.data_generator() data.fillna(0, inplace=True) remove = get_similar_column(get_value_instruction(instruction), data) data = structured_preprocesser(data) y = data[remove] del data[remove] le = preprocessing.LabelEncoder() y = le.fit_transform(y) X_train, X_test, y_train, y_test = train_test_split(dataset, y, test_size=0.2, random_state=49) first_classifier = tree.DecisionTreeClassifier() first_classifier.fit(X_train, y_train) first_classifier_acc = accuracy_score(first_classifier.predict(X_test), y_test) accuracy_scores = [first_classifier_acc] columns = [] datas = [] datas.append(dataset) columns.append([]) for i, x in product(range(3, 10), range(4, len(dataset.columns))): feature_model = RandomForestRegressor(random_state=1, max_depth=i) feature_model.fit(X_train, y_train) importances = feature_model.feature_importances_ indices = np.argsort(importances)[-x:] columns.append(dataset.columns[indices]) X_temp_train = X_train[dataset.columns[indices]] X_temp_test = X_test[dataset.columns[indices]] val = pd.DataFrame(np.r_[X_temp_train, X_temp_test]) val[target] = np.r_[y_train, y_test] datas.append(val) vr = tree.DecisionTreeClassifier() vr.fit(X_temp_train, y_train) accuracy_scores.append(accuracy_score(vr.predict(X_temp_test), y_test)) the_index = accuracy_scores.index(max(accuracy_scores)) return datas[the_index], accuracy_scores[0], max(accuracy_scores), list( columns[the_index])
def neural_network_query(self,
                         instruction,
                         mca_threshold=None,
                         drop=None,
                         preprocess=True,
                         test_size=0.2,
                         random_state=49,
                         epochs=50,
                         generate_plots=True,
                         callback_mode='min',
                         maximizer="val_loss",
                         save_model=False,
                         save_path=os.getcwd()):

    data = pd.read_csv(self.dataset)

    if preprocess:
        remove = get_similar_column(get_value_instruction(instruction), data)

        # An object-typed target implies classification: track val_accuracy
        # and flip the callback to maximize
        if data[remove].dtype.name == 'object':
            callback_mode = 'max'
            maximizer = "val_accuracy"
            self.classification_query_ann(
                instruction,
                mca_threshold=mca_threshold,
                preprocess=preprocess,
                test_size=test_size,
                random_state=random_state,
                epochs=epochs,
                generate_plots=generate_plots,
                callback_mode=callback_mode,
                maximizer=maximizer,
                save_model=save_model,
                save_path=save_path)
        else:
            self.regression_query_ann(
                instruction,
                mca_threshold=mca_threshold,
                preprocess=preprocess,
                test_size=test_size,
                random_state=random_state,
                epochs=epochs,
                generate_plots=generate_plots,
                callback_mode=callback_mode,
                maximizer=maximizer,
                drop=drop,
                save_model=save_model,
                save_path=save_path)
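# Dispatch sketch (hypothetical `client` object whose dataset csv contains a
# column resolvable from the instruction):
#
#   client.neural_network_query("predict median house value", epochs=20)
#   # object-dtype target -> classification_query_ann, maximizing val_accuracy
#   # numeric target      -> regression_query_ann, minimizing val_loss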
def initial_preprocesser(data, instruction, preprocess, mca_threshold):
    # Scan object columns in case a datetime column wasn't auto-detected
    object_columns = [
        col for col, col_type in data.dtypes.items() if col_type == 'object'
    ]

    # Handle dates without timestamps
    for col in object_columns:
        try:
            data[col] = pd.to_datetime(data[col], infer_datetime_format=True)
        except ValueError:
            pass

    # Get the target column
    target = get_similar_column(get_value_instruction(instruction), data)
    y = data[target]

    # Remove rows where the target is NaN
    data = data[y.notna()]
    y = y[y.notna()]

    del data[target]

    X_train, X_test, y_train, y_test = train_test_split(
        data, y, test_size=0.2, random_state=49)

    data = {
        'train': X_train,
        'test': X_test
    }

    # Preprocess the dataset; data is now a dict of splits, so fill each
    # split separately in the unpreprocessed path
    full_pipeline = None
    if preprocess:
        data, full_pipeline = structured_preprocesser(data, mca_threshold)
    else:
        data['train'] = data['train'].fillna(0)
        data['test'] = data['test'].fillna(0)

    y = {'train': y_train, 'test': y_test}

    return data, y, target, full_pipeline
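# Shape of the return values (hypothetical call):
#
#   data, y, target, pipeline = initial_preprocesser(
#       df, "predict price", preprocess=True, mca_threshold=0.5)
#   # data     -> {'train': DataFrame, 'test': DataFrame}
#   # y        -> {'train': Series,    'test': Series}
#   # pipeline -> whatever structured_preprocesser fitted (None when
#   #             preprocess=False)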
def instruction_identifier(params):
    # Resolve the target column from the instruction and split it into y
    remove = get_similar_column(get_value_instruction(params['instruction']),
                                params['data'])
    params['y'] = params['data'][remove]
    del params['data'][remove]
def dimensionality_reduc(instruction,
                         dataset,
                         arr=["RF", "PCA", "KPCA", "ICA"],
                         inplace=False):
    global currLog
    global counter

    dataReader = DataReader(dataset)

    logger("loading dataset...")
    data = dataReader.data_generator()
    data.fillna(0, inplace=True)

    logger("getting most similar column from instruction...")
    target = get_similar_column(get_value_instruction(instruction), data)

    y = data[target]
    del data[target]
    le = preprocessing.LabelEncoder()
    y = le.fit_transform(y)

    data = structured_preprocesser(data)

    perms = []
    overall_storage = []
    finals = []

    logger("generating dimensionality permutations...")
    for i in range(1, len(arr) + 1):
        for elem in list(permutations(arr, i)):
            perms.append(elem)

    logger("running each possible permutation...")
    logger("realigning tensors...")
    for path in perms:
        currSet = data
        for element in path:
            if element == "RF":
                data_mod, beg_acc, final_acc, col_removed = dimensionality_RF(
                    instruction, currSet, target, y)
            elif element == "PCA":
                data_mod, beg_acc, final_acc, col_removed = dimensionality_PCA(
                    instruction, currSet, target, y)
            elif element == "KPCA":
                data_mod, beg_acc, final_acc, col_removed = dimensionality_KPCA(
                    instruction, currSet, target, y)
            elif element == "ICA":
                data_mod, beg_acc, final_acc, col_removed = dimensionality_ICA(
                    instruction, currSet, target, y)
            overall_storage.append(
                list([data_mod, beg_acc, final_acc, col_removed]))
            currSet = data_mod
        finals.append(overall_storage[len(overall_storage) - 1])

    logger("Fetching Best Accuracies...")
    accs = []
    print("")
    print("Baseline Accuracy: " + str(finals[0][1]))
    print("----------------------------")
    # Pair each permutation with the final accuracy of its pipeline
    for i, element in enumerate(finals):
        print("Permutation --> " + str(perms[i]) +
              " | Final Accuracy --> " + str(element[2]))
        if finals[0][1] < element[2]:
            accs.append(list([
                "Permutation --> " + str(perms[i]) +
                " | Final Accuracy --> " + str(element[2])
            ]))

    print("")
    print("Best Accuracies")
    print("----------------------------")
    for element in accs:
        print(element)

    if inplace:
        data.to_csv(dataset)
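# Usage sketch (hypothetical csv path; tries every ordering of the listed
# reducers and prints which pipelines beat the baseline accuracy):
#
#   dimensionality_reduc("predict median house value",
#                        "./data/housing.csv", arr=["RF", "PCA"])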