def preprocess_data_for_arm(gender_bias=False):
    data = ic.separateImport()
    data = procd.fillData(data, fill_method='none', exclude_col=False)

    # bin the continuous columns into labelled groups
    num_age_groups = 2  # min is 29 and max is 77
    data["age"] = pd.cut(data["age"], num_age_groups,
                         labels=[str(i) for i in range(num_age_groups)])

    num_trestbps_groups = 3  # min is 94 and max is 200
    data["trestbps"] = pd.cut(data["trestbps"], num_trestbps_groups,
                              labels=[str(i) for i in range(num_trestbps_groups)])

    num_chol_groups = 4  # min is 126 and max is 564
    data["chol"] = pd.cut(data["chol"], num_chol_groups,
                          labels=[str(i) for i in range(num_chol_groups)])

    num_oldpeak_groups = 4  # min is 0 and max is 6.2
    data["oldpeak"] = pd.cut(data["oldpeak"], num_oldpeak_groups,
                             labels=[str(i) for i in range(num_oldpeak_groups)])

    num_thalach_groups = 3  # min is 71 and max is 202
    data["thalach"] = pd.cut(data["thalach"], num_thalach_groups,
                             labels=[str(i) for i in range(num_thalach_groups)])

    # prefix every value (except the prediction column) with its column name so that
    # each value becomes a unique, more readable item for rule mining
    for label in ic.LABELS[:-1]:
        data[label] = label + " " + data[label].astype(str)

    # no need to consider the extent of heart disease - we only want its presence
    data['prediction'] = [
        "no heart disease" if x == 0 else "heart disease"
        for x in data['prediction']
    ]

    if gender_bias:
        data = data.drop(columns=['sex'])
    return data
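
# Illustrative sketch (not part of the original pipeline): one way the itemised frame
# returned by preprocess_data_for_arm() could be mined for rules, assuming the
# third-party mlxtend library is installed. The project's own ARM module may work
# differently, so treat this purely as a usage example.
def example_mine_rules_with_mlxtend(min_support=0.3, min_confidence=0.7):
    from mlxtend.preprocessing import TransactionEncoder
    from mlxtend.frequent_patterns import apriori, association_rules

    data = preprocess_data_for_arm()
    # each row becomes a transaction: a list of items such as "age 1", "chol 2", ...
    transactions = data.astype(str).values.tolist()
    te = TransactionEncoder()
    onehot = pd.DataFrame(te.fit(transactions).transform(transactions),
                          columns=te.columns_)
    frequent_itemsets = apriori(onehot, min_support=min_support, use_colnames=True)
    # keep high-confidence rules, ranked by confidence
    rules = association_rules(frequent_itemsets, metric="confidence",
                              min_threshold=min_confidence)
    return rules.sort_values("confidence", ascending=False)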
                               scoring=scorers,
                               refit=refit_score,
                               cv=skf,
                               return_train_score=True,
                               n_jobs=-1)
    grid_search.fit(trainX, trainY)

    # make the predictions
    y_pred = grid_search.predict(testX)

    # print the optimal parameters for the refit criterion
    print('Best params for ', refit_score, ': ', grid_search.best_params_)

    # collect the cross-validation results, sorted by mean test precision
    results = pd.DataFrame(grid_search.cv_results_)
    results = results.sort_values(by='mean_test_precision_score', ascending=False)
    return results, grid_search.best_params_


# unit test code
if __name__ == '__main__':
    data = ic.separateImport()
    # fill_method accepts 'median', 'mode', or 'mean' to fill missing values with the
    # median, mode, or mean of each column
    data = procd.fillData(data, fill_method="median")
    testX, testY, trainX, trainY = procd.createTrainingSet(data)
    res, best_params = gridSearchWrapper(testX, testY, trainX, trainY)
    randomForestClassify(testX, testY, trainX, trainY, best_params)
    print(res)
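
# The head of gridSearchWrapper (estimator, parameter grid, and scorer definitions) is
# not shown above. The sketch below only illustrates the kind of setup those keyword
# arguments imply; the actual estimator, grid, scorers, and fold count used in the
# project may differ.
def example_grid_search_setup(trainX, trainY, refit_score='precision_score'):
    from sklearn.ensemble import RandomForestClassifier
    from sklearn.metrics import make_scorer, accuracy_score, precision_score, recall_score
    from sklearn.model_selection import GridSearchCV, StratifiedKFold

    scorers = {
        'precision_score': make_scorer(precision_score),
        'recall_score': make_scorer(recall_score),
        'accuracy_score': make_scorer(accuracy_score),
    }
    param_grid = {  # assumed grid, for illustration only
        'n_estimators': [10, 50, 100],
        'max_depth': [4, 6, 8],
        'min_samples_split': [2, 8, 14],
    }
    skf = StratifiedKFold(n_splits=5)
    grid_search = GridSearchCV(RandomForestClassifier(),
                               param_grid,
                               scoring=scorers,
                               refit=refit_score,
                               cv=skf,
                               return_train_score=True,
                               n_jobs=-1)
    return grid_search.fit(trainX, trainY)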
def perform_dbscan():
    # import the data table
    data = ic.separateImport()
    data = procd.fillData(data, fill_method='median')

    # several rows of cholesterol data show a value of 0 and need to be removed
    empty_indices = []
    for i in range(data.shape[0]):
        if data['chol'][i] == 0:
            empty_indices.append(i)
    data = data.drop(data.index[empty_indices])

    # partition data into features and prediction
    X_data, Y_data = preprocessing.createFullSet(data)

    # standardise the features
    X = (X_data - np.mean(X_data, axis=0)) / np.std(X_data, axis=0)

    # perform DBSCAN with eps = 2.4 and min_samples = 7
    db = DBSCAN(eps=2.4, min_samples=7).fit(X)
    core_samples_mask = np.zeros_like(db.labels_, dtype=bool)
    core_samples_mask[db.core_sample_indices_] = True
    labels = db.labels_

    # number of clusters, ignoring noise points (label -1)
    n_clusters_ = len(set(labels)) - (1 if -1 in labels else 0)

    # print cluster sizes
    unique, counts = np.unique(labels, return_counts=True)
    print("Number of data points in clusters")
    print(dict(zip(unique, counts)))

    # visualise the clusters
    import visualise
    visualise.parallelVisualise(data, labels, ('black', 'blue', 'red', 'green'),
                                'dbscan')

    # display a histogram of every cluster for every dimension
    import matplotlib.pyplot as plt
    zero_indices = np.where(labels == 0)
    one_indices = np.where(labels == 1)
    two_indices = np.where(labels == 2)

    for i in range(1, 10):
        curr_data = X_data[:, i]

        plt.figure()
        plt.hist(curr_data[zero_indices], bins='auto', color='blue')  # arguments are passed to np.histogram
        plt.title(list(data)[i] + ' cluster Zero')
        plt.savefig('./figures/' + list(data)[i] + '-cluster Zero')

        plt.figure()
        plt.hist(curr_data[one_indices], bins='auto', color='orange')
        plt.title(list(data)[i] + ' cluster One')
        plt.savefig('./figures/' + list(data)[i] + '-cluster One')

        plt.figure()
        plt.hist(curr_data[two_indices], bins='auto', color='green')
        plt.title(list(data)[i] + ' cluster Two')
        plt.savefig('./figures/' + list(data)[i] + '-cluster Two')

    # histograms of the prediction column for each cluster
    plt.figure()
    plt.hist(Y_data[zero_indices], bins='auto', color='blue')
    plt.title('prediction cluster Zero')
    plt.savefig('./figures/prediction cluster Zero')

    plt.figure()
    plt.hist(Y_data[one_indices], bins='auto', color='orange')
    plt.title('prediction cluster One')
    plt.savefig('./figures/prediction cluster One')

    plt.figure()
    plt.hist(Y_data[two_indices], bins='auto', color='green')
    plt.title('prediction cluster Two')
    plt.savefig('./figures/prediction cluster Two')
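
# A common way to sanity-check an eps value like the 2.4 used above is the k-distance
# plot: sort each point's distance to its k-th nearest neighbour and look for the
# "elbow". This sketch is illustrative only; it assumes the same standardised matrix X
# and sklearn's NearestNeighbors, and is not necessarily how eps was chosen here.
def example_k_distance_plot(X, k=7):
    import matplotlib.pyplot as plt
    from sklearn.neighbors import NearestNeighbors

    neigh = NearestNeighbors(n_neighbors=k).fit(X)
    distances, _ = neigh.kneighbors(X)      # distances[:, -1] is the k-th NN distance
    k_distances = np.sort(distances[:, -1])

    plt.figure()
    plt.plot(k_distances)
    plt.xlabel('points sorted by distance')
    plt.ylabel('distance to %d-th nearest neighbour' % k)
    plt.title('k-distance plot for choosing DBSCAN eps')
    plt.savefig('./figures/k-distance plot')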
                                                  random_state=42,
                                                  stratify=Y_data)

    # previous manual shuffle-and-split implementation, kept for reference:
    # train_input = data.values
    # X_data, Y_dataNum = train_input[:, :-1], train_input[:, -1]
    # Y_dataNum = [isPositive(x) for x in Y_dataNum]
    # seed = 123
    # np.random.seed(seed)
    # idx = np.arange(X_data.shape[0])
    # np.random.shuffle(idx)
    # Y_data = np.array(Y_dataNum)
    # X_data = X_data[idx]
    # Y_data = Y_data[idx]
    # m = 3 * X_data.shape[0] // 10
    # testX, testY = X_data[:m], Y_data[:m]
    # trainX, trainY = X_data[m:], Y_data[m:]

    return testX, testY, trainX, trainY


def isPositive(x):
    # collapse the 0-4 disease severity scale into a binary label
    return 1 if x > 0 else 0


if __name__ == '__main__':
    datadict = ic.separateImport()
    data = fillData(datadict, fill_method='median')
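
# The start of createTrainingSet is not shown above; its tail closes a call to
# sklearn's train_test_split. The sketch below is a self-contained version of the same
# idea (binary labels via isPositive, 30% stratified hold-out); the project's actual
# column handling and split may differ.
def example_stratified_split(data):
    import numpy as np
    from sklearn.model_selection import train_test_split

    values = data.values
    X_data, Y_raw = values[:, :-1], values[:, -1]
    Y_data = np.array([isPositive(x) for x in Y_raw])  # presence/absence of disease
    m = 3 * X_data.shape[0] // 10                      # 30% of the rows for testing
    trainX, testX, trainY, testY = train_test_split(
        X_data, Y_data, test_size=m, random_state=42, stratify=Y_data)
    return testX, testY, trainX, trainY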
def reduceDimenTest(extramodelNames=''):
    FILL_METHODS = ["mean", "median", "mode", "none"]
    # search space, where n_components is the dimensionality of the processed dataset
    NUM_COMPONENTS = [4, 8, 10]
    processedResults = defaultdict(lambda: [])

    for n_components in NUM_COMPONENTS:
        for filling in FILL_METHODS:
            print("**********************************now at ", filling, n_components)
            # fill_method accepts 'median', 'mode', or 'mean' to fill missing values
            # with the median, mode, or mean of each column
            data = ic.separateImport()
            data = procd.fillData(data, fill_method=filling)
            X_data, Y_data = preprocessing.createFullSet(data)
            X_data, pca, ss = preprocessing.performPCA(X_data, n_components)

            m = 3 * X_data.shape[0] // 10
            trainX, testX, trainY, testY = train_test_split(X_data,
                                                            Y_data,
                                                            test_size=m,
                                                            random_state=42,
                                                            stratify=Y_data)
            print("training set size: ", trainX.shape[0],
                  " test set size: ", testX.shape[0])

            predictions = []
            methods = []

            # min_sup = 0  # set to an appropriate minimum support
            # associateRuleMiningPredictions = arm.generate_rules(min_sup)
            # print("Associate Rule Mining Predictions", associateRuleMiningPredictions)

            nnPredictions = nn.neuralNet(testX, testY, trainX, trainY,
                                         useTrainedModel=True,
                                         modelName=filling + str(n_components) + extramodelNames)
            predictions.append(nnPredictions)
            methods.append("nnPredictions")

            bayesPredictions = bayesian.naiveBayes(testX, testY, trainX, trainY)
            predictions.append(bayesPredictions)
            methods.append("bayesPredictions")

            # clf is the grid-search model used to find the best parameters for the SVM.
            # It uses k-fold cross-validation internally; print(clf.best_params_) shows
            # which parameters were chosen for this model.
            svmPredictions, clf = svm.svmPredict(testX, testY, trainX, trainY,
                                                 filling + str(n_components) + extramodelNames,
                                                 gridSearch=False)
            predictions.append(svmPredictions)
            methods.append("SVMpredictions")

            # best hyperparameters precalculated with the grid search wrapper to save time
            # res, best_params = dt.gridSearchWrapper(testX, testY, trainX, trainY)
            best_params = {
                'n_estimators': 10,
                'max_depth': 6,
                'min_samples_split': 14
            }
            randforestPred = dt.randomForestClassify(testX, testY, trainX, trainY,
                                                     best_params)
            predictions.append(randforestPred)
            methods.append("randforest")

            # ensemble method using a simple majority vote over all the classifiers
            ensemblePred = []
            for result in zip(*[item.tolist() for item in predictions]):
                ensemblePred.append(max(set(result), key=result.count))
            predictions.append(ensemblePred)
            methods.append("Ensemble")

            for prediction, labels in zip(predictions, methods):
                result = processResults(prediction, testY, filling, labels)
                result["n_components"] = n_components
                result["filling"] = filling
                processedResults[labels].append(result)

    generateGraphsSingle(processedResults, FILL_METHODS)
    generateGraphs(processedResults, FILL_METHODS)
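
# processResults is defined elsewhere in the project and is not shown here. The sketch
# below only illustrates the kind of per-classifier summary it could return, assuming
# sklearn.metrics; the real implementation may track different fields.
def example_process_results(prediction, testY, filling, method_name):
    from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score

    return {
        "method": method_name,
        "filling": filling,
        "accuracy": accuracy_score(testY, prediction),
        "precision": precision_score(testY, prediction),
        "recall": recall_score(testY, prediction),
        "f1": f1_score(testY, prediction),
    }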