Example #1
# gender_bias: if True, the 'sex' column is dropped; the default value here is an assumption
def preprocess_data_for_arm(gender_bias=False):

    data = ic.separateImport()
    data = procd.fillData(data, fill_method='none', exclude_col=False)

    num_age_groups = 2  #min is 29 and max is 77
    data["age"] = pd.cut(data["age"],
                         num_age_groups,
                         labels=[str(i) for i in range(num_age_groups)])

    num_trestbps_groups = 3  #min is 94 and max is 200
    data["trestbps"] = pd.cut(
        data["trestbps"],
        num_trestbps_groups,
        labels=[str(i) for i in range(num_trestbps_groups)])

    num_chol_groups = 4  #min is 126 and max is 564
    data["chol"] = pd.cut(data["chol"],
                          num_chol_groups,
                          labels=[str(i) for i in range(num_chol_groups)])

    num_oldpeak_groups = 4  #min is 0 and max is 6.2
    data["oldpeak"] = pd.cut(
        data["oldpeak"],
        num_oldpeak_groups,
        labels=[str(i) for i in range(num_oldpeak_groups)])

    num_thalach_groups = 3  #min is 71 and max is 202
    data["thalach"] = pd.cut(
        data["thalach"],
        num_thalach_groups,
        labels=[str(i) for i in range(num_thalach_groups)])

    # prefix each value with its column name (except the prediction column) so every item is unique and more readable
    for label in ic.LABELS[:-1]:
        data[label] = label + " " + data[label].astype(str)

    # no need to consider extent of heart disease - we just want the presence of heart disease
    data['prediction'] = [
        "no heart disease" if x == 0 else "heart disease"
        for x in data['prediction']
    ]
    if gender_bias:
        data = data.drop(columns=['sex'])
    return data
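
The discretised, string-prefixed rows above are meant to be consumed as transactions by an association-rule miner. Below is a minimal sketch of that step, assuming mlxtend as the miner; the project itself appears to use its own arm module (see Example #5), and mine_rules, the thresholds and the transaction encoding are illustrative assumptions, not the project's code.

import pandas as pd
from mlxtend.preprocessing import TransactionEncoder
from mlxtend.frequent_patterns import apriori, association_rules

def mine_rules(data, min_support=0.3, min_confidence=0.7):
    # each row of the discretised table becomes one transaction of "column value" items
    transactions = data.astype(str).values.tolist()
    te = TransactionEncoder()
    onehot = pd.DataFrame(te.fit(transactions).transform(transactions),
                          columns=te.columns_)
    frequent_itemsets = apriori(onehot, min_support=min_support, use_colnames=True)
    return association_rules(frequent_itemsets, metric="confidence",
                             min_threshold=min_confidence)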
Example #2
                               scoring=scorers,
                               refit=refit_score,
                               cv=skf,
                               return_train_score=True,
                               n_jobs=-1)
    grid_search.fit(trainX, trainY)

    # make the predictions
    y_pred = grid_search.predict(testX)
    # Prints out the optimal parameters for the criterion stated
    print('Best params for ', refit_score, ': ', grid_search.best_params_)

    # collect cross-validation results and sort by test precision
    results = pd.DataFrame(grid_search.cv_results_)
    results = results.sort_values(by='mean_test_precision_score',
                                  ascending=False)
    return results, grid_search.best_params_


# unit test code
if __name__ == '__main__':
    data = ic.separateImport()
    data = procd.fillData(data, fill_method="median")
    # fill_method accepts 'median', 'mode', or 'mean' to fill missing values with that statistic

    testX, testY, trainX, trainY = procd.createTrainingSet(data)
    res, best_params = gridSearchWrapper(testX, testY, trainX, trainY)

    randomForestClassify(testX, testY, trainX, trainY, best_params)
    print(res)
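
The top of this snippet is cut off, so the construction of grid_search, scorers and skf is not shown. The sketch below is a plausible minimal setup consistent with the names used above; the random-forest estimator and the parameter grid are assumptions (loosely echoing the best_params seen in Example #5), not the project's actual values.

from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score, make_scorer, precision_score, recall_score
from sklearn.model_selection import GridSearchCV, StratifiedKFold

# per-metric scorers; 'precision_score' matches the 'mean_test_precision_score'
# column that the results are sorted by above
scorers = {
    'precision_score': make_scorer(precision_score),
    'recall_score': make_scorer(recall_score),
    'accuracy_score': make_scorer(accuracy_score),
}
skf = StratifiedKFold(n_splits=5)
grid_search = GridSearchCV(RandomForestClassifier(),
                           param_grid={'n_estimators': [10, 50, 100],
                                       'max_depth': [4, 6, 8],
                                       'min_samples_split': [2, 10, 14]},
                           scoring=scorers,
                           refit='precision_score',  # the role played by refit_score above
                           cv=skf,
                           return_train_score=True,
                           n_jobs=-1)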
Example #3
def perform_dbscan():
    #import data table
    data = ic.separateImport()
    data = procd.fillData(data, fill_method='median')

    # several rows have a cholesterol value of 0 and need to be removed
    empty_indices = []
    for i in range(data.shape[0]):
        if (data['chol'][i] == 0):
            empty_indices.append(i)
    data = data.drop(data.index[empty_indices])

    #partition data into data and prediction
    X_data, Y_data = preprocessing.createFullSet(data)

    #scale the data
    X = (X_data - np.mean(X_data, axis=0)) / np.std(X_data, axis=0)

    # perform DBSCAN with eps = 2.4 and min_samples = 7
    db = DBSCAN(eps=2.4, min_samples=7).fit(X)
    core_samples_mask = np.zeros_like(db.labels_, dtype=bool)
    core_samples_mask[db.core_sample_indices_] = True
    labels = db.labels_

    #get number of clusters
    n_clusters_ = len(set(labels)) - (1 if -1 in labels else 0)

    #print cluster data
    unique, counts = np.unique(labels, return_counts=True)
    print("Number of data points in clusters")
    print(dict(zip(unique, counts)))

    #visualise data
    import visualise
    visualise.parallelVisualise(data, labels,
                                ('black', 'blue', 'red', 'green'), 'dbscan')

    #display histogram of every cluster for every dimension
    zero_indices = np.where(labels == 0)
    one_indices = np.where(labels == 1)
    two_indices = np.where(labels == 2)

    # print histograms
    import matplotlib.pyplot as plt
    for i in range(1, 10, 1):
        curr_data = X_data[:, i]
        plt.figure()
        plt.hist(curr_data[zero_indices], bins='auto',
                 color='blue')  # arguments are passed to np.histogram
        plt.title(list(data)[i] + ' cluster Zero')
        plt.savefig('./figures/' + list(data)[i] + '-cluster Zero')
        plt.figure()
        plt.hist(curr_data[one_indices], bins='auto',
                 color='orange')  # arguments are passed to np.histogram
        plt.title(list(data)[i] + ' cluster One')
        plt.savefig('./figures/' + list(data)[i] + '-cluster One')
        plt.figure()
        plt.hist(curr_data[two_indices], bins='auto',
                 color='green')  # arguments are passed to np.histogram
        plt.title(list(data)[i] + ' cluster Two')
        plt.savefig('./figures/' + list(data)[i] + '-cluster Two')
    plt.figure()
    plt.hist(Y_data[zero_indices], bins='auto',
             color='blue')  # arguments are passed to np.histogram
    plt.title('prediction' + ' cluster Zero')
    plt.savefig('./figures/' + 'prediction cluster Zero')
    plt.figure()
    plt.hist(Y_data[one_indices], bins='auto',
             color='orange')  # arguments are passed to np.histogram
    plt.title('prediction' + ' cluster One')
    plt.savefig('./figures/' + 'prediction cluster One')
    plt.figure()
    plt.hist(Y_data[two_indices], bins='auto', color='green')
    plt.title('prediction' + ' cluster Two')
    plt.savefig('./figures/' + 'prediction cluster Two')
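
The eps and min_samples values above are hard-coded. A common way to pick eps is a k-distance plot: sort every point's distance to its k-th nearest neighbour and look for the elbow of the curve. The sketch below is a generic heuristic using scikit-learn's NearestNeighbors, not part of the original project.

import numpy as np
import matplotlib.pyplot as plt
from sklearn.neighbors import NearestNeighbors

def k_distance_plot(X, k=7):
    # distance from every point to its k-th nearest neighbour, sorted ascending;
    # the elbow of this curve is a common heuristic for DBSCAN's eps
    neigh = NearestNeighbors(n_neighbors=k).fit(X)
    distances, _ = neigh.kneighbors(X)
    plt.figure()
    plt.plot(np.sort(distances[:, -1]))
    plt.xlabel('points sorted by distance')
    plt.ylabel(str(k) + '-NN distance')
    plt.savefig('./figures/k-distance')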
Example #4
                                                    random_state=42,
                                                    stratify=Y_data)
    # train_input = data.values
    #
    # X_data, Y_dataNum = train_input[:,:-1], train_input[:,-1]
    # Y_dataNum = [isPositive(x) for x in Y_dataNum]
    # seed = 123
    # np.random.seed(seed)
    # idx = np.arange(X_data.shape[0])
    # np.random.shuffle(idx)
    # Y_data = np.array(Y_dataNum)
    # X_data = X_data[idx]
    # Y_data = Y_data[idx]
    # m = 3* X_data.shape[0] // 10
    # testX, testY =  X_data[:m], Y_data[:m]
    # trainX, trainY = X_data[m:], Y_data[m:]

    return testX, testY, trainX, trainY


def isPositive(x):
    # collapse the 0-4 disease-severity label into a binary presence flag
    if x > 0:
        return 1
    else:
        return 0


if __name__ == '__main__':
    datadict = ic.separateImport()
    data = fillData(datadict, fill_method='median')
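
The top of this snippet (the call that produces testX/testY/trainX/trainY) is cut off. Below is a plausible reconstruction of such a createTrainingSet helper, based on the commented-out legacy code and the visible stratified train_test_split tail; treat it as a sketch under those assumptions rather than the project's exact implementation.

import numpy as np
from sklearn.model_selection import train_test_split

def createTrainingSet(data):
    # separate features from the prediction column and binarise the label,
    # as in the commented-out code above
    train_input = data.values
    X_data, Y_dataNum = train_input[:, :-1], train_input[:, -1]
    Y_data = np.array([isPositive(x) for x in Y_dataNum])
    # ~30% test / 70% train, stratified on the binary label
    m = 3 * X_data.shape[0] // 10
    trainX, testX, trainY, testY = train_test_split(X_data,
                                                    Y_data,
                                                    test_size=m,
                                                    random_state=42,
                                                    stratify=Y_data)
    return testX, testY, trainX, trainY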
Example #5
def reduceDimenTest(extramodelNames=''):
    FILL_METHODS = ["mean", "median", "mode", "none"]

    # search space, where num_components is the dimensionality of the processed dataset
    NUM_COMPONENTS = [4, 8, 10]
    processedResults = defaultdict(list)

    for n_components in NUM_COMPONENTS:
        for filling in FILL_METHODS:
            print("**********************************now at ", filling,
                  n_components)

            # fill_method accepts 'median', 'mode', or 'mean' to fill missing values with that statistic
            data = ic.separateImport()
            data = procd.fillData(data, fill_method=filling)
            X_data, Y_data = preprocessing.createFullSet(data)
            X_data, pca, ss = preprocessing.performPCA(X_data, n_components)
            m = 3 * X_data.shape[0] // 10
            trainX, testX, trainY, testY = train_test_split(X_data,
                                                            Y_data,
                                                            test_size=m,
                                                            random_state=42,
                                                            stratify=Y_data)

            #print(data)

            print("training set size: ", trainX.shape[0], " test set size: ",
                  testX.shape[0])
            # sandbox your code here before transferring it into your own python file
            predictions = []
            methods = []
            #min_sup = 0  # set it to something appropriate
            #associateRuleMiningPredictions = arm.generate_rules(min_sup)
            #print("Associate Rule Mining Predictions", associateRuleMiningPredictions)

            nnPredictions = nn.neuralNet(testX,
                                         testY,
                                         trainX,
                                         trainY,
                                         useTrainedModel=True,
                                         modelName=filling +
                                         str(n_components) + extramodelNames)
            #print("nnPredictions",type(nnPredictions),nnPredictions)
            predictions.append(nnPredictions)
            methods.append("nnPredictions")

            bayesPredictions = bayesian.naiveBayes(testX, testY, trainX,
                                                   trainY)
            #print("bayesPredictions",type(bayesPredictions),bayesPredictions)
            predictions.append(bayesPredictions)
            methods.append("bayesPredictions")

            # clf is the grid-search model used to find the best parameters for the SVM.
            # It automatically uses k-fold cross-validation to find the best parameters;
            # print(clf.best_params_) shows which params were chosen for this model
            svmPredictions, clf = svm.svmPredict(testX,
                                                 testY,
                                                 trainX,
                                                 trainY,
                                                 filling + str(n_components) +
                                                 extramodelNames,
                                                 gridSearch=False)
            #print("SVMpredictions", type(svmPredictions), svmPredictions)
            predictions.append(svmPredictions)
            methods.append("SVMpredictions")

            # best hyperparameters pre-computed with the grid-search wrapper to save time
            #res, best_params = dt.gridSearchWrapper(testX, testY, trainX, trainY)
            best_params = {
                'n_estimators': 10,
                'max_depth': 6,
                'min_samples_split': 14
            }
            randforestPred = dt.randomForestClassify(testX, testY, trainX,
                                                     trainY, best_params)
            predictions.append(randforestPred)
            methods.append("randforest")
            # methods.append("Random forest")

            #ensemble method using a simple majority vote of all the classifiers.
            ensemblePred = []

            for result in zip(*[item.tolist() for item in predictions]):
                ensemblePred.append(max(set(result), key=result.count))

            predictions.append(ensemblePred)
            methods.append("Ensemble")
            #print("ensemblePred", ensemblePred)

            for prediction, labels in zip(predictions, methods):
                result = processResults(prediction, testY, filling, labels)
                result["n_components"] = n_components
                result["filling"] = filling
                processedResults[labels].append(result)
    generateGraphsSingle(processedResults, FILL_METHODS)
    generateGraphs(processedResults, FILL_METHODS)
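
performPCA is called above but not defined in this snippet; its (X_data, pca, ss) return values suggest a scaler plus a fitted PCA object. Below is a minimal sketch of such a helper, assuming scikit-learn's StandardScaler and PCA; the project's actual implementation may differ.

from sklearn.decomposition import PCA
from sklearn.preprocessing import StandardScaler

def performPCA(X_data, n_components):
    # standardise the features, then project onto the first n_components principal components
    ss = StandardScaler()
    X_scaled = ss.fit_transform(X_data)
    pca = PCA(n_components=n_components)
    X_reduced = pca.fit_transform(X_scaled)
    return X_reduced, pca, ss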