示例#1
0
def svc_drop_miss(file, encoding):
    """Fit a linear-kernel SVC on the complete records and count test misses.

    Rows with missing values are removed via ``clean_data.drop_miss``; the
    remaining rows are split 80/20 into training and testing sets, and an
    SVC trained on the training portion predicts the class label of the
    held-out portion.

    Args:
        file: Forwarded unchanged to ``clean_data.drop_miss``.
        encoding: Forwarded unchanged to ``clean_data.drop_miss``.

    Returns:
        A 3-tuple ``(actual, predicted, count)``: the test-set class labels
        (``class_test.values``), the SVC predictions for the same rows, and
        the number of rows on which the two disagree.
    """
    # Dataframe with every row containing a missing value dropped.
    bcdata = clean_data.drop_miss(file, encoding)

    # Features are every column except the identifier and the target;
    # the target is the 'Class label' column.
    all_data = bcdata.drop(['Sample ID', 'Class label'], axis=1)
    class_data = bcdata['Class label']

    # No random_state is passed, so the split (and therefore the returned
    # count) can differ from call to call.
    all_train, all_test, class_train, class_test = train_test_split(
        all_data, class_data, test_size=0.20)

    # Fit a linear-kernel SVC to the training data and predict the class
    # label of the test rows.
    svclassifier = SVC(kernel='linear')
    svclassifier.fit(all_train, class_train)
    svm_y_pred = svclassifier.predict(all_test)

    # Side-by-side (actual, predicted) pairs. zip() is a lazy iterator on
    # Python 3, so materialize it: otherwise print() would only show
    # "<zip object ...>" rather than the pairs themselves.
    test_v_pred = list(zip(class_test.values, svm_y_pred))
    print(test_v_pred)

    # Number of test rows where the prediction differs from the true label.
    count = sum(1 for actual, predicted in test_v_pred if actual != predicted)

    return class_test.values, svm_y_pred, count
示例#2
0
def color_map(file, encoding):
    """Display the attribute values of the complete records as a color map.

    Loads the data through ``clean_data.drop_miss``, removes the
    'Class label' and 'Sample ID' columns, and renders what is left with
    ``plt.imshow``. Nothing is returned; the figure is shown via
    ``plt.show()``.

    Args:
        file: Forwarded unchanged to ``clean_data.drop_miss``.
        encoding: Forwarded unchanged to ``clean_data.drop_miss``.
    """
    # Keep only the attribute columns: one image column per attribute,
    # one image row per record.
    attributes = clean_data.drop_miss(file, encoding).drop(
        ['Class label', 'Sample ID'], axis=1)

    # aspect='auto' lets the image fill the axes instead of forcing
    # square cells.
    plt.imshow(attributes, aspect='auto')
    plt.colorbar()

    plt.xlabel('BC Data Attributes')
    plt.ylabel('Patient Record')

    plt.show()
示例#3
0
def fill_w_avg(file, encoding):
    """Fill missing values with the truncated mean of 'Bare nuclei'.

    The mean is taken from the frame returned by ``clean_data.drop_miss``
    and truncated to an ``int``. It is then used as the fill value for
    every NaN in the frame returned by ``clean_data.miss_to_nan`` (the
    ``fillna`` call is frame-wide, not restricted to 'Bare nuclei').

    Args:
        file: Forwarded unchanged to the ``clean_data`` helpers.
        encoding: Forwarded unchanged to the ``clean_data`` helpers.

    Returns:
        The dataframe from ``clean_data.miss_to_nan`` with NaNs replaced by
        the truncated mean.
    """
    bcdata_no_miss = clean_data.drop_miss(file, encoding)
    bcdata_w_miss = clean_data.miss_to_nan(file, encoding)

    # Frequency of each 'Bare nuclei' value before filling.
    counts_before = bcdata_w_miss['Bare nuclei'].value_counts()
    print(counts_before)

    # int() truncates the mean toward zero.
    mean = int(bcdata_no_miss['Bare nuclei'].mean())

    bcdata = bcdata_w_miss.fillna(mean)
    counts_after = bcdata['Bare nuclei'].value_counts()

    # zip() is a lazy iterator on Python 3; materialize it so the
    # (before, after) frequency pairs are printed rather than
    # "<zip object ...>".
    # NOTE(review): the two arrays are paired by position in
    # value_counts() order, not by 'Bare nuclei' value - confirm the
    # orderings line up after filling.
    print(list(zip(counts_before.values, counts_after.values)))

    return bcdata
示例#4
0
def pair_plot(file, encoding):
    """Print the attribute columns and show a regression pair plot of them.

    Loads the data through ``clean_data.drop_miss``, removes the
    'Sample ID' and 'Class label' columns, prints the resulting dataframe,
    and draws a seaborn pair plot with ``kind='reg'``. Nothing is
    returned; the figure is shown via ``plt.show()``.

    Args:
        file: Forwarded unchanged to ``clean_data.drop_miss``.
        encoding: Forwarded unchanged to ``clean_data.drop_miss``.
    """
    attributes = clean_data.drop_miss(file, encoding).drop(
        ['Sample ID', 'Class label'], axis=1)
    print(attributes)

    # Red fit lines over faint scatter points.
    line_style = {'color': 'red'}
    scatter_style = {'alpha': 0.1}
    regression_kws = {'line_kws': line_style, 'scatter_kws': scatter_style}

    sns.pairplot(attributes, kind='reg', plot_kws=regression_kws)
    plt.show()
示例#5
0
def statsmod_lin_reg_predict(file, encoding):
    """Plot and print 'Bare nuclei' predictions from the fitted regression.

    Steps, in order:

    1. Scatter-plot the 'Bare nuclei' column of ``clean_data.drop_miss``
       against the model's predictions for ``given_to_impute(file, encoding)``.
    2. Print the model's predictions for
       ``clean_data.subset_miss_data_x(file, encoding)``.
    3. Scatter-plot 'Normal nucleoli' against 'Bare nuclei' for the rows
       whose 'Class label' equals 4.

    Nothing is returned; results are shown via ``plt.show()`` and ``print``.

    Args:
        file: Forwarded unchanged to the helper functions.
        encoding: Forwarded unchanged to the helper functions.
    """
    # Call statsmod_lin_reg once and reuse its result; the original called
    # it twice just to pick out elements 0 and 1 of the same return value.
    # Indexing (rather than tuple-unpacking) is kept so this works whatever
    # the length of the returned sequence is.
    fit_output = statsmod_lin_reg(file, encoding)
    results = fit_output[0]
    miss_to_nan = fit_output[1]

    all_given = given_to_impute(file, encoding)
    known_to_predict = clean_data.subset_miss_data_x(file, encoding)

    # Predict the bare nuclei values (that are already known) using the
    # values across the other attributes.
    # NOTE(review): presumably all_given holds the predictor columns of the
    # complete records - confirm against given_to_impute.
    bn_known_preds = results.predict(all_given)

    # Actual vs. predicted bare nuclei for the complete records.
    plt.scatter(clean_data.drop_miss(file, encoding)['Bare nuclei'],
                bn_known_preds,
                alpha=.1)
    plt.xlabel('Actual Bare nuclei')
    plt.ylabel('Predicted Bare nuclei')
    plt.show()

    # Predict the missing bare nuclei values using the known values of the
    # other attributes.
    bn_unknown_preds = results.predict(known_to_predict)
    print(bn_unknown_preds)

    # Subset of the second element returned by statsmod_lin_reg, restricted
    # to the records with a malignant diagnosis (class label 4).
    mal_cases = miss_to_nan[miss_to_nan['Class label'] == 4]

    # Among the malignant records, plot normal nucleoli against bare nuclei
    # to see whether there appears to be any correlation.
    plt.scatter(mal_cases['Normal nucleoli'],
                mal_cases['Bare nuclei'],
                alpha=.1)
    plt.xlabel('Normal nucleoli')
    plt.ylabel('Bare nuclei')
    plt.show()
示例#6
0
def fill_w_mode(file, encoding):
    """Fill missing values with the mode of 'Bare nuclei'.

    The mode is taken from the frame returned by ``clean_data.drop_miss``.
    When several values tie for the mode, the first one reported by
    ``Series.mode()`` is used. That value then replaces every NaN in the
    frame returned by ``clean_data.miss_to_nan`` (the ``fillna`` call is
    frame-wide, not restricted to 'Bare nuclei').

    Args:
        file: Forwarded unchanged to the ``clean_data`` helpers.
        encoding: Forwarded unchanged to the ``clean_data`` helpers.

    Returns:
        The dataframe from ``clean_data.miss_to_nan`` with NaNs replaced by
        the mode.
    """
    # Dataframe with all rows containing missing values dropped.
    bcdata_no_miss = clean_data.drop_miss(file, encoding)

    # Dataframe with all missing values replaced with NaN.
    bcdata_w_miss = clean_data.miss_to_nan(file, encoding)

    # Print the frequency of each 'Bare nuclei' value before filling. The
    # original printed type(...) of this series, which only shows the class
    # name; printing the series itself matches fill_w_avg.
    counts_before = bcdata_w_miss['Bare nuclei'].value_counts()
    print(counts_before)

    # Array of every 'Bare nuclei' value that ties for the highest frequency.
    mode = bcdata_no_miss['Bare nuclei'].mode().values

    # Fill the missing values with the first mode value.
    bcdata = bcdata_w_miss.fillna(mode[0])
    counts_after = bcdata['Bare nuclei'].value_counts()

    # zip() is a lazy iterator on Python 3; materialize it so the
    # (before, after) frequency pairs are printed rather than
    # "<zip object ...>".
    # NOTE(review): the two arrays are paired by position in
    # value_counts() order, not by 'Bare nuclei' value - confirm the
    # orderings line up after filling.
    print(list(zip(counts_before.values, counts_after.values)))

    return bcdata