def svc_drop_miss(file, encoding):
    """Fit a linear-kernel SVC on the cleaned data and count its test-set misses.

    The dataframe returned by ``clean_data.drop_miss(file, encoding)`` is split
    into features (every column except 'Sample ID' and 'Class label') and the
    'Class label' target, then divided into 80% training / 20% test data.

    Args:
        file: Forwarded unchanged to ``clean_data.drop_miss`` (presumably the
            data file to load -- confirm against ``clean_data``).
        encoding: Forwarded unchanged to ``clean_data.drop_miss``.

    Returns:
        A 3-tuple ``(actual, predicted, count)`` where ``actual`` is
        ``class_test.values``, ``predicted`` is the classifier's output for the
        test features, and ``count`` is the number of test rows whose predicted
        label differs from the actual label.
    """
    # Dataframe with every row containing a missing value dropped.
    bcdata = clean_data.drop_miss(file, encoding)

    # Features are all columns except the identifier and the target.
    all_data = bcdata.drop(['Sample ID', 'Class label'], axis=1)
    class_data = bcdata['Class label']

    # NOTE: no random_state is passed, so the split (and therefore the returned
    # count) is presumably different on every run unless the RNG is seeded
    # elsewhere.
    all_train, all_test, class_train, class_test = train_test_split(
        all_data, class_data, test_size=0.20)

    svclassifier = SVC(kernel='linear')
    svclassifier.fit(all_train, class_train)
    svm_y_pred = svclassifier.predict(all_test)

    # Materialise the (actual, predicted) pairs: on Python 3 a bare zip() is a
    # lazy iterator, so printing it would show only its repr, not the pairs.
    test_v_pred = list(zip(class_test.values, svm_y_pred))
    print(test_v_pred)

    # Number of test records where the prediction disagrees with the label.
    count = sum(1 for actual, predicted in test_v_pred if actual != predicted)

    return class_test.values, svm_y_pred, count
def color_map(file, encoding):
    """Display the attribute columns of the cleaned data as a colour-mapped image.

    Loads the dataframe via ``clean_data.drop_miss(file, encoding)``, removes
    the 'Class label' and 'Sample ID' columns, and renders what is left with
    ``plt.imshow`` so that each position on the x-axis is one attribute and
    each position on the y-axis is one record.

    Args:
        file: Forwarded unchanged to ``clean_data.drop_miss``.
        encoding: Forwarded unchanged to ``clean_data.drop_miss``.

    Returns:
        None. The figure is shown with ``plt.show()``.
    """
    attributes = clean_data.drop_miss(file, encoding).drop(
        ['Class label', 'Sample ID'], axis=1)

    # aspect='auto' lets the image fill the axes rather than using square cells.
    plt.imshow(attributes, aspect='auto')
    plt.colorbar()

    plt.xlabel('BC Data Attributes')
    plt.ylabel('Patient Record')

    plt.show()
def fill_w_avg(file, encoding):
    """Fill missing values with the truncated mean of the known 'Bare nuclei' values.

    The mean is computed on ``clean_data.drop_miss(file, encoding)`` (rows with
    missing values removed) and then used to fill the NaNs of
    ``clean_data.miss_to_nan(file, encoding)``.

    Args:
        file: Forwarded unchanged to the ``clean_data`` helpers.
        encoding: Forwarded unchanged to the ``clean_data`` helpers.

    Returns:
        The dataframe from ``clean_data.miss_to_nan`` with its NaN entries
        replaced by the integer mean.

    Side effects:
        Prints the 'Bare nuclei' value counts before filling, then a list of
        ``(count_before, count_after)`` pairs.
    """
    bcdata_no_miss = clean_data.drop_miss(file, encoding)
    bcdata_w_miss = clean_data.miss_to_nan(file, encoding)

    counts_before = bcdata_w_miss['Bare nuclei'].value_counts()
    print(counts_before)

    # int() truncates the mean toward zero, so the fill value is a whole number.
    mean = int(bcdata_no_miss['Bare nuclei'].mean())

    # NOTE: fillna with a scalar replaces NaN in every column of the frame, not
    # only in 'Bare nuclei'.
    bcdata = bcdata_w_miss.fillna(mean)

    counts_after = bcdata['Bare nuclei'].value_counts()

    # list(...) is required so the pairs are actually printed: on Python 3 a
    # bare zip() prints only its repr. The pairs are matched by position in each
    # value_counts() result, not by attribute value.
    print(list(zip(counts_before.values, counts_after.values)))

    return bcdata
def pair_plot(file, encoding):
    """Print the attribute columns of the cleaned data and show a regression pair plot.

    The dataframe from ``clean_data.drop_miss(file, encoding)`` has its
    'Sample ID' and 'Class label' columns removed; the remainder is printed and
    passed to ``sns.pairplot`` with ``kind='reg'``.

    Args:
        file: Forwarded unchanged to ``clean_data.drop_miss``.
        encoding: Forwarded unchanged to ``clean_data.drop_miss``.

    Returns:
        None. The figure is shown with ``plt.show()``.
    """
    attributes = clean_data.drop_miss(file, encoding).drop(
        ['Sample ID', 'Class label'], axis=1)
    print(attributes)

    # Red regression lines over faint scatter points.
    regression_style = {
        'line_kws': {'color': 'red'},
        'scatter_kws': {'alpha': 0.1},
    }
    sns.pairplot(attributes, kind='reg', plot_kws=regression_style)

    plt.show()
def statsmod_lin_reg_predict(file, encoding):
    """Plot and print 'Bare nuclei' predictions from the fitted regression.

    Steps:
      1. Obtain the fitted results object and a dataframe from
         ``statsmod_lin_reg(file, encoding)`` (elements 0 and 1 of its return
         value).
      2. Predict 'Bare nuclei' for the rows returned by ``given_to_impute`` and
         scatter-plot those predictions against the 'Bare nuclei' column of
         ``clean_data.drop_miss(file, encoding)``.
      3. Predict 'Bare nuclei' for the rows returned by
         ``clean_data.subset_miss_data_x`` and print the predictions.
      4. Scatter-plot 'Normal nucleoli' against 'Bare nuclei' for the rows of
         the dataframe from step 1 whose 'Class label' equals 4.

    Args:
        file: Forwarded unchanged to the helper functions.
        encoding: Forwarded unchanged to the helper functions.

    Returns:
        None. Two figures are shown and one set of predictions is printed.
    """
    # Call the regression helper once and reuse both pieces of its result; the
    # previous code invoked it twice, repeating the whole fit just to index a
    # different element.
    regression_output = statsmod_lin_reg(file, encoding)
    results = regression_output[0]
    # Presumably the dataframe with missing values converted to NaN, judging by
    # the name -- confirm against statsmod_lin_reg.
    miss_to_nan = regression_output[1]

    all_given = given_to_impute(file, encoding)
    known_to_predict = clean_data.subset_miss_data_x(file, encoding)

    # Predict the bare nuclei values that are already known, using the values
    # of the other attributes.
    bn_known_preds = results.predict(all_given)

    # Actual vs. predicted bare nuclei for the known records.
    plt.scatter(clean_data.drop_miss(file, encoding)['Bare nuclei'],
                bn_known_preds, alpha=.1)
    plt.xlabel('Actual Bare nuclei')
    plt.ylabel('Predicted Bare nuclei')
    plt.show()

    # Predict the missing bare nuclei values from the known values of the
    # other attributes.
    bn_unknown_preds = results.predict(known_to_predict)
    print(bn_unknown_preds)

    # Keep only the records whose class label is 4 (the malignant diagnosis,
    # per the original author's comment).
    mal_cases = miss_to_nan[miss_to_nan['Class label'] == 4]

    # Look for a visible relationship between normal nucleoli and bare nuclei
    # among those records.
    plt.scatter(mal_cases['Normal nucleoli'], mal_cases['Bare nuclei'],
                alpha=.1)
    plt.xlabel('Normal nucleoli')
    plt.ylabel('Bare nuclei')
    plt.show()
def fill_w_mode(file, encoding):
    """Fill missing values with the mode of the known 'Bare nuclei' values.

    The mode is computed on ``clean_data.drop_miss(file, encoding)`` (rows with
    missing values removed) and then used to fill the NaNs of
    ``clean_data.miss_to_nan(file, encoding)``.

    Args:
        file: Forwarded unchanged to the ``clean_data`` helpers.
        encoding: Forwarded unchanged to the ``clean_data`` helpers.

    Returns:
        The dataframe from ``clean_data.miss_to_nan`` with its NaN entries
        replaced by the first mode value.

    Side effects:
        Prints the 'Bare nuclei' value counts before filling, then a list of
        ``(count_before, count_after)`` pairs.
    """
    # Dataframe with all rows containing missing values dropped.
    bcdata_no_miss = clean_data.drop_miss(file, encoding)
    # Dataframe with all missing values replaced by NaN.
    bcdata_w_miss = clean_data.miss_to_nan(file, encoding)

    # Print the frequency of each bare nuclei value. The previous code printed
    # type(...) of this series, which shows only the class name; fill_w_avg
    # prints the series itself.
    counts_before = bcdata_w_miss['Bare nuclei'].value_counts()
    print(counts_before)

    # mode() can return several values when there is a tie; the first is used.
    mode = bcdata_no_miss['Bare nuclei'].mode().values

    # NOTE: fillna with a scalar replaces NaN in every column of the frame, not
    # only in 'Bare nuclei'.
    bcdata = bcdata_w_miss.fillna(mode[0])

    counts_after = bcdata['Bare nuclei'].value_counts()

    # list(...) is required so the pairs are actually printed: on Python 3 a
    # bare zip() prints only its repr. The pairs are matched by position in each
    # value_counts() result, not by attribute value.
    print(list(zip(counts_before.values, counts_after.values)))

    return bcdata