def main():
    '''
    Prints the 5 fold n_validator result of nn.NNclassifier for the cancer
    data and for the synthetic data.

    Each line of 'cancer_data.txt' is split on whitespace. Column 0 is
    discarded, column 1 is treated as the class label and moved to the last
    column, and the remaining columns are converted to float16.
    '''
    # read in cancer data from text file; blank lines are skipped because
    # they would otherwise produce a ragged (non-rectangular) table
    with open('cancer_data.txt') as file:
        rows = [fields for fields in (line.split() for line in file) if fields]

    # modifies cancer data to put it into proper format
    raw_table = np.array(rows)
    class_vector = raw_table[:, 1].reshape(-1, 1)
    # drop columns 0 and 1 BEFORE the float conversion: only the remaining
    # columns have to be numeric, so a textual class label or an oversized
    # value in column 0 can no longer break or pollute the cast
    features = np.delete(raw_table, (0, 1), 1).astype('float16')
    # NOTE(review): class_vector keeps the string dtype it was read with, so
    # the dtype of the stacked array is decided by NumPy promotion -- confirm
    # that nn.n_validator accepts it
    cancer_data = np.hstack((features, class_vector))

    # prints 5 fold n_validator with cancer data and synthetic data
    print(nn.n_validator(cancer_data, 5, nn.NNclassifier))
    print(nn.n_validator(nn.synthetic_data(), 5, nn.NNclassifier))
def run_trials_k(data, p, clasifier, k, metric, trials):
    '''
    Calls nn.n_validator(data, p, clasifier, k, metric) repeatedly, once per
    trial, and returns the mean of the values it produced for that k.

    The division by `trials` is unguarded, so trials == 0 raises
    ZeroDivisionError.
    '''
    performances = []
    # Collect one performance value per trial
    while len(performances) < trials:
        performances.append(nn.n_validator(data, p, clasifier, k, metric))
    # Mean over all trials
    return sum(performances) / trials
def best_k(data_set, trials, dist_type):
    '''
    Determines the best k value for labeling test data given a specific
    distance type.

    Odd k values are tried in increasing order starting at 1. Each k is
    scored by averaging nn.n_validator(data_set, 15, nn.KNNclassifier, k,
    dist_type) over `trials` runs (the 15 is presumably the number of
    partitions -- confirm against nn.n_validator). The search stops at the
    first k whose average does not beat the best average seen so far.

    Returns a tuple (k, average) for the best-scoring k that was tested,
    i.e. the last k that improved the average, not the k that ended the
    search.

    Raises ZeroDivisionError if trials is 0.
    '''
    def average_score(k):
        # run nn_validator for a number of trials for this k
        total = 0
        for _ in range(trials):
            total += nn.n_validator(data_set, 15, nn.KNNclassifier, k,
                                    dist_type)
        return total / trials

    k = 1
    top_k, top_val = k, average_score(k)
    # test larger k values until the success rate stops improving
    while True:
        # only test odd values of k
        k += 2
        val = average_score(k)
        # "not >" (rather than "<=") also ends the search on a NaN average
        if not val > top_val:
            break
        top_k, top_val = k, val
    return top_k, top_val
def main():
    '''Main function that calculates the accuracy of the classifier using the
    breast cancer data set and the synthetic data set.

    For each distance metric (euclidean, cityblock, cosine) and each odd k
    from 1 through 15, nn.n_validator is run 100 times per data set with
    nn.KNNclassifier and 5 as its second argument, and the results are
    averaged. The best k per metric, and the best metric overall, are printed
    for both data sets. Ties keep the smaller k / the earlier metric.
    '''
    metrics = ('euclidean', 'cityblock', 'cosine')

    # read in the file, splitting fields on single spaces
    inFile = np.genfromtxt('wdbc.data.txt', delimiter=" ")
    # delete patient IDs (column 0)
    inFile = np.delete(inFile, 0, 1)
    syntheticData = nn.createMultivariate(300)

    BCDict = {}
    synDict = {}
    for metric in metrics:
        # start below any real average so the first k is always recorded,
        # even when every run scores 0
        kBestBC, bestBCAvg = None, float('-inf')
        kBestSyn, bestSynAvg = None, float('-inf')
        # only odd values of k, 1 through 15
        for k in range(1, 16, 2):
            bcDataOutput = []
            syntheticOutput = []
            # the two data sets are validated alternately on every trial
            for _ in range(100):
                bcDataOutput.append(
                    nn.n_validator(inFile, 5, nn.KNNclassifier, k, metric))
                syntheticOutput.append(
                    nn.n_validator(syntheticData, 5, nn.KNNclassifier, k,
                                   metric))
            curBCAvg = sum(bcDataOutput) / len(bcDataOutput)
            curSynAvg = sum(syntheticOutput) / len(syntheticOutput)
            # replace the value for the best average and best k value
            if curBCAvg > bestBCAvg:
                kBestBC, bestBCAvg = k, curBCAvg
            if curSynAvg > bestSynAvg:
                kBestSyn, bestSynAvg = k, curSynAvg
        # store the best k and its accuracy for each metric
        BCDict[metric] = (kBestBC, bestBCAvg)
        synDict[metric] = (kBestSyn, bestSynAvg)

    # get the best k and distance metric combo
    maxBCDist = max(BCDict, key=lambda name: BCDict[name][1])
    maxSynDist = max(synDict, key=lambda name: synDict[name][1])

    for metric in metrics:
        print("Best k values using {} distance metric \n"
              " Breast Cancer Data Set: {} with an accuracy of {} \n"
              " Synthetic Data Set: {} with an accuracy of {}".format(
                  metric,
                  BCDict[metric][0], BCDict[metric][1],
                  synDict[metric][0], synDict[metric][1]))

    for label, bestMetric, results in (
            ('Breast Cancer', maxBCDist, BCDict),
            ('Synthetic Data', maxSynDist, synDict)):
        print("Best metric for {} is {} with a k value of {} \n"
              "and an accuracy of {}".format(
                  label, bestMetric,
                  results[bestMetric][0], results[bestMetric][1]))