Exemplo n.º 1
0
def main():
    '''Print nn.n_validator results for the cancer data and synthetic data.

    Reads ``cancer_data.txt`` as whitespace-separated fields, drops the
    first column, moves the second column (the class column, going by the
    original variable name) to the end of every row, and prints the result
    of ``nn.n_validator(..., 5, nn.NNclassifier)`` for that table and for
    ``nn.synthetic_data()``.
    '''

    # read in cancer data from text file; blank lines are skipped so that
    # every row has the same number of fields
    with open('cancer_data.txt') as file:
        rows = [line.split() for line in file if line.strip()]

    # modifies cancer data to put it into proper format
    cancer_data = np.array(rows)

    # column 1 as an (n, 1) column; reshape is used instead of assigning to
    # .shape, which NumPy discourages
    class_vector = cancer_data[:, 1].reshape(-1, 1)

    # drop columns 0 and 1 *before* the float16 cast, so values in those two
    # columns never have to be parseable as (or fit into) a float16
    features = np.delete(cancer_data, (0, 1), 1).astype('float16')

    # NOTE(review): class_vector still holds strings, so hstack promotes the
    # whole table to a string dtype -- confirm nn.n_validator expects that.
    cancer_data = np.hstack((features, class_vector))

    # prints 5 fold n_validator with cancer data and synthetic data
    print(nn.n_validator(cancer_data, 5, nn.NNclassifier))
    print(nn.n_validator(nn.synthetic_data(), 5, nn.NNclassifier))
Exemplo n.º 2
0
def run_trials_k(data, p, clasifier, k, metric, trials):
    '''Average the result of nn.n_validator over repeated trials for one k.

    Calls ``nn.n_validator(data, p, clasifier, k, metric)`` once per trial
    and returns the sum of those results divided by ``trials``.
    '''
    scores = []
    # Collect one validator result per trial
    while len(scores) < trials:
        scores.append(nn.n_validator(data, p, clasifier, k, metric))
    # Mean over all trials
    return sum(scores) / trials
Exemplo n.º 3
0
def best_k(data_set, trials, dist_type):
    '''
    Determines the best k value for labeling test data given a
    specific distance type

    Odd values of k are tried in increasing order (1, 3, 5, ...). For each
    k, ``nn.n_validator(data_set, 15, nn.KNNclassifier, k, dist_type)`` is
    run ``trials`` times and the results are averaged. The search stops at
    the first k whose average does not beat the best average so far, or
    right after k = 1 if its average does not exceed 0.1.

    Returns a tuple ``(k, average)`` for the best-scoring k that was tested.
    Raises ZeroDivisionError if ``trials`` is 0.
    '''

    def average_score(k):
        # run nn_validator for a number of trials for this k
        total = 0
        for _ in range(trials):
            total += nn.n_validator(data_set, 15, nn.KNNclassifier, k,
                                    dist_type)
        return total/trials

    k = 1
    best_k_value = k
    best_val = average_score(k)

    # Same floor as the initial comparison value in the original loop: do
    # not keep searching unless k = 1 already scores above 0.1.
    if best_val > .1:
        while True:
            # only test odd values of k
            k += 2
            val = average_score(k)
            # Stop at the first k that fails to improve. The k to report is
            # the previous (best) one, not this one.
            if not val > best_val:
                break
            best_k_value, best_val = k, val

    return best_k_value, best_val
def main():
    '''Main function that calculates the accuracy of the
    classifier using the breast cancer data set and the
    synthetic data set.

    For each distance metric (euclidean, cityblock, cosine) and each odd k
    from 1 to 15, nn.n_validator is run 100 times on both data sets with
    nn.KNNclassifier and the results are averaged. The best k per metric
    and the best metric overall are printed for each data set.
    '''

    # read in the file; fields are separated by single spaces
    inFile = np.genfromtxt('wdbc.data.txt', delimiter = " ")
    #delete patient IDs
    inFile = np.delete(inFile, 0 , 1)
    syntheticData = nn.createMultivariate(300)

    metrics = ('euclidean', 'cityblock', 'cosine')
    trials = 100
    BCDict = {}
    synDict = {}
    for metric in metrics:
        # Reset the winners for every metric so a value from a previous
        # metric can never leak into this one. Starting the best average at
        # -inf means the first k tested always becomes the initial winner,
        # even when its accuracy is 0.
        kBestBC = kBestSyn = None
        bestBCAvg = bestSynAvg = float('-inf')
        for k in range(1, 16, 2):
            bcDataOutput = []
            syntheticOutput = []
            # The two validator calls stay interleaved, in the same order as
            # before, in case nn.n_validator draws from shared random state.
            for _ in range(trials):
                #append accuracy values for the given k value
                bcDataOutput.append(
                    nn.n_validator(inFile, 5, nn.KNNclassifier, k, metric))
                syntheticOutput.append(
                    nn.n_validator(syntheticData, 5, nn.KNNclassifier, k,
                                   metric))
            curBCAvg = sum(bcDataOutput) / len(bcDataOutput)
            curSynAvg = sum(syntheticOutput) / len(syntheticOutput)
            #replace the value for the best average and best k value
            if curBCAvg > bestBCAvg:
                bestBCAvg = curBCAvg
                kBestBC = k
            if curSynAvg > bestSynAvg:
                bestSynAvg = curSynAvg
                kBestSyn = k
        #store the (best k, accuracy) pair for each metric
        BCDict[metric] = (kBestBC, bestBCAvg)
        synDict[metric] = (kBestSyn, bestSynAvg)

    #get the best k and distance metric combo (first metric wins a tie)
    maxBCDist = max(BCDict, key = lambda name: BCDict[name][1])
    bestKValBC = BCDict[maxBCDist][0]
    maxSynDist = max(synDict, key = lambda name: synDict[name][1])
    bestKValSyn = synDict[maxSynDist][0]

    for metric in metrics:
        print("Best k values using {} distance metric \n"
        "     Breast Cancer Data Set: {} with an accuracy of {} \n"
        "     Synthetic Data Set: {} with an accuracy of {}".format(
        metric, BCDict[metric][0], BCDict[metric][1],
        synDict[metric][0], synDict[metric][1]))

    print("Best metric for {} is {} with a k value of {} \n"
    "and an accuracy of {}".format(
    'Breast Cancer', maxBCDist, bestKValBC, BCDict[maxBCDist][1]))
    print("Best metric for {} is {} with a k value of {} \n"
    "and an accuracy of {}".format(
    'Synthetic Data', maxSynDist, bestKValSyn, synDict[maxSynDist][1]))