def run():
    # To generate the processed input files, run the data-processing script first.
    df_list = [
        "abalone", "car", "segmentation", "machine", "forestfires", "wine"
    ]
    # df_list = ["machine", "forestfires", "wine"]
    df_class_num = [3, 4, 7, 1, 1, 1]  # classes per data set; 1 marks a regression target
    # df_class_num = [1, 1, 1]

    results_file = open("./results/results_rbn.txt", "a+")
    for i in range(len(df_list)):

        data_array = Data_Processing_Lists("./processed",
                                           df_list[i] + "_processed")
        data_array.file_array = data_array.file_array[:]
        # Build the list of class labels 0..(num_classes - 1) for this data set.
        class_list = []
        for j in range(df_class_num[i]):
            class_list.append(j)
        #data.append(data_array)
        #classes.append(class_list)

        # Split into 5 sections and hold out the first as the test fold.
        data_array.slicer(5)
        test_data = data_array.file_array.pop(0)
        data_array.join_array()
        training_data = data_array.file_array

        # On a copy, reserve 1/4 of the data as initial medoids for k-medoids.
        toy = copy.deepcopy(data_array)
        toy.slicer(4)
        medoids = toy.file_array.pop(0)
        toy.join_array()
        training_data_toy = toy.file_array

        # Candidate RBN center sets produced by three reduction/clustering methods.
        knn = [
            edited_nn(13, training_data)[:1000],
            k_means(100, training_data),
            k_medoids(medoids, training_data_toy)
        ]
        # knn = [k_means(int(len(training_data)/4), training_data), k_medoids(medoids, training_data_toy)]
        algo_name = ['edited knn', 'kmeans', "kmedoids"]
        # algo_name = ['kmeans', "kmedoids"]
        algo_idx = 0
        for centers in knn:
            for k in range(1):

                # def __init__(self, data, output, gaussian_function_type, centers):
                print("class list", class_list)
                print("size rbf\n", len(centers))
                print(df_list[i])

                rbn = RBN(training_data, class_list, 1, centers)
                rbn.train()

                guesses = rbn.classify(test_data)
                losses = Loss_Functions(guesses)

                if len(class_list) == 1:  #regression data sets
                    mse = losses.mse()
                    print_str = "MSE for " + str(df_list[i]) + " fold: " + str(
                        k) + " \n" + algo_name[algo_idx]
                    print(print_str)
                    print(mse)

                    results_file.write("\n" + print_str)
                    results_file.write("\nMSE: " + str(mse) + "\n")

                else:  #classification data sets
                    losses.confusion_matrix_generator()
                    f_score = losses.f_score()
                    print_str = "Fscore for " + str(
                        df_list[i]) + " fold: " + str(
                            k) + "\n" + algo_name[algo_idx]
                    print(print_str)
                    print(f_score)

                    results_file.write("\n" + print_str)
                    results_file.write("\nF-score: " + str(f_score) + "\n")
            algo_idx += 1
    results_file.close()
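
# The RBN, Loss_Functions, and clustering helpers used above are imported from
# elsewhere in this project (the imports are not shown in this snippet). As a
# hedged illustration only: the Gaussian basis function an RBN built with a
# gaussian_function_type typically evaluates at each center looks like the
# sketch below; sigma is an assumed spread parameter, not taken from the code.
import math

def gaussian_rbf(x, center, sigma=1.0):
    # squared Euclidean distance between the input vector and the center
    dist_sq = sum((xi - ci) ** 2 for xi, ci in zip(x, center))
    return math.exp(-dist_sq / (2 * sigma ** 2))


# Assumed entry point (not part of the original snippet).
if __name__ == "__main__":
    run()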
Example #2

dataset_10 = [
    [1, 1],
    [2, 1],
    [3, 1],
    [1, 2],
    [4, 2],
    [5, 2],
    [3, 3],
    [2, 4],
    [3, 4],
    [5, 4]
]
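
# The k_means function itself is truncated out of this snippet. Below is a
# minimal, assumed reimplementation matching the call further down
# (k_means(data, number_of_clusters, max_iterations)) and its use in the
# plotting code: it returns the clusters as lists of points plus a label per
# point. It is a sketch, not the original author's implementation.
import random

def k_means(dataset, k, max_iter):
    centers = random.sample(dataset, k)
    labels = [0] * len(dataset)
    clusters = [[] for _ in range(k)]
    for _ in range(max_iter):
        clusters = [[] for _ in range(k)]
        # assign every point to its nearest center
        for idx, point in enumerate(dataset):
            distances = [sum((p - c) ** 2 for p, c in zip(point, center))
                         for center in centers]
            labels[idx] = distances.index(min(distances))
            clusters[labels[idx]].append(point)
        # recompute each center as the mean of its cluster
        new_centers = [
            [sum(dim) / len(cluster) for dim in zip(*cluster)] if cluster else centers[j]
            for j, cluster in enumerate(clusters)
        ]
        if new_centers == centers:
            break
        centers = new_centers
    return clusters, labels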

# print(dataset_10)

# Cluster the 10 points into 3 clusters, running at most 20 iterations.
C, labels = k_means(dataset_10, 3, 20)
print(C)
print(labels)

# Plotting code
############################################################
colValue = ['r', 'y', 'g', 'b', 'c', 'k', 'm']
for i in range(len(C)):
    coo_X = []  # list of x coordinates for cluster i
    coo_Y = []  # list of y coordinates for cluster i
    for j in range(len(C[i])):
        coo_X.append(C[i][j][0])
        coo_Y.append(C[i][j][1])
    plt.scatter(coo_X, coo_Y, marker='o',
                color=colValue[i % len(colValue)], label=i)
############################################################
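
# Assumed completion (not in the original snippet): render the legend and show
# the figure, assuming matplotlib.pyplot was imported as plt in the truncated header.
plt.legend()
plt.show()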
def cross_validation(folds, k, dataframes, algorithm_name, evaluation_metric):
    # dataframes = [db_name, [section1, ..., sectionN]]
    guessed_classes = []  # accumulates (actual, guess) pairs across all folds

    if algorithm_name == 'k-nn':
        print("New Data Set")
        for i in range(folds):
            test_data = dataframes[1].pop(i)
            training_data = concat_df(dataframes[1])

            guessed_classes += k_nearest_neighbor(k, training_data, test_data)
            dataframes[1].append(test_data)

    if algorithm_name == 'k-nn-regression':
        print("New Data Set")
        for i in range(folds):
            test_data = dataframes[1].pop(i)
            training_data = concat_df(dataframes[1])

            guessed_classes += k_nearest_neighbor_regression(k, training_data, test_data)
            dataframes[1].append(test_data)

    if algorithm_name == 'edited':
        print("New Data Set")
        for i in range(folds):
            test_data = dataframes[1].pop(i)
            training_data = concat_df(dataframes[1])
            training_data = edited_k_nearest(k, training_data)
            guessed_classes += k_nearest_neighbor(k, training_data, test_data)
            dataframes[1].append(test_data)

    if algorithm_name == 'condensed':
        print("New Data Set")
        for i in range(folds):
            test_data = dataframes[1].pop(i)
            training_data = concat_df(dataframes[1])
            training_data = condensed_k_nearest(k, training_data)
            guessed_classes += k_nearest_neighbor(k, training_data, test_data)
            dataframes[1].append(test_data)

    if algorithm_name == 'k-means':
        print("New Data Set")
        for i in range(folds):
            test_data = dataframes[1].pop(i)
            training_data = concat_df(dataframes[1])

            #training_data = slicer(4, training_data) # 1/4 of data for this algorithm
            #training_data = shuffle_pd_df(training_data)
            k_means_k = int(len(training_data) / 4)
            training_data = k_means(k_means_k, training_data)
            guessed_classes += k_nearest_neighbor_regression(k, training_data, test_data)

            dataframes[1].append(test_data)

    if algorithm_name == 'edited-k-means':
        print("New Data Set")
        for i in range(folds):
            test_data = dataframes[1].pop(i)
            training_data = concat_df(dataframes[1])
            training_data = edited_k_nearest(k, training_data)
            guessed_classes += k_nearest_neighbor(k, training_data, test_data)
            dataframes[1].append(test_data)

    if algorithm_name == 'k-medoids':
        print("New Data Set")
        for i in range(folds):
            #pop off the data for testing
            test_data = dataframes[1].pop(i)

            #concatenate all the training data
            training_data = concat_df(dataframes[1])

            # training_data = shuffle_pd_df(training_data)
            training_data = slicer(4, training_data)  #split into quarters for this algorithm

            #set medoids to 1/4 of the data
            medoids = training_data.pop(0)

            #set training data to the leftover 3/4 of the data
            training_data = concat_df(training_data)

            #run PAM-NN to generate the medoid set
            returned_medoids = k_medoids(medoids, training_data)

            #run k-NN regression with the medoids
            guessed_classes += k_nearest_neighbor_regression(k, returned_medoids, test_data)

            dataframes[1].append(test_data)

    if algorithm_name == 'edited-k-medoids':
        print("New Data Set")
        for i in range(folds):
            #pop off the data for testing
            test_data = dataframes[1].pop(i)

            #concatenate all the training data
            training_data = concat_df(dataframes[1])

            #generate the medoid data set by running edited-kNN
            medoids = edited_k_nearest(k, training_data)

            #remove the medoid data points from the training data
            for index, row in medoids.iterrows():
                training_data = training_data.drop(index)
            print("size of training data", len(training_data))
            print("size of medoids data", len(medoids))

            #run PAM-NN to generate medoids, using the edited-kNN set as initial guesses
            returned_medoids = k_medoids(medoids, training_data)

            #classify the test data
            guessed_classes += k_nearest_neighbor(k, returned_medoids, test_data)

            #return the test fold to the pool for the next iteration
            dataframes[1].append(test_data)

    #-----------------
    #evaluation metrics for the algorithm's guessed_classes 
    #-----------------
    
    if evaluation_metric == 'fscore':  #only for classification
        confusion = {}  #per-class confusion matrix

        #for each class, initialize its confusion matrix entry with zeros
        unique_classes = concat_df(dataframes[1])['0'].unique().tolist()
        for class_name in unique_classes:
            #confusion = {class_name: {'TP': 0, 'FP': 0, 'TN': 0, 'FN': 0}, ...}
            confusion.update({class_name: {'TP': 0, 'FP': 0, 'TN': 0, 'FN': 0}})

        #for each class
        for class_name in unique_classes:
            #for each guessed data point; result[0] is the actual class, result[1] is our guess
            for result in guessed_classes:
                if class_name == result[1] and class_name == result[0]:  #correctly guessed as this class
                    value = 'TP'
                elif class_name == result[1] and class_name != result[0]:  #guessed as this class but was not
                    value = 'FP'
                elif class_name != result[1] and class_name == result[0]:  #guessed as another class but was this one
                    value = 'FN'
                else:  #correctly guessed as not this class
                    value = 'TN'
                confusion[class_name][value] += 1  #increment this class's TP/FP/TN/FN count
        
        #calculate class-independent accuracy
        correct = 0
        total = 0
        for result in guessed_classes:
            if result[0] == result[1]:
                correct += 1
            total += 1
        accuracy = correct / total

        
        num_of_classes = len(confusion)
        average_cm = {'TP': 0, 'FP': 0, 'TN': 0, 'FN': 0}  #average confusion matrix over every class
        print(confusion)

        count = 0
        precision = 0
        recall = 0
        f1 = 0
        for class1, matrix in confusion.items():
            TP = matrix['TP']
            TN = matrix['TN']
            FP = matrix['FP']
            FN = matrix['FN']
            #accumulate the per-class counts so the averaged confusion matrix is actually populated
            average_cm['TP'] += TP / num_of_classes
            average_cm['TN'] += TN / num_of_classes
            average_cm['FP'] += FP / num_of_classes
            average_cm['FN'] += FN / num_of_classes
            if (TP + FP) != 0:
                precision += TP / (TP + FP)
                ptemp = TP / (TP + FP)
            else:
                ptemp = 0
            if (TP + FN) != 0:
                recall += TP / (TP + FN)
                rtemp = TP / (TP + FN)
            else:
                rtemp = 0
            if (ptemp + rtemp) != 0:
                f1 += 2 * ptemp * rtemp / (ptemp + rtemp)
            count += 1
        #macro-average precision, recall, and F1 over the classes
        precision = precision / count
        recall = recall / count
        f1 = f1 / count

        #f1 = 2*precision*recall/(precision+recall)

        metrics = {'F1': f1, 'Precision': precision, 'Recall': recall, 'Accuracy': accuracy}
        return average_cm, metrics

    if evaluation_metric == 'regression':
        #For data sets: machine, forestfires, wine
        print("regression")
        sum_of_error = 0.0
        for result in guessed_classes:
            print(result)
            sum_of_error += (result[0] - result[1]) ** 2

        mean_square_error = sum_of_error / len(guessed_classes)

        return mean_square_error
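

# A self-contained, hedged sketch of the macro-averaged F1 computed in the
# 'fscore' branch above, assuming guessed_classes is a list of
# (actual, predicted) pairs; useful for checking that logic in isolation.
def macro_f1(guessed_classes):
    classes = {actual for actual, _ in guessed_classes}
    f1_total = 0.0
    for c in classes:
        tp = sum(1 for actual, guess in guessed_classes if guess == c and actual == c)
        fp = sum(1 for actual, guess in guessed_classes if guess == c and actual != c)
        fn = sum(1 for actual, guess in guessed_classes if guess != c and actual == c)
        precision = tp / (tp + fp) if (tp + fp) else 0.0
        recall = tp / (tp + fn) if (tp + fn) else 0.0
        f1_total += 2 * precision * recall / (precision + recall) if (precision + recall) else 0.0
    return f1_total / len(classes)

# e.g. macro_f1([('a', 'a'), ('a', 'b'), ('b', 'b')]) returns roughly 0.667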
Example #4
    print("Number of Centroids : ", int(num_centroids))

    # get input data
    data = np.genfromtxt(input_path,
                         delimiter='\t',
                         skip_header=1,
                         dtype=float)

    # shuffle data
    X = []
    np.random.shuffle(data)
    for row in data:
        X.append(row[1:])

    # Run k_means and return the final set of centroids and the cluster assignments
    centroids, cluster_assignment = k_means(np.array(X), int(num_centroids))

    # Initialize one empty list per centroid; these group the rows for the SSE calculation
    cluster_assignment_array = []
    sse_cluster_assignment_array = []
    for i in range(int(num_centroids)):
        cluster_assignment_array.append([])
        sse_cluster_assignment_array.append([])

    # Get data ready for SSE
    for clusterID, row in zip(cluster_assignment, data):
        cluster_assignment_array[clusterID].append(row[0])
        sse_cluster_assignment_array[clusterID].append(row[1:])

    # Calculate the Sum of Squared Error
    sse = calc_sse(centroids, sse_cluster_assignment_array)
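
# calc_sse is defined elsewhere in the original example. Below is a minimal,
# assumed sketch of the usual sum-of-squared-error it presumably computes
# (squared distance of every point to its cluster's centroid, summed over all
# clusters), using numpy as np as in the code above.
def calc_sse(centroids, clusters):
    total = 0.0
    for centroid, points in zip(centroids, clusters):
        for point in points:
            diff = np.asarray(point, dtype=float) - np.asarray(centroid, dtype=float)
            total += float(np.dot(diff, diff))
    return total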