예제 #1
0
def test_knn(k, train_data, train_labels, test_data):
    """ test_knn function

    Builds a KNN classifier from the given training set, classifies every
    sample of the testing data and writes the predictions to the CSV file
    'predictions_digit_recognizer.csv' in the current working directory.
    The file starts with the header row ImageId,Label; ImageId is the
    1-based position of the sample in test_data.

    Args
    ----
    k : integer
        number of neighbors to use for KNN
    train_data : np.array
        training dataset
    train_labels : np.array
        training dataset labels
    test_data : np.array
        testing dataset

    Returns
    -------
    None
    """
    print("Final k:" + str(k))
    knn = KNN(k, train_data, train_labels)

    # print to CSV; newline='' lets the csv module control the line endings
    with open('predictions_digit_recognizer.csv', 'w', newline='') as csvfile:
        writer = csv.writer(csvfile)
        writer.writerow(['ImageId', 'Label'])
        for image_id, sample in enumerate(test_data, start=1):
            guess = knn.classify(sample)
            writer.writerow([str(image_id), str(int(guess))])
예제 #2
0
from sklearn import datasets
from knn import KNN

# Load the iris dataset and pull out the entries stored under the
# 'data' and 'target' keys.
data = datasets.load_iris()
trainingData, trainingLabels = data['data'], data['target']

# KNN classifier configured with k=5.
classifier = KNN(k=5)

# NOTE(review): classify() receives the whole dataset together with its
# labels here -- confirm against knn.KNN that this is the intended call.
classifier.classify(trainingData, trainingLabels)
예제 #3
0
 
   # confusion_matrix[A][B] = how many times a document of class A was assigned to class B
   topics = ['baseball', 'christian', 'guns']
   confusion_matrix = {topic:{t:0 for t in topics} for topic in topics}
   
   print_log = False
   # i counts the test documents processed so far (printed by the debug log below)
   i = 0
   # true and predicted topic of every test document, in processing order
   ytrue = []
   ypred = []
   for topic in topics:
     for doc in reader.test[topic]:
       ytrue.append(topic)
       # classify the test documents
       words = parser.process_sent(doc)
       query = tf_idf_calculator.generate_tf_vector(words)
       result = knn.classify(query)
       confusion_matrix[topic][result] += 1
       ypred.append(result)
       i += 1
       if print_log:
         print('')
         print(i)
         print(doc)
         print(words)
         print(query)
         print(result)
   
   # and print the results
   print('#'*40)
   s = '#'*10 + (' K=%d || dist=%s ' % (k, metric)) + '#'*10
   print(s)
예제 #4
0
class ModelEngineering:
    """
    Wraps a TensorFlow embedding model together with a KNN classifier.
    The model is loaded lazily on the first call to encode(); the KNN classifier is
    fitted, queried and evaluated on the embeddings of the faces held by a warehouse.
    """

    def __init__(self, pkg_dir):
        """
        Create the TensorFlow graph/session and the KNN classifier. The model itself is not loaded here.
        :param pkg_dir: The directory that contains the 'InceptionResNetV1-VGGFace2' model folder
        """
        self.pkg_dir = pkg_dir
        self.frozen_graph_path = os.path.join(pkg_dir,
                                              'InceptionResNetV1-VGGFace2',
                                              '20180402-114759.pb')
        self.graph = tf.Graph()
        self.session = tf.Session(graph=self.graph)
        # Tensors resolved by initialize(); they stay None until the model has been loaded
        self.imgs_ph = None
        self.phase_train_ph = None
        self.embs_ph = None
        self.emb_size_ph = None
        self.initialized = False
        # we create an instance of Neighbours Classifier and fit the data.
        self.n_neighbors = 2
        # weight function used in prediction. Possible values: 'uniform', 'distance', [callable]
        self.weights = 'distance'
        # self.clf = neighbors.KNeighborsClassifier(self.n_neighbors, algorithm='ball_tree', weights=self.weights)
        self.knn = KNN()
        #self.gender_model= os.path.join(self.pkg_dir,'pre_trained_gn','gender_detection1.model')
        #self.gender = Gender(model=self.gender_model)

    def initialize(self):
        """
        Call load_model method and get input/output tensors
        :return: True, if everything goes well
        """
        self.imgs_ph, self.phase_train_ph, self.embs_ph, self.emb_size_ph = self.load_model(
            self.frozen_graph_path)
        return True

    def load_model(self, model, input_map=None):
        """
        Load a (frozen) Tensorflow model into memory.
        :param model: Could be either a directory containing the meta_file and ckpt_file or a model protobuf (.pb) file
        :param input_map: The input map
        :return: The place holders for input dataset, phase train, embeddings, and the embedding size
        """
        with self.graph.as_default():
            # Check if the model is a model directory (containing a metagraph and a checkpoint file)
            #  or if it is a protobuf file with a frozen graph
            model_exp = os.path.expanduser(model)
            if os.path.isfile(model_exp):
                print('Model filename: %s' % model_exp)
                with gfile.FastGFile(model_exp, 'rb') as f:
                    graph_def = tf.GraphDef()
                    graph_def.ParseFromString(f.read())
                    tf.import_graph_def(graph_def,
                                        input_map=input_map,
                                        name='')
            else:
                print('Model directory: %s' % model_exp)
                meta_file, ckpt_file = self.get_model_filenames(model_exp)

                print('Metagraph file: %s' % meta_file)
                print('Checkpoint file: %s' % ckpt_file)

                saver = tf.train.import_meta_graph(
                    os.path.join(model_exp, meta_file), input_map=input_map)
                saver.restore(self.session, os.path.join(model_exp, ckpt_file))

            # Get input and output tensors
            imgs_ph = self.graph.get_tensor_by_name("input:0")
            embs_ph = self.graph.get_tensor_by_name("embeddings:0")
            phase_train_ph = self.graph.get_tensor_by_name("phase_train:0")
            # The embedding size is the second dimension of the embeddings tensor
            emb_size = embs_ph.get_shape()[1]

        return imgs_ph, phase_train_ph, embs_ph, emb_size

    @staticmethod
    def get_model_filenames(model_dir):
        """
        Get the model file names.
        :param model_dir: The directory in which the saved checkpoints of the model exists.
        :return: The meta file name and the checkpoint file name (an empty string if no checkpoint is found)
        :raises ValueError: If the directory contains no meta file or more than one meta file
        """
        files = os.listdir(model_dir)
        meta_files = [s for s in files if s.endswith('.meta')]
        if not meta_files:
            raise ValueError('No meta file found in the model directory (%s)' %
                             model_dir)
        if len(meta_files) > 1:
            raise ValueError(
                'There should not be more than one meta file in the model directory ({})'
                .format(model_dir))
        meta_file = meta_files[0]

        # Prefer the checkpoint recorded in the checkpoint state, when there is one
        ckpt = tf.train.get_checkpoint_state(model_dir)
        if ckpt and ckpt.model_checkpoint_path:
            return meta_file, os.path.basename(ckpt.model_checkpoint_path)

        # Otherwise pick the 'model-*.ckpt-<step>' file with the highest step number
        ckpt_file = ''
        max_step = -1
        for f in files:
            # The dot is escaped so that only a literal '.ckpt-' is accepted
            step_str = re.match(r'(^model-[\w\- ]+\.ckpt-(\d+))', f)
            if step_str is not None and len(step_str.groups()) >= 2:
                step = int(step_str.groups()[1])
                if step > max_step:
                    max_step = step
                    ckpt_file = step_str.groups()[0]
        return meta_file, ckpt_file

    def encode(self, images):
        """
        Run the forward pass to calculate embeddings. Loads the model on the first call.
        :param images: The input (4D) tensor
        :return: The embedding array produced by the model for the given images
        """
        if not self.initialized:
            self.initialized = self.initialize()
        feed_dict = {self.imgs_ph: images, self.phase_train_ph: False}
        emb_array = self.session.run(self.embs_ph, feed_dict=feed_dict)
        return emb_array

    @staticmethod
    def _collect_faces(warehouse):
        """
        Gather the embeddings and UIDs of every face returned by warehouse.get_faces().
        :param warehouse: An object whose get_faces() yields faces exposing 'embedding' and 'uid'
        :return: (emb_array, uid_array); emb_array has one row per embedding.
                 Both are empty 1-D arrays when the warehouse has no faces.
        """
        embeddings = []
        uids = []
        for face in warehouse.get_faces():
            # atleast_2d makes a 1-D embedding vector stack exactly like a (1, N) row;
            # without it every 1-D embedding would replace the previous one instead of being stacked
            embeddings.append(np.atleast_2d(face.embedding))
            uids.append(face.uid)
        if not embeddings:
            return np.array([]), np.array([])
        # Stack once instead of re-allocating the arrays for every face. Appending onto an
        # empty array keeps the dtype promotion of the former per-face np.append calls.
        return np.vstack(embeddings), np.append(np.array([]), uids)

    def knn_fit(self, warehouse):
        """
        Fit the KNN classifier using the training data set
        :param warehouse: The warehouse whose faces (embedding + uid) form the training set
        :return: None
        """
        emb_array, uid_array = self._collect_faces(warehouse)
        self.knn.fit(emb_array, uid_array)

    def knn_classify(self, query):
        """
        Supervised KNN
        :param query: the subject embedding
        :return: the UID of the subject
        """
        # The query is wrapped in a list before it is handed to the classifier
        uid = self.knn.classify([query])
        # print('proba[index]', proba[index])
        # print('detect_uid', uid)
        return uid

    def knn_eval(self, warehouse):
        """
        Evaluate the KNN classifier on a test data set
        :param warehouse: The warehouse whose faces (embedding + uid) form the test set
        :return: the accuracy
        """
        emb_array, uid_array = self._collect_faces(warehouse)
        accuracy = self.knn.evaluate(emb_array, uid_array)
        return accuracy
예제 #5
0
def k_fold_cross_validation(training_data, training_labels, n_folds=3,
                            k_values=range(1, 6), n_classes=10):
    """ k_fold_cross_validation function

    Performs n-fold cross validation (3-fold by default) on the training
    data to determine the best k-value for k-NN. By default the values
    1 through 5 are tested. For every k the predictions of all folds are
    pooled into one accuracy and one confusion matrix.

    Args
    ----
    training_data : np.array
        training data
    training_labels : np.array
        Associated training labels
    n_folds : integer
        number of folds the data is split into (at least 2)
    k_values : iterable of integers
        candidate values of k, tested in the given order
    n_classes : integer
        number of classes; labels and guesses are converted with int()
        and used as indices into the confusion matrix

    Returns
    -------
    Tuple (integer, list of lists)
        the k with the highest accuracy (the first one tested wins a tie)
        and its confusion matrix, indexed as
        confusion_matrix[true_label][guess]. (-1, None) if k_values is
        empty.

    Raises
    ------
    ValueError
        if n_folds is smaller than 2
    """
    if n_folds < 2:
        raise ValueError("n_folds must be at least 2")

    data = np.array_split(training_data, n_folds)
    labels = np.array_split(np.array(training_labels), n_folds)
    best_accuracy = -1.0
    best_k = -1
    best_confusion_matrix = None

    for k in k_values:
        right = 0
        wrong = 0
        confusion_matrix = [[0] * n_classes for _ in range(n_classes)]
        for n in range(n_folds):
            # fold n is held out for testing, all other folds (in their
            # original order) form the training set
            test_data = data[n]
            test_label = labels[n]
            train_data = np.concatenate(
                [data[j] for j in range(n_folds) if j != n])
            train_labels = np.concatenate(
                [labels[j] for j in range(n_folds) if j != n])

            # train classifier
            knn = KNN(k, train_data, train_labels)

            # test classifier
            for sample, true_label in zip(test_data, test_label):
                guess = knn.classify(sample)
                confusion_matrix[int(true_label)][int(guess)] += 1
                if guess == true_label:
                    right += 1
                else:
                    wrong += 1

        # determine accuracy; an empty data set counts as accuracy 0.0
        # instead of dividing by zero
        total = right + wrong
        accuracy = float(right) / total if total else 0.0
        if accuracy > best_accuracy:
            best_accuracy = accuracy
            best_k = k
            best_confusion_matrix = confusion_matrix
        print("Accuracy for k=" + str(k) + ": " + str(accuracy))

    return best_k, best_confusion_matrix
예제 #6
0
파일: part1.py 프로젝트: massinat/ML
"""
Classification related to part 1.
KNN classification with K=1 and euclidean distance. Votes are not distance weighted.

@Author: Massimiliano Natale
"""

from knn import KNN
from resultHelper import ResultHelper
"""
Trigger the classification.
Create the output file and the chart to visualize the result.
"""
if __name__ == "__main__":
    # The classifier is constructed from the training and the test CSV paths.
    knn = KNN("data/classification/trainingData.csv",
              "data/classification/testData.csv")

    def classifyRow(x):
        """
        Classify one row with every element but its last one, against the
        training data without its last column, passing 1 as third argument.
        """
        # presumably the last column holds the label and 1 is K -- verify against knn.KNN
        features = x[:-1]
        trainingFeatures = knn._trainingData[:, :-1]
        return knn.classify(features, trainingFeatures, 1)

    classificationData = knn.buildClassificationData(classifyRow)

    # Write the result to the output file, then draw the chart
    resultHelper = ResultHelper("part1.output.txt")
    resultHelper.write(classificationData)
    resultHelper.draw("KNN classification [not-weighted-distance] with K=1")
예제 #7
0
파일: PP1.py 프로젝트: Azizou/Python-ML
def main():
    """Prepare the spambase splits, then tune, save, reload and test the KNN classifier.

    Steps, in order:
      1. Read '../input_data/spambase.data' and the column names from
         '../input_data/spambase.names', then remap the 'spam' column with
         2 * spam - 1 (so 0/1 labels become -1/+1).
      2. Shuffle the row indices with a fixed seed, split them into 20% test,
         10% tune and the remainder train, and pickle the three frames under
         '../proc_data/'.
      3. Build a KNN classifier from the training split, tune it on the
         tuning split for the odd values of K from 1 to 41 and dump it to disk.
      4. Reload the dumped classifier, print its error rate on the test split
         and run every tuned K on the test split.

    The print calls are parenthesised single-argument calls, so they behave
    the same under Python 2 and Python 3.
    """

    #############################################
    # Set up the data as per the first Practicum
    #############################################

    spam_values = np.genfromtxt('../input_data/spambase.data', delimiter=',')
    # The context manager closes the names file even if reading it fails
    with open('../input_data/spambase.names', 'r') as fl:
        lines = [line.strip() for line in fl]  # J : strip from beginning and ending whitespace

    # Keep the text before ':' of every line that is non-empty and does not start with '|' or '1'
    colnames = [
        line.partition(':')[0] for line in lines
        if not (len(line) == 0 or line[0] == '|' or line[0] == '1')
    ]
    colnames.append('spam')

    spam_df = pd.DataFrame(spam_values, columns=colnames)
    spam_df['spam'] = 2 * spam_df['spam'] - 1

    # J: DataFrame.shape is a (rows, columns) tuple; the first entry is the number of samples in the DataFrame
    nsamples = spam_df.shape[0]
    # np.floor returns a float; slice bounds must be integers, hence the int() conversion
    ntest = int(np.floor(.2 * nsamples))
    ntune = int(np.floor(.1 * nsamples))

    # we want to make this reproducible so we seed the random number generator
    np.random.seed(1)
    all_indices = np.arange(nsamples)
    # J: important to shuffle so that you don't know which portion is training, which is testing and which is tuning data
    np.random.shuffle(all_indices)
    test_indices = all_indices[:ntest]  # J: Get shuffled test indices first
    tune_indices = all_indices[ntest:(ntest + ntune)]  # J: tune indices second
    train_indices = all_indices[(ntest + ntune):]  # J: train indices (the majority) last

    # J : now that the "*indices" arrays have been shuffled, you can actually draw the relevant data through
    # DataFrame.loc. The second argument includes all columns, labels included.
    # (.loc replaces the removed .ix; the frame has the default integer index, so the labels are the row numbers.)
    spam_train = spam_df.loc[train_indices, :]
    spam_tune = spam_df.loc[tune_indices, :]
    spam_test = spam_df.loc[test_indices, :]

    # DataFrame.to_pickle replaces the removed pd.save
    spam_train.to_pickle('../proc_data/training_data/spam_train.pdat')
    spam_tune.to_pickle('../proc_data/training_data/spam_tune.pdat')
    spam_test.to_pickle('../proc_data/testing_data/spam_test.pdat')

    #######################################################################
    # See how features are sorted according to their Information Gain score
    #######################################################################

    # atestTree = DecisionTree(spam_train, 5, True)
    # print atestTree.__sortFeatures__(spam_train, spam_train.columns)

    ###############################################
    #  Training classifiers and saving them on disk
    ###############################################

    # Already trained those two, it took about 4 hours total.

    #    majVoteTree = DecTree.DecisionTree(spam_train, 5, False)
    #    print "Tuning a majority vote classifier on all depths between 1 and 15 inclusive."
    #    majVoteTree.tune(spam_tune,1, 15)
    #    print "Saving this classifier to disk."
    #    majVoteTree.dump("../proc_data/dtreeWithMajVote_1_to_15.pyobj")
    #
    #    IGTree = DecTree.DecisionTree(spam_train, 5, True)
    #    print "Tuning an information gain classifier on all depths between 1 and 15 inclusive."
    #    IGTree.tune(spam_tune,1, 15)
    #    print "Saving this classifier to disk."
    #    IGTree.dump("../proc_data/dtreeWithIG_1_to_15.pyobj")

    HectorsKNN = KNN(spam_train, spam_train['spam'], 5)
    print("Tuning Hector's KNN classifier for all values of K between 1 and 41 inclusive:")
    HectorsKNN.tune(spam_tune, spam_tune['spam'], k=range(1, 42, 2))
    print("Saving this classifier to disk.")
    HectorsKNN.dump("../proc_data/HectorsKNN_1_to_41.pyobj")

    ###########################################
    # Playing with stored classifiers
    ###########################################

    # Part 1: A decision tree classifier trained with Majority Vote, depths 1 to 10

    #    print "Loading a decision tree trained with Majority Vote for depths 1 to 10..."
    #    majVoteTree = load("../proc_data/dtreeWithMajVote_1_to_15.pyobj")
    #    print "According to the tuning set, the optimal depth for this tree is: " + str(majVoteTree.depth)
    #    classifications = majVoteTree.classify(spam_test)
    #    testErrorRate = np.mean ( (spam_test['spam'].values * classifications) < 0)
    #    print 'For this depth, the error on the test set was %0.3f' % testErrorRate
    #    print "We will now test all different hyper-parameters found during tuning on the test data:"
    #    majVoteTree.classifyWithAllDepths(spam_test)
    #    print "\n===========================================================\n"
    #
    #    # Part 2: A decision tree classifier trained with Information Gain, depths 1 to 10
    #
    #    print "Loading a decision tree trained with Information Gain for depths 1 to 10..."
    #    IGTree = load("../proc_data/dtreeWithIG_1_to_15.pyobj")
    #    print "According to the tuning set, the optimal depth for this tree is: " + str(IGTree.depth)
    #    classifications = IGTree.classify(spam_test)
    #    testErrorRate = np.mean ( (spam_test['spam'].values * classifications) < 0)
    #    print 'For this depth, the error on the test set was %0.3f' % testErrorRate
    #    print "We will now test all different hyper-parameters found during tuning on the test data:"
    #    IGTree.classifyWithAllDepths(spam_test)

    # Part 3: Hector's KNN-classifier

    print("Reloading Hector's classifier from disk:")
    HectorsKNN = load("../proc_data/HectorsKNN_1_to_41.pyobj")
    print("According to the tuning set, the optimal K for this classifier is: " + str(HectorsKNN.k) + ".")
    classifications = HectorsKNN.classify(spam_test)
    # A sample counts as an error when label * prediction is negative, i.e. their signs differ
    testErrorRate = np.mean((spam_test['spam'].values * classifications) < 0)
    print('For this value of K, the error on the test set was %0.3f' % testErrorRate)
    print("We will now test all different hyper-parameters found during tuning on the test data:")
    HectorsKNN.classifyWithAllK(spam_test)

    # Part 4: Weighted Features KNN

    print("Exiting...")
예제 #8
0
def _write_bayes_results(gb, datasets):
    """Classify each dataset with a trained Bayes classifier and report the accuracy.

    For every (title, filename, data) triple in ``datasets`` the columns
    between the first and the last one are classified, one
    'index,predicted,actual' line per row is written to the output file
    opened with open_output_file(filename), and the accuracy is both
    written to that file and printed.
    """
    for title, filename, data in datasets:
        with open_output_file(filename) as f:
            print("Running on %sing data" % title)
            f.write("#index,predicted_class,actual_class\n")
            categories = gb.classify(data.iloc[:, 1:-1])
            total = 0
            correct = 0
            for idx, predicted_category in enumerate(categories):
                actual = data.iloc[idx, -1]
                if predicted_category == actual:
                    correct += 1
                total += 1
                f.write("%d,%d,%d\n" %
                        (data.iloc[idx, 0], predicted_category, actual))
            accstr = "Accuracy on %s data: %f" % (title,
                                                  float(correct) / total)
            f.write(accstr + "\n")
            print(accstr)


def main():  # TODO: test with user input, confirm input with TAs
    """Run the KNN and Gaussian naive Bayes experiments on the glass data.

    The training and testing files default to 'data/train.txt' and
    'data/test.txt'; both can be overridden together by passing two
    command line arguments. The function prints data statistics, runs KNN
    for k in (1, 3, 5, 7) with the L1 and L2 norms on the test set and on
    the training set (the latter with the leave-one-out flag set), runs
    the Gaussian Bayes classifier with and without class-dependent sigma,
    and finally shows the class histogram.

    Raises SystemExit when exactly one command line argument is given,
    because the two files can only be overridden together.
    """
    logging.basicConfig(stream=sys.stdout,
                        level=logging.DEBUG if DEBUG else logging.INFO)
    trainingData = "data/train.txt"
    testingData = "data/test.txt"
    if len(sys.argv) == 2:
        # A single argument used to crash with an IndexError on sys.argv[2]
        raise SystemExit("Usage: %s [TRAINING_FILE TESTING_FILE]" %
                         sys.argv[0])
    if len(sys.argv) > 2:
        trainingData = sys.argv[1]
        testingData = sys.argv[2]
        # Both values must be passed to % as one tuple
        print("Taking %s as training data, and %s as testing data" %
              (trainingData, testingData))
    col_names = [
        "index", "ri", "na", "mg", "al", "si", "k", "ca", "ba", "fe", "type"
    ]
    train_df = pd.read_csv(trainingData, names=col_names)
    test_df = pd.read_csv(testingData, names=col_names)

    # Data stats
    print("Data characteristics:")
    print("No. of attributes: ", len(train_df.iloc[0]))
    # The first column (index) and the last column (type) are not features
    print("No. of features usable for classifcation: ",
          len(train_df.iloc[0]) - 2)
    print("Size of training data", len(train_df))
    print("Size of testing data", len(test_df))
    print("No. of unique classes: ", 7)
    print("Unique classes represented in training data: ",
          train_df['type'].unique())
    print("\t(Histogram of classes in figure 1)")
    # plot class histogram
    train_df.hist('type', alpha=.5, bins=7)
    plt.title("Figure 1: Class (glass type) Histogram")
    # /Data stats

    print(
        "\n******************************** Running KNN classifer ********************************"
    )
    # Run KNN for k = 1, 3, 5, 7 and L1 & L2 norms on training (leave one out) and test sets
    for k in (1, 3, 5, 7):
        for order in (1, 2):  # order of the norm
            print("Running KNN of order %d with L-%d norm" % (k, order))
            knn = KNN(train_df.iloc[:, -1],
                      train_df.iloc[:, 1:-1],
                      k,
                      distance=lambda a, b: np.linalg.norm(a - b, ord=order),
                      normalize_data=True)
            runs = (
                ("TEST", "knn_%d_l%d_test" % (k, order), test_df, False),
                ("TRAIN", "knn_%d_l%d_train" % (k, order), train_df, True),
            )
            for title, filename, data, leave_one_out in runs:
                with open_output_file(filename) as f:
                    f.write("#index,predicted_class,actual_class\n")
                    total = 0
                    correct = 0
                    for row in data.values:
                        predicted = knn.classify(row[1:-1], leave_one_out)
                        actual = row[-1]
                        f.write("%d,%d,%d\n" % (row[0], predicted, actual))
                        total += 1
                        if actual == predicted:
                            correct += 1
                    accstr = "Accuracy on %s data: %f" % (
                        title, float(correct) / total)
                    f.write(accstr + "\n")
                    print(accstr)

    print(
        "\n******************************** Running gaussian naive baye's classifer ********************************"
    )
    gb = GaussianBayesClassifier(sigma_depends_on_class=True, verbose=DEBUG)
    print("Training classifier...")
    gb.train(train_df.iloc[:, 1:])
    print("Training complete")
    # print("Params:")
    _write_bayes_results(gb, (("test", "bayes_test.txt", test_df),
                              ("train", "bayes_train", train_df)))

    print(
        "\n******************************** Running gaussian naive baye's classifer (with sigma independent of class) ********************************"
    )
    gb = GaussianBayesClassifier(sigma_depends_on_class=False, verbose=DEBUG)
    print("Training classifier...")
    gb.train(train_df.iloc[:, 1:])
    print("Training complete")
    # print("Params:")
    _write_bayes_results(
        gb, (("test", "bayes_test_sigmaindependent.txt", test_df),
             ("train", "bayes_train_sigmaindependent", train_df)))

    print(
        "\n******************************** Showing class histogram ********************************"
    )
    plt.show()
예제 #9
0
def main():
    """Prepare the spambase splits, then tune, save, reload and test the KNN classifier.

    Steps, in order:
      1. Read '../input_data/spambase.data' and the column names from
         '../input_data/spambase.names', then remap the 'spam' column with
         2 * spam - 1 (so 0/1 labels become -1/+1).
      2. Shuffle the row indices with a fixed seed, split them into 20% test,
         10% tune and the remainder train, and pickle the three frames under
         '../proc_data/'.
      3. Build a KNN classifier from the training split, tune it on the
         tuning split for the odd values of K from 1 to 41 and dump it to disk.
      4. Reload the dumped classifier, print its error rate on the test split
         and run every tuned K on the test split.

    The print calls are parenthesised single-argument calls, so they behave
    the same under Python 2 and Python 3.
    """

    #############################################
    # Set up the data as per the first Practicum
    #############################################

    spam_values = np.genfromtxt('../input_data/spambase.data', delimiter=',')
    # The context manager closes the names file even if reading it fails
    with open('../input_data/spambase.names', 'r') as fl:
        lines = [line.strip()
                 for line in fl]  # J : strip from beginning and ending whitespace

    # Keep the text before ':' of every line that is non-empty and does not start with '|' or '1'
    colnames = [
        line.partition(':')[0] for line in lines
        if not (len(line) == 0 or line[0] == '|' or line[0] == '1')
    ]
    colnames.append('spam')

    spam_df = pd.DataFrame(spam_values, columns=colnames)
    spam_df['spam'] = 2 * spam_df['spam'] - 1

    # J: DataFrame.shape is a (rows, columns) tuple; the first entry is the number of samples in the DataFrame
    nsamples = spam_df.shape[0]
    # np.floor returns a float; slice bounds must be integers, hence the int() conversion
    ntest = int(np.floor(.2 * nsamples))
    ntune = int(np.floor(.1 * nsamples))

    # we want to make this reproducible so we seed the random number generator
    np.random.seed(1)
    all_indices = np.arange(nsamples)
    # J: important to shuffle so that you don't know which portion is training, which is testing and which is tuning data
    np.random.shuffle(all_indices)
    test_indices = all_indices[:ntest]  # J: Get shuffled test indices first
    tune_indices = all_indices[ntest:(ntest + ntune)]  # J: tune indices second
    train_indices = all_indices[(
        ntest + ntune):]  # J: train indices (the majority) last

    # J : now that the "*indices" arrays have been shuffled, you can actually draw the relevant data through
    # DataFrame.loc. The second argument includes all columns, labels included.
    # (.loc replaces the removed .ix; the frame has the default integer index, so the labels are the row numbers.)
    spam_train = spam_df.loc[train_indices, :]
    spam_tune = spam_df.loc[tune_indices, :]
    spam_test = spam_df.loc[test_indices, :]

    # DataFrame.to_pickle replaces the removed pd.save
    spam_train.to_pickle('../proc_data/training_data/spam_train.pdat')
    spam_tune.to_pickle('../proc_data/training_data/spam_tune.pdat')
    spam_test.to_pickle('../proc_data/testing_data/spam_test.pdat')

    #######################################################################
    # See how features are sorted according to their Information Gain score
    #######################################################################

    # atestTree = DecisionTree(spam_train, 5, True)
    # print atestTree.__sortFeatures__(spam_train, spam_train.columns)

    ###############################################
    #  Training classifiers and saving them on disk
    ###############################################

    # Already trained those two, it took about 4 hours total.

    #    majVoteTree = DecTree.DecisionTree(spam_train, 5, False)
    #    print "Tuning a majority vote classifier on all depths between 1 and 15 inclusive."
    #    majVoteTree.tune(spam_tune,1, 15)
    #    print "Saving this classifier to disk."
    #    majVoteTree.dump("../proc_data/dtreeWithMajVote_1_to_15.pyobj")
    #
    #    IGTree = DecTree.DecisionTree(spam_train, 5, True)
    #    print "Tuning an information gain classifier on all depths between 1 and 15 inclusive."
    #    IGTree.tune(spam_tune,1, 15)
    #    print "Saving this classifier to disk."
    #    IGTree.dump("../proc_data/dtreeWithIG_1_to_15.pyobj")

    HectorsKNN = KNN(spam_train, spam_train['spam'], 5)
    print("Tuning Hector's KNN classifier for all values of K between 1 and 41 inclusive:")
    HectorsKNN.tune(spam_tune, spam_tune['spam'], k=range(1, 42, 2))
    print("Saving this classifier to disk.")
    HectorsKNN.dump("../proc_data/HectorsKNN_1_to_41.pyobj")

    ###########################################
    # Playing with stored classifiers
    ###########################################

    # Part 1: A decision tree classifier trained with Majority Vote, depths 1 to 10

    #    print "Loading a decision tree trained with Majority Vote for depths 1 to 10..."
    #    majVoteTree = load("../proc_data/dtreeWithMajVote_1_to_15.pyobj")
    #    print "According to the tuning set, the optimal depth for this tree is: " + str(majVoteTree.depth)
    #    classifications = majVoteTree.classify(spam_test)
    #    testErrorRate = np.mean ( (spam_test['spam'].values * classifications) < 0)
    #    print 'For this depth, the error on the test set was %0.3f' % testErrorRate
    #    print "We will now test all different hyper-parameters found during tuning on the test data:"
    #    majVoteTree.classifyWithAllDepths(spam_test)
    #    print "\n===========================================================\n"
    #
    #    # Part 2: A decision tree classifier trained with Information Gain, depths 1 to 10
    #
    #    print "Loading a decision tree trained with Information Gain for depths 1 to 10..."
    #    IGTree = load("../proc_data/dtreeWithIG_1_to_15.pyobj")
    #    print "According to the tuning set, the optimal depth for this tree is: " + str(IGTree.depth)
    #    classifications = IGTree.classify(spam_test)
    #    testErrorRate = np.mean ( (spam_test['spam'].values * classifications) < 0)
    #    print 'For this depth, the error on the test set was %0.3f' % testErrorRate
    #    print "We will now test all different hyper-parameters found during tuning on the test data:"
    #    IGTree.classifyWithAllDepths(spam_test)

    # Part 3: Hector's KNN-classifier

    print("Reloading Hector's classifier from disk:")
    HectorsKNN = load("../proc_data/HectorsKNN_1_to_41.pyobj")
    print("According to the tuning set, the optimal K for this classifier is: " + str(
        HectorsKNN.k) + ".")
    classifications = HectorsKNN.classify(spam_test)
    # A sample counts as an error when label * prediction is negative, i.e. their signs differ
    testErrorRate = np.mean((spam_test['spam'].values * classifications) < 0)
    print('For this value of K, the error on the test set was %0.3f' % testErrorRate)
    print("We will now test all different hyper-parameters found during tuning on the test data:")
    HectorsKNN.classifyWithAllK(spam_test)

    # Part 4: Weighted Features KNN

    print("Exiting...")
예제 #10
0
class KNNTestCase(unittest.TestCase):
    """
    Test cases for the KNN implementation
    """
    def setUp(self):
        """
        Build a fresh classifier (k=5) and the train/test fixtures before every test.
        The training set holds the classes 0 and 1; the test set additionally holds class 2.
        :return: None
        """
        self.knn = KNN(k=5)
        self.train_data = np.array([[5.1, 3.5, 1.4, 0.2], [4.9, 3.0, 1.4, 0.2],
                                    [4.7, 3.2, 1.3, 0.2], [4.6, 3.1, 1.5, 0.2],
                                    [7.0, 3.2, 4.7, 1.4], [6.4, 3.2, 4.5, 1.5],
                                    [6.9, 3.1, 4.9, 1.5], [5.5, 2.3, 4.0, 1.3]])
        self.train_label = np.array([0, 0, 0, 0, 1, 1, 1, 1])
        self.test_data = np.array([[5.0, 3.6, 1.4, 0.2], [5.4, 3.9, 1.7, 0.4],
                                   [6.5, 2.8, 4.6, 1.5], [5.7, 2.8, 4.5, 1.3],
                                   [6.3, 3.3, 6.0, 2.5], [5.8, 2.7, 5.1, 1.9]])
        self.test_label = np.array([0, 0, 1, 1, 2, 2])

    def test_fit(self):
        """
        The return value of the function should be equal to the number of the classes in the data set
        :return: None
        """
        # The test set is fitted here because it is the one holding three classes
        num_classes = self.knn.fit(self.test_data, self.test_label)
        self.assertEqual(num_classes, 3)

    def test_compute_distance(self):
        """
        The distance between the two input vectors should be a floating point value inside the zero to one interval
        :return: None
        """
        sample0 = self.train_data[0]
        sample1 = self.train_data[1]
        distance = self.knn.compute_distance(sample0, sample1)
        self.assertGreaterEqual(distance, 0.0)
        self.assertLessEqual(distance, 1.0)

    def test_get_neighbours(self):
        """
        The returned neighbours should be a list of tuples, each of which contains the label and the distance
        :return: None
        """
        self.knn.fit(self.train_data, self.train_label)
        query = self.test_data[0]
        neighbours = self.knn.get_neighbors(query)
        self.assertEqual(len(neighbours), self.knn.k)
        self.assertEqual(len(neighbours[0]), 2)

    def test_classify(self):
        """
        The classified label of the normal sample should correspond to its ground truth label
        and for the anomaly sample that does not belong to any of the training classes it should be equal to -1
        :return: None
        """
        self.knn.fit(self.train_data, self.train_label)
        normal_data = self.test_data[0]
        normal_label = self.test_label[0]
        # The last test sample has label 2, which is absent from the training labels
        anomaly_data = self.test_data[-1]
        normal_pred = self.knn.classify(normal_data)
        anomaly_pred = self.knn.classify(anomaly_data)
        self.assertEqual(normal_pred, normal_label)
        self.assertEqual(anomaly_pred, -1)

    def test_evaluate(self):
        """
        The returned accuracy should be equal to 1.0 in the case where the training and test set are the same
        and in the case where the training set and test set are different it should be in the interval zero to one
        :return: None
        """
        self.knn.fit(self.train_data, self.train_label)
        accuracy_perfect = self.knn.evaluate(self.train_data, self.train_label)
        accuracy_imperfect = self.knn.evaluate(self.test_data, self.test_label)
        self.assertEqual(accuracy_perfect, 1.0)
        # Both bounds are exclusive
        self.assertGreater(accuracy_imperfect, 0.0)
        self.assertLess(accuracy_imperfect, 1.0)