Example #1
 def __init__(self, 
     #Set an error value 
     error: float,
     #Set the k number of neighbors
     k: int,
     #Pass in the data type 
     data_type: str,
     #Set the categorical features 
     categorical_features: list,
     #True if the data set is a regression data set
     regression_data_set: bool,
     #Set the alpha float 
     alpha:float,
     #Set the beta float 
     beta:float,
     #set the width value 
     h:float,
     #Set the dimensionality 
     d:int):
     # initialize a knn object
     self.knn = kNN.kNN(k, data_type, categorical_features, regression_data_set, alpha, beta, h, d)
     self.nn = kNN.kNN(1, data_type, categorical_features, regression_data_set, alpha, beta, h, d)
      # error threshold for regression classification
     self.error = error
     # store if this data set is a regression data set (True) or not (False)
     self.regression_data_set = regression_data_set
     self.results = Results.Results()
Example #2
 def __init__(
         self,
         # number of neighbors in knn
         kNeighbors: int,
         # number of clusters
         kValue: int,
         # data to cluster
         dataSet: np.ndarray,
         # 'mixed', 'categorical', or 'real' data set
         data_type: str,
         # list of integers representing categorical feature column indices
         categorical_features: list,
         # True if the data set is a regression data set
         regression_data_set: bool,
         # weight for real value in distance metric
         alpha: int,
         # weight for categorical value in distance metric
         beta: int,
         # bin width for gaussian kernel smoother
         h: float,
         # dimensionality of data set (# features)
         d: int,
         name: str,
         Testdata: np.ndarray):
      # create a nearest neighbor object that finds the single nearest neighbor to an input data point
     self.nn = kNN.kNN(1, data_type, [], regression_data_set, alpha, beta,
                       h, d)
     self.knn = kNN.kNN(kNeighbors, data_type, [], regression_data_set,
                        alpha, beta, h, d)
     self.categorical_features = []
     self.itermax = 5
     self.kValue = kValue
     #Convert the data such that no categorical values are in the data set
     for j in range(len(Testdata)):
         Testdata[j] = self.ConvertData(Testdata[j], name)
     #Set the test set value
     self.Testdata = Testdata
     #Convert the data such that no categorical values are in the data set
     for j in range(len(dataSet)):
         dataSet[j] = self.ConvertData(dataSet[j], name)
     #Set the data set value
     self.dataSet = dataSet
     #Set the dimensionality
     self.d = d
     #If the data set is machine
     if name == "machine":
         self.dataSet = self.dataSet[:, 2:]
         self.Testdata = self.Testdata[:, 2:]
         self.d = d - 2
     # dimensionality of data set
     # save which features are real as well by deleting categorical indices from a new list
     real_features = list(range(d))
     #Remove all of the categorical values
     for i in categorical_features:
         real_features.remove(i)
      #Set the real features object variable
     self.real_features = real_features
Example #3
def addToGraph(L1_concept, depth, language='en'):
    lang_set = {"en", "sp"}
    L2_name = min(lang_set - {language})  # get the L2 name

    L1_vertex = g.get_vertex(L1_concept, all_vertices)
    if not isinstance(L1_vertex, Vertex):
        sys.exit("Fatal Error: instance is not a vertex")

    # Translate L1 concept to get L2 concept and add to L2 evoking list
    if language == 'en':
        L2_concept = (en2sp(L1_concept[0]), L1_concept[1])
        updateConcepts(L2_concept, evoked_sp, evoking_sp)
    else:
        L2_concept = (sp2en(L1_concept[0]), L1_concept[1])
        updateConcepts(L2_concept, evoked_en, evoking_en)

    # BASE CASE (no more levels to be added)
    if depth == 0:
        if language == 'en':
            L2_concept = (en2sp(L1_concept[0]), L1_concept[1])
        else:
            L2_concept = (sp2en(L1_concept[0]), L1_concept[1])
        addTranslation(L2_concept, L1_vertex, L2_name)
        return
    else:
        if language == 'en':
            # List of tuples of (name, distance_to_parent_concept)
            L1_evocations = kNN(L1_concept[0], evoked_en, evoking_en, k)
            L2_evocations = kNN(L2_concept[0], evoked_sp, evoking_sp, k)
        else:
            L1_evocations = kNN(L1_concept[0], evoked_sp, evoking_sp, k)
            L2_evocations = kNN(L2_concept[0], evoked_en, evoking_en, k)

        # Add L1 and L2 list of most similar words to graph
        addEvocations(L1_vertex, L1_evocations, language)
        L2_vertex = addTranslation(L2_concept, L1_vertex, L2_name)
        addEvocations(L2_vertex, L2_evocations, L2_name)

        for evocation in L1_evocations:
            # Add evocations of given concept to concept_set
            if language == 'en':
                updateConcepts(evocation, evoked_en, evoking_en)
                addToGraph(evocation, depth-1, 'en')
            else: 
                updateConcepts(evocation, evoked_sp, evoking_sp)
                addToGraph(evocation, depth-1, 'sp')

        for evocation in L2_evocations:
            if language == 'en':
                updateConcepts(evocation, evoked_sp, evoking_sp)
                addToGraph(evocation, depth-1, 'sp')
            else:
                updateConcepts(evocation, evoked_en, evoking_en)
                addToGraph(evocation, depth-1, 'en')
Example #4
def handWritingTest():
    # define the label list
    labels = []
    # list the files in the training directory
    fileList = os.listdir('trainingDigits')
    size = len(fileList)
    hwMat = np.zeros((size, 1024))
    for i in range(size):
        # parse the label from the file name
        fileName = fileList[i]
        labels.append(fileName.split('_')[0])
        hwMat[i, :] = img2vector('trainingDigits/%s' % fileName)
    testFileList = os.listdir('testDigits')
    m = len(testFileList)
    errorCount = 0.0
    for i in range(m):
        fileName = testFileList[i]
        label = fileName.split('_')[0]
        inputData = img2vector('testDigits/%s' % fileName)
        result = knn.kNN(inputData, hwMat, labels, 3)
        if label == result:
            print("its Ok")
        else:
            errorCount += 1
            print("keep Going")
    print(errorCount / m)
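
The img2vector helper used above is assumed but not shown. A minimal sketch of the conventional implementation, assuming each digit file is a 32x32 grid of '0'/'1' characters (as in the classic handwritten-digit text data set):

import numpy as np

def img2vector(filename):
    # flatten a 32x32 text image of '0'/'1' characters into a 1x1024 vector
    vec = np.zeros((1, 1024))
    with open(filename) as fh:
        for i, line in enumerate(fh.readlines()[:32]):
            for j in range(32):
                vec[0, 32 * i + j] = int(line[j])
    return vec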
Example #5
def main():
    
    ckNN = kNN( enableLoggingTime=True )
    #load data
    ckNN.loadData( fileName = '../data/creditcard.csv', feaRowEnd = 284808)
    # feature scaling
    ckNN.scaleFeature( minm=0, maxm=1 )
    #Feature reduction (loading previously saved data)
    feaSelecData = ckNN.loadVariables( 'featureExtractAll' )
    ckNN.selectImportantFeatures( feaSelecData['selectedIndices'] )
    
    ckNN.kSweep = [1, 3, 5]
    # do double cross
    ValScoreList, \
    ValScoreStdList, \
    TestScoreList, \
    bestParamList, \
    allData = ckNN.doubleCrossValidate(ckNN.featureNumpy, 
                                             ckNN.ClassNumpy,
                                             nFoldOuter=5, nFoldInner=4,
                                             scoring=scoring.MCC,
                                             isStratified = True,
                                             fileName='kNN/kNNData')
    print "Validation Avg. Score for outer folds with best param: \n"
    print ValScoreList
    print "Validation Score Std. for outer folds with best param: \n"
    print ValScoreStdList
    print "Test Avg. Score for outer folds with best param: \n"
    print TestScoreList
    print "Best Param list for outer folds: \n"
    print bestParamList
Example #6
def predict():
    #symbImgs = symbols.getSymbolImages(binaryImages, symbolCenters, 28, 28)
    #Flatten using knn first
    knn = kNN()
    print "knn"
    flat_syms = knn.flatten_symbols(symbImgs)
    print "flat"
    binary_symbs = preprocessor.binarize(np.array(flat_syms))
    print "bin"
    knn.load_MNIST()
    print "loaded"
    m_data = flatten(knn.mnist_data)
    m_labels = knn.mnist_labels
    print "Dat"
    f_symbs = flatten(binary_symbs)
    fix_data_len(f_symbs)
    print "fb"
    LR = linear_model.LogisticRegression()
    inst = LR.fit(m_data[4800:], m_labels[4800:])
    results = []
    for i in range(1, len(f_symbs) + 1):
        if i % 1000 == 0:
            print i
        results.append(inst.predict(f_symbs[i - 1:i]))
    return results
Example #7
def main():
    # ================================================
    # Load pre-trained model and remove higher level layers
    # ================================================
    base_model = VGG19(weights='imagenet')
    model = Model(input=base_model.input,
                  output=base_model.get_layer('block4_pool').output)

    source_path = "db"
    source_paths = np.array(
        list(
            filter(lambda path: os.path.splitext(path)[1] in ['.jpg', '.jpeg'],
                   np.array(os.listdir(source_path)))))

    # ================================================
    # Read images and convert them to feature vectors
    # ================================================

    imgs, filename_heads, X = [], [], []
    for f in [sys.argv[1]]:
        # Process filename
        filename = os.path.splitext(f)  # filename in directory
        head, ext = filename[0], filename[1]
        if ext.lower() not in [".jpg", ".jpeg"]:
            continue

        # Read image file
        img = image.load_img(f, target_size=(224, 224))  # load
        imgs.append(np.array(img))  # image
        filename_heads.append(head)  # filename head

        # Pre-process for model input
        img = image.img_to_array(img)  # convert to array
        img = np.expand_dims(img, axis=0)
        img = preprocess_input(img)
        features = model.predict(img).flatten()  # features
        X.append(features)  # append feature extractor

    X = np.array(X)  # feature vectors
    imgs = np.array(imgs)  # images

    # ===========================
    # Find k-nearest images to each image
    # ===========================
    n_neighbours = 3
    pre_X = np.load('feature_matrix.npy')
    knn = kNN()  # kNN model
    knn.compile(n_neighbors=n_neighbours, algorithm="brute", metric="cosine")
    knn.fit(pre_X)

    n_imgs = len(imgs)
    ypixels, xpixels = imgs[0].shape[0], imgs[0].shape[1]
    for ind_query in range(n_imgs):
        # Find top-k closest image feature vectors to each vector
        distances, indices = knn.predict(np.array([X[ind_query]]))
        distances = distances.flatten()
        indices = indices.flatten()
        indices, distances = find_topk_unique(indices, distances, n_neighbours)
        print(json.dumps(source_paths[indices].flatten().tolist()))
        print(distances)
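
The kNN() wrapper used here (with compile/fit/predict) is project-specific. For reference, a self-contained sketch of an equivalent cosine nearest-neighbor lookup with scikit-learn's NearestNeighbors; the random matrix is only a stand-in for the saved 'feature_matrix.npy':

import numpy as np
from sklearn.neighbors import NearestNeighbors

features = np.random.RandomState(0).rand(100, 512)  # stand-in for the saved VGG19 features
query = features[:1]                                 # stand-in for one query feature vector

nn = NearestNeighbors(n_neighbors=3, algorithm="brute", metric="cosine")
nn.fit(features)
distances, indices = nn.kneighbors(query)
print(indices.flatten().tolist(), distances.flatten().tolist())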
Example #8
def main():
    kernel = c.COSINE
    # training parameter
    result_path = 'results/PB2_spam.acc'
    model_name = 'digits_' + kernel

    tr_data_path = 'data\\digits\\tr_f_l_10.pickle'
    te_data_path = 'data\\digits\\te_f_l_10.pickle'
    # load and preprocess training data
    tr_data = loader.load_pickle_file(tr_data_path)
    te_data = loader.load_pickle_file(te_data_path)

    # transpose label
    tr_data[1] = np.transpose(tr_data[1])[0]
    te_data[1] = np.transpose(te_data[1])[0]

    Preprocess.normalize_features_all(Preprocess.zero_mean_unit_var, tr_data[0])
    Preprocess.normalize_features_all(Preprocess.zero_mean_unit_var, te_data[0])
    # start training

    st = time.time()

    # start training
    print('{:.2f} Start training.'.format(time.time() - st))

    for r in (0.15, 0.1):
        clf = kNN.kNN(kernel=kernel, dataset=c.DS_DIGITS)
        clf.fit(tr_data[0], tr_data[1])
        tr_pred = clf.predict(tr_data[0], r=r)
        te_pred = clf.predict(te_data[0], r=r)

        tr_acc = (tr_data[1] == tr_pred).sum() / tr_data[0].shape[0]
        te_acc = (te_data[1] == te_pred).sum() / te_data[0].shape[0]

        print('{} Final results with kernel {} and r={}. Train acc: {}, Test acc: {}'.format(time.time() - st, kernel, r, tr_acc, te_acc))
Example #9
File: Problem3.py Project: siyuzhan/ML
def partC(k = 5):
    train_titles, test_titles = load_titles()
    knn = kNN.kNN(10, kNN.cosine_similarity)
    knn.train("PS1_data/books.train")
    knn.load_test_file("PS1_data/books.test")
    results = [knn.choose_label(i) for i in knn.test_data]
    return results
Example #10
def partC(k=5):
    train_titles, test_titles = load_titles()
    knn = kNN.kNN(10, kNN.cosine_similarity)
    knn.train("PS1_data/books.train")
    knn.load_test_file("PS1_data/books.test")
    results = [knn.choose_label(i) for i in knn.test_data]
    return results
Example #11
def predict_missing_elements(data, k=5):
    import kNN
    # first, find the columns that have missing values
    isNaN = np.isnan(data)
    colHasNan = np.any(isNaN, axis=0)
    # print(colHasNan)

    # now, do column-wise interpretation

    model = kNN.kNN(k, method='regression')

    for i, col in enumerate(colHasNan):
        if (col):
            # print('predicting col', i, data[:,i], isNaN[:,i])

            # delete rows that have NaN in column i
            train = np.delete(data, isNaN[:, i], axis=0)

            # fit a kNN model to the data
            # delete column i - the column to predict
            model.fit(np.delete(train, i, axis=1), train[:, i])

            # this is inefficient but much easier to code
            # predict the value for all elements of column i
            p = model.predict(np.delete(data, i, axis=1))
            # if data[row,col] is None {data[row,col] = p} else {data[row,col] = data[row,col]}
            data[:, i] = np.where(isNaN[:, i], p, data[:, i])

            # print('predicted:', data[:,i])

    return data
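
The kNN module imported above is project-specific. A self-contained sketch of the same column-by-column imputation idea using scikit-learn's KNeighborsRegressor instead (the toy array and column index are illustrative only):

import numpy as np
from sklearn.neighbors import KNeighborsRegressor

def impute_column(data, col, k=2):
    # fill NaNs in one column by regressing it on the remaining columns
    is_nan = np.isnan(data[:, col])
    X_train = np.delete(data[~is_nan], col, axis=1)
    y_train = data[~is_nan, col]
    model = KNeighborsRegressor(n_neighbors=k)
    model.fit(X_train, y_train)
    X_missing = np.delete(data[is_nan], col, axis=1)
    data[is_nan, col] = model.predict(X_missing)
    return data

data = np.array([[1.0, 2.0], [2.0, np.nan], [3.0, 6.0], [4.0, 8.0]])
print(impute_column(data, col=1))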
Example #12
def partB():
    knn = kNN.kNN(1, kNN.cosine_similarity)
    knn.train("PS1_data/books.train")
    knn.load_test_file("PS1_data/books.test")
    correct_pred, predict, true = [0 for i in range(5)
                                   ], [0 for i in range(5)
                                       ], [0 for i in range(5)]
    precision, recall = [0 for i in range(5)], [0 for i in range(5)]
    genre_centroid = kNN.get_genre_centroid(knn.training_data,
                                            knn.training_labels)
    for i in range(len(knn.test_data)):
        sim_max, label = -1, -1
        for j in range(len(genre_centroid)):
            sim = kNN.cosine_similarity(knn.test_data[i], genre_centroid[j])
            if (sim > sim_max):
                sim_max = sim
                label = j
        if (label == int(knn.test_labels[i])):
            correct_pred[label] += 1
        predict[label] += 1
        true[int(numpy.asscalar(knn.test_labels[i]))] += 1
    accuracy = float(sum(correct_pred)) / len(knn.test_data)
    for i in range(5):
        if predict[i] != 0:
            precision[i] = float(correct_pred[i]) / predict[i]
        if true[i] != 0:
            recall[i] = float(correct_pred[i]) / true[i]
    return accuracy, precision, recall
Example #13
File: Problem3.py Project: siyuzhan/ML
def partB():
    knn = kNN.kNN(1, kNN.cosine_similarity)
    knn.train("PS1_data/books.train")
    knn.load_test_file("PS1_data/books.test")
    correct_pred, predict, true = [0 for i in range(5)], [0 for i in range(5)], [0 for i in range(5)]
    precision, recall = [0 for i in range(5)], [0 for i in range(5)]
    genre_centroid = kNN.get_genre_centroid(knn.training_data, knn.training_labels)
    for i in range(len(knn.test_data)):
        sim_max, label = -1, -1
        for j in range(len(genre_centroid)):
            sim = kNN.cosine_similarity(knn.test_data[i], genre_centroid[j])
            if (sim > sim_max):
                sim_max = sim
                label = j
        if (label == int(knn.test_labels[i])):
            correct_pred[label] += 1
        predict[label] += 1
        true[int(numpy.asscalar(knn.test_labels[i]))] +=1
    accuracy = float(sum(correct_pred))/len(knn.test_data)
    for i in range(5):
        if predict[i] != 0:
            precision[i] = float(correct_pred[i])/predict[i]
        if true[i] != 0:
            recall[i] = float(correct_pred[i])/true[i]
    return accuracy, precision, recall
Example #14
def kNearest():
    print('\nPart 3: k-Nearest Neighbor')
    k_test = [1, 5, 11, 15, 21]
    for k in k_test:
        clf = kNN(X_train, y_train, k=k)
        preds = clf.predict(X_test)
        print('k =', k, ';', 'accuracy on test set =', np.mean(preds==y_test))
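
X_train, y_train, X_test and y_test come from the surrounding script and are not shown. For comparison, a self-contained version of the same k sweep with scikit-learn's KNeighborsClassifier (the bundled digits data set is only a stand-in):

import numpy as np
from sklearn.datasets import load_digits
from sklearn.model_selection import train_test_split
from sklearn.neighbors import KNeighborsClassifier

X, y = load_digits(return_X_y=True)
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.25, random_state=0)
for k in [1, 5, 11, 15, 21]:
    clf = KNeighborsClassifier(n_neighbors=k).fit(X_train, y_train)
    preds = clf.predict(X_test)
    print('k =', k, ';', 'accuracy on test set =', np.mean(preds == y_test))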
Example #15
def classify(trainSet, trainLabels, testSet, k=50):

    predictedLabels = zeros(testSet.shape[0])

    # Use kNN algorithm in order to predict the labels
    for i in range(testSet.shape[0]):
        predictedLabels[i] = kNN(k, trainSet, trainLabels, testSet[i])

    return predictedLabels
Example #16
def task1(param_set, runs=20):
    """
    This function produces basic results for the k-NN algorithm
    
    Input:
    param_set: arr or list -- the parameters of the k-Nearest Neighbour algorithm
    runs: int -- number of runs to average 'best' k over
    
    Returns 4 lists: the train error means, train error stds, test error means, and test error stds for all parameters
    """
    
    # save the means and stds of all parameters
    train_means = []
    train_stds = []
    test_means = []
    test_stds = []
    
    # loop through the parameters, running multiple iterations per param
    for k in tqdm(param_set, 'k'):
        
        # save errors of this parameter
        all_train_errors = np.zeros(runs)
        all_test_errors = np.zeros(runs)
        
        # run multiple iterations for averaging
        for this_run in tqdm(range(runs), 'run'):
            
            # split the dataset into train and test
            train_set, test_set = train_test_split(data, test_size=0.2)
            train_set = LabelledDataset(train_set)
            test_set = LabelledDataset(test_set)
            
            # initialize kNN
            knn = kNN(train_set, test_set, k)
            
            # calculate and store errors
            train_error = knn.accuracy(train_set.data, train_set.labels)
            test_error = knn.accuracy(test_set.data, test_set.labels)
            all_train_errors[this_run] = train_error
            all_test_errors[this_run] = test_error
            
        # calculate means and standard devs of errors
        train_error_mean = np.mean(all_train_errors)
        train_error_std = np.std(all_train_errors)
        test_error_mean = np.mean(all_test_errors)
        test_error_std = np.std(all_test_errors)
        
        print('k =', k)
        print('train error: %f ± %f' %(train_error_mean, train_error_std))
        print('test  error: %f ± %f\n' %(test_error_mean, test_error_std))
        
        train_means.append(train_error_mean)
        train_stds.append(train_error_std)
        test_means.append(test_error_mean)
        test_stds.append(test_error_std)
        
    return train_means, train_stds, test_means, test_stds
Example #17
    def __init__(
            self,
            # number of neighbors in knn
            kNeighbors: int,
            # number of clusters
            kValue: int,
            # data to cluster
            dataSet: np.ndarray,
            # 'mixed', 'categorical', or 'real' data set
            data_type: str,
            # list of integers representing categorical feature column indices
            categorical_features: list,
            # True if the data set is a regression data set
            regression_data_set: bool,
            # weight for real value in distance metric
            alpha: int,
            # weight for categorical value in distance metric
            beta: int,
            # bin width for gaussian kernel smoother
            h: float,
            # dimensionality of data set (# features)
            d: int,
            # pass in the test data set at init
            Testdata: np.ndarray):

        # create a nearest neighbor object that finds the single nearest neighbor to an input data point
        self.nn = kNN.kNN(1, data_type, categorical_features,
                          regression_data_set, alpha, beta, h, d)
        self.knn = kNN.kNN(kNeighbors, data_type, categorical_features,
                           regression_data_set, alpha, beta, h, d)
        self.categorical_features = categorical_features
        # save which features are real as well by deleting categorical indices from a new list
        real_features = list(range(d))
        for i in categorical_features:
            real_features.remove(i)
        self.real_features = real_features
        self.kValue = kValue
        self.dataSet = dataSet
        # dimensionality of data set
        self.d = d
        self.itermax = 10
        self.Testdata = Testdata
        self.initial_medoids = self.choose_random_medoids()
        self.assignments = []
Example #18
def main():
    dataframe = data.getData()
    dataframe = dataframe.get_values()
    mask = np.random.rand(len(dataframe)) < 0.8
    trainingData, testingData = dataframe[mask], dataframe[~mask]
    model = kNN.kNN(8, euclideanDistance, trainingData)
    columns = [x for x in range(3, 11)]

    predictions = model.getPredictions(testingData, columns)
    print(getAccuracy(testingData, predictions))
Example #19
File: Problem2.py Project: siyuzhan/ML
def main(train_path="PS1_data/faces.train", test_path="PS1_data/faces.test", 
         k=1):
    knn = kNN.kNN(k=k, fun=kNN.inverse_euclidean_distance)
    knn.train(train_path)
    knn.load_test_file(test_path)
    for i in knn.test_data:
        neighbors = knn.test(i)
        x = complete_face(knn,neighbors)
        matplotlib.pyplot.gray()
        matplotlib.pyplot.imshow(x.reshape((64,64)))
        matplotlib.pyplot.show()
Example #20
File: Problem2.py Project: siyuzhan/ML
def main(train_path="PS1_data/faces.train",
         test_path="PS1_data/faces.test",
         k=1):
    knn = kNN.kNN(k=k, fun=kNN.inverse_euclidean_distance)
    knn.train(train_path)
    knn.load_test_file(test_path)
    for i in knn.test_data:
        neighbors = knn.test(i)
        x = complete_face(knn, neighbors)
        matplotlib.pyplot.gray()
        matplotlib.pyplot.imshow(x.reshape((64, 64)))
        matplotlib.pyplot.show()
Example #21
def createModel():
    trainingData = loadDataset.loadTrainingData()

    data = []
    labels = []
    for dataDict in trainingData:
        data.extend(dataDict[b"data"])
        labels.extend(dataDict[b"labels"])

    dataNP = np.asarray(data)
    labelsNP = np.asarray(labels)

    return kNN.kNN(dataNP, labelsNP)
Example #22
def datingClassTest(X, Y):
    testRatio = 0.10
    normMat, minVal, diff = autoNorm(X)
    m = len(X)
    numTestData = int(m * testRatio)
    errorCount = 0.0
    # 900 training | 100 testing
    for i in range(numTestData):
        predict = kNN(normMat[numTestData:m, :], Y[numTestData:m],
                      normMat[i, :], 3)
        print "the classifier came back with: %d, the real answer is: %d" \
            % (predict, Y[i])
        if predict != Y[i]: errorCount += 1
    print "error rate is: %f" % (errorCount / float(numTestData))
Example #23
def main():
    is_sklearn = False
    # kernel = c.COSINE
    # kernel = c.GAUSSIAN
    kernel = c.POLY
    # training parameter
    result_path = 'results/PB2_spam.acc'
    model_name = 'digits_' + kernel
    model_path = 'data/PB1_B_digits_sk_Gaussian_1.model'

    # tr_data_path = 'data\\digits\\tr_f_l.pickle'
    # te_data_path = 'data\\digits\\te_f_l.pickle'
    tr_data_path = 'data\\digits\\tr_f_l_10.pickle'
    te_data_path = 'data\\digits\\te_f_l_10.pickle'
    # load and preprocess training data
    tr_data = loader.load_pickle_file(tr_data_path)
    te_data = loader.load_pickle_file(te_data_path)

    # transpose label
    tr_data[1] = np.transpose(tr_data[1])[0]
    te_data[1] = np.transpose(te_data[1])[0]

    Preprocess.normalize_features_all(Preprocess.zero_mean_unit_var, tr_data[0])
    Preprocess.normalize_features_all(Preprocess.zero_mean_unit_var, te_data[0])


    # start training
    models = []
    st = time.time()

    # start training
    print('{:.2f} Start training.'.format(time.time() - st))

    for k in (1, 3, 7):
        if not is_sklearn:
            clf = kNN.kNN(kernel=kernel)
            clf.fit(tr_data[0], tr_data[1])
            tr_pred = clf.predict(tr_data[0], k=k)
            te_pred = clf.predict(te_data[0], k=k)
        else:
            clf = KNeighborsClassifier(n_neighbors=k, metric=cosine_distances)
            clf.fit(tr_data[0], tr_data[1])
            tr_pred = clf.predict(tr_data[0])
            te_pred = clf.predict(te_data[0])

        tr_acc = (tr_data[1] == tr_pred).sum() / tr_data[0].shape[0]
        te_acc = (te_data[1] == te_pred).sum() / te_data[0].shape[0]
        models.append(clf)
        print('{} Final results with kernel {} and k={}. Train acc: {}, Test acc: {}'.format(time.time() - st, kernel, k, tr_acc, te_acc))
Example #24
File: DE.py Project: jmread/cerebro
    def __init__(self, N_d, N_o, threshold=5, f=None, f_rwd=rwd_basic):
        # self.h = MLPpf(N_i=len(self.g.get_weight_vector()),N_h=20,N_o=1)
        # self.h = RLS(N_i=len(self.g.get_weight_vector()),N_o=1)
        self.h = kNN(N_i=N_d, N_o=N_o, k=5, n=100)
        # self.h = MLPbp(N_i=N_d,N_h=5,N_o=N_o,f=sigmoid,fo=linear)
        self.f = f  # NON-LINEARITY (OPTIONAL)
        # self.h = RLS(N_d,N_o)

        self.learn_rate = 0.1
        self.w = zeros(N_d)  # random.randn(N_d) * 0.1        # the state (actually a vector of weights)
        self.t = 0  # temperature
        self.threshold = threshold  # desperation threshold
        self._y = -100.0  # old reward/error/target
        self.f_rwd = f_rwd
        self.burn_in = 5
Example #25
def kNN_Validate(dataName, grpName, folds, k=3, d=2, trans=None):
    """
		params: dataName := file with the data set
			grpName := file with the different groupings
			folds := number of folds
			k := number of neigbors to base the classification off of
							where the default is 3
			d := the minkowski distance to use, default is 2
			trans := transformation function to be applied on data set
		objective: performs cross validation using kNN as classifier
				eturns: a list of tuples organized as (test_predicted, test_groundTruth)

	"""
    valid = vd.Validate(grpName, folds)
    data, labels = bd(dataName)
    results = []  #stores tuples: (list_predicted, list_groundTruth)
    for i in range(valid.getFoldCount()):
        print("kNN iteration %d" % i)
        #get the train and test indices of the data set
        testIndex, trainIndex = valid.getTest(i), valid.getTrain(i)
        #build the test set and test labels
        testSet, testLabels = data[testIndex, :], labels[testIndex]
        #build the train set and training labels
        trainSet, trainLabels = data[trainIndex, :], labels[trainIndex]
        #if the data is to be transformed
        if trans is not None:
            if trans is fld:
                tmp = trans(trainSet, trainLabels)
                trainSet = np.matmul(trainSet, tmp)
                trainSet = trainSet.reshape(-1, 1).astype(np.float64)
                testSet = np.matmul(testSet, tmp)
                testSet = testSet.reshape(-1, 1).astype(np.float64)
            else:
                tmp = trans(trainSet).transpose()
                trainSet = np.matmul(trainSet, tmp)
                testSet = np.matmul(testSet, tmp)
        #standardize the training and test set
        trainSet, testSet = standard(trainSet, testSet)
        #classify test set and add it to the results list
        results.append((knn.kNN(trainSet, testSet, trainLabels, k,
                                d), testLabels))
    results = ev.buildConfusionMatrices(results)
    results = ev.normalizeConfMat(results)
    results = ev.getAvgProbMatrix(results)
    print("knn results", results)
    results = ev.rocData(results)
    print("%d-NN Accuracy: %f" % (k, results["Acc"]))
    return results
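
The Validate, standard and bd helpers above are project-specific. A compact, self-contained illustration of the same cross-validation loop using scikit-learn (StratifiedKFold plus KNeighborsClassifier; p=2 gives the Minkowski distance with d=2, i.e. Euclidean), on a bundled data set as a stand-in:

import numpy as np
from sklearn.datasets import load_iris
from sklearn.model_selection import StratifiedKFold
from sklearn.neighbors import KNeighborsClassifier
from sklearn.preprocessing import StandardScaler

X, y = load_iris(return_X_y=True)
skf = StratifiedKFold(n_splits=5, shuffle=True, random_state=0)
accuracies = []
for train_idx, test_idx in skf.split(X, y):
    # standardize using statistics from the training fold only
    scaler = StandardScaler().fit(X[train_idx])
    clf = KNeighborsClassifier(n_neighbors=3, p=2)
    clf.fit(scaler.transform(X[train_idx]), y[train_idx])
    accuracies.append(clf.score(scaler.transform(X[test_idx]), y[test_idx]))
print("5-fold accuracy: %.3f" % np.mean(accuracies))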
Example #26
def partA():
    TRAIN_TITLES, toss = load_titles()
    knn = kNN.kNN(10, kNN.cosine_similarity)
    knn.train("PS1_data/books.train")
    knn.load_test_file("PS1_data/books.train")
    n = TRAIN_TITLES.index(
        'title-Fifty Shades of Grey: Book One of the Fifty Shades Trilogy\n')
    m = TRAIN_TITLES.index('title-Brains: A Zombie Memoir\n')
    x = knn.test_num(n)
    y = knn.test_num(m)
    a = [TRAIN_TITLES[n[0]] for n in x]
    b = [TRAIN_TITLES[n[0]] for n in y]
    print "Fifty Shades of Grey is most similar to:"
    print string.join(a)
    print "Brains: A Zombie Memoir is most similar to:"
    print string.join(b)
Example #27
    def recommend_songs(self, songnumber):
        if use_pca:
            # Get reduced (2 dimensions) data using PCA
            # start_time()
            self.transformed = PCA(self.X)
            # stop_time("PCA")
        elif use_lem:
            # Get reduced (2 dimensions) data using LEM
            # start_time()
            self.transformed = LEM(self.X)
            # stop_time("LEM")

        # Get seed data point
        self.p = self.transformed[songnumber]
        # Get 20 nearest neighbors of seed
        self.idx = kNN(self.transformed, self.p, 20)[0]
Example #28
File: Problem3.py Project: siyuzhan/ML
def partA():
    TRAIN_TITLES, toss = load_titles()
    knn = kNN.kNN(10, kNN.cosine_similarity)
    knn.train("PS1_data/books.train")
    knn.load_test_file("PS1_data/books.train")
    n = TRAIN_TITLES.index(
            'title-Fifty Shades of Grey: Book One of the Fifty Shades Trilogy\n'
            )
    m = TRAIN_TITLES.index('title-Brains: A Zombie Memoir\n')
    x = knn.test_num(n)
    y = knn.test_num(m)
    a = [TRAIN_TITLES[n[0]] for n in x]
    b = [TRAIN_TITLES[n[0]] for n in y]
    print "Fifty Shades of Grey is most similar to:"
    print string.join(a)
    print "Brains: A Zombie Memoir is most similar to:"
    print string.join(b)
Example #29
def knn_worker(q, fold, data_set):
    # get the ten folds
    tenFolds = experimental_data_sets[data_set][3]
    # pick the fold according to this run
    test = copy.deepcopy(tenFolds[fold])
    #Append all data folds to the training data set
    remaining_folds = [x for i, x in enumerate(tenFolds) if i != fold]
    training = np.concatenate(remaining_folds)

    data_dimension = len(experimental_data_sets[data_set][0]) - 1
    if feature_data_types[data_set] != "mixed":
        alpha = 1
        beta = 1
    else:
        alpha = 1
        beta = alpha * tuned_delta_value[data_set]

    knn = kNN.kNN(
        # tuned k value for this data set
        tuned_k[data_set],
        # supply mixed, real, categorical nature of features
        feature_data_types[data_set],
        #Feed in the categorical attribute indices stored in a global array
        categorical_attribute_indices[data_set],
        # whether this data set is a regression data set
        regression_data_set[data_set],
        # weight for real distance
        alpha=alpha,
        # weight for categorical distance
        beta=beta,
        # kernel window size
        h=tuned_bin_value[data_set],
        #Set the dimensionality of the data set in KNN
        d=data_dimension)
    # Classify the fold against the remaining training data
    classifications = knn.classify(training, test)
    metadata = ["KNN", data_set]
    # calculate the performance
    results_set = results.LossFunctionPerformance(
        regression_data_set[data_set], classifications)
    data_point = metadata + results_set
    data_point_string = ','.join([str(x) for x in data_point])
    # queue the results and return them
    q.put(data_point_string)
    return (data_point_string)
Example #30
def cNN(dataPool, labelPool):
    prototypePool = list()
    protoLabelPool = list()

    # put the first data points in the STORE
    prototypePool.append(dataPool.pop())
    protoLabelPool.append(labelPool.pop())

    for index, data in enumerate(dataPool):
        dataLabel = labelPool[index]

        nearestPrototypeLabel = kNN(data, prototypePool, protoLabelPool, 1)

        if nearestPrototypeLabel != dataLabel:
            prototypePool.append(dataPool[index])
            protoLabelPool.append(labelPool[index])

    return prototypePool, protoLabelPool
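
cNN above implements condensed nearest neighbor: a point is added to the prototype STORE only when the current prototypes misclassify it with 1-NN. A self-contained, numpy-only sketch of the same idea, with an inline nearest-prototype search standing in for the kNN() helper used above:

import numpy as np

def condense(data, labels):
    prototypes = [data[0]]
    proto_labels = [labels[0]]
    for x, y in zip(data[1:], labels[1:]):
        # 1-NN lookup against the current prototype pool
        dists = np.linalg.norm(np.array(prototypes) - x, axis=1)
        if proto_labels[int(np.argmin(dists))] != y:
            prototypes.append(x)
            proto_labels.append(y)
    return np.array(prototypes), np.array(proto_labels)

X = np.array([[0.0, 0.0], [0.1, 0.2], [5.0, 5.0], [5.2, 4.8]])
y = np.array([0, 0, 1, 1])
print(condense(X, y))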
Example #31
File: PB5_RELIEF.py Project: Juncai/CS6140
def main():
    # training parameter
    is_sklearn = True
    k = 10  # fold
    result_path = 'results/PB2_spam.acc'
    model_name = 'spam_' + str(k) + 'fold'
    data_path = 'data/spam/data.pickle'

    # load and preprocess training data
    training_data = loader.load_pickle_file(data_path)
    # TODO convert labels from {0, 1} to {-1, 1}
    # util.replace_zero_label_with_neg_one(training_data)

    # Preprocess.normalize_features_all(Preprocess.zero_mean_unit_var, training_data[0])
    # training_data[0] = preprocessing.scale(training_data[0])


    # start training
    training_errs = []
    testing_errs = []
    print('Preparing k fold data.')
    k_folds = Preprocess.prepare_k_folds(training_data, k)

    for i in (0,):
        st = time.time()
        tr_data, te_data = Preprocess.get_i_fold(k_folds, i)

        # start training
        print('{:.2f} Start training.'.format(time.time() - st))
        kernel = c.EUCLIDEAN
        # kernel = c.GAUSSIAN
        f_select = True
        best_features_num = 5
        clf = kNN.kNN(kernel=kernel)
        clf.fit(tr_data[0], tr_data[1], f_select=f_select, best_f=best_features_num)
        print("Best features: {}".format(clf.best_f_indices))
        for kk in (1, 2, 3, 7):
            tr_pred = clf.predict(tr_data[0], k=kk)
            te_pred = clf.predict(te_data[0], k=kk)

            tr_acc = (tr_data[1] == tr_pred).sum() / tr_data[0].shape[0]
            te_acc = (te_data[1] == te_pred).sum() / te_data[0].shape[0]

            print('{} Final results with kernel {}, k={}. Train acc: {}, Test acc: {}'.format(time.time() - st, kernel, kk, tr_acc, te_acc))
Example #32
File: PB2_A_spam.py Project: Juncai/CS6140
def main():
    # training parameter
    k = 8  # fold
    result_path = 'results/PB2_spam.acc'
    model_name = 'spam_' + str(k) + 'fold'
    data_path = 'data/spam/data.pickle'

    # load and preprocess training data
    training_data = loader.load_pickle_file(data_path)
    # TODO convert labels from {0, 1} to {-1, 1}
    # util.replace_zero_label_with_neg_one(training_data)

    Preprocess.normalize_features_all(Preprocess.zero_mean_unit_var, training_data[0])
    # Preprocess.normalize_features_all(Preprocess.shifiat_and_scale, training_data[0])


    # start training
    training_accs = []
    testing_accs = []
    print('Preparing k fold data.')
    k_folds = Preprocess.prepare_k_folds(training_data, k)
    kernel = c.EUCLIDEAN
    sst = time.time()
    for i in (1,):
        st = time.time()
        tr_data, te_data = Preprocess.get_i_fold(k_folds, i)

        # start training
        print('{:.2f} Start training.'.format(time.time() - st))
        for r in (2.5, 2.7):
            clf = kNN.kNN(kernel=kernel)
            # clf.fit(training_data[0], training_data[1])
            clf.fit(tr_data[0], tr_data[1])
            # tr_pred = clf.predict(training_data[0], r=r)
            tr_pred = clf.predict(tr_data[0], r=r)
            te_pred = clf.predict(te_data[0], r=r)

            # tr_acc = (training_data[1] == tr_pred).sum() / training_data[0].shape[0]
            tr_acc = (tr_data[1] == tr_pred).sum() / tr_data[0].shape[0]
            te_acc = (te_data[1] == te_pred).sum() / te_data[0].shape[0]

            testing_accs.append(te_acc)
            print('{} {}-fold results with kernel {}, r={}. Train acc: {}, Test acc: {}'.format(time.time() - st, i, kernel, r, tr_acc, te_acc))
Example #33
def main():
    fop = Files()
    # load the training data
    ret, digitsVec, labelVec = fop.txt2mat('digits/trainingDigits')

    knnClass = kNN()
    path = 'digits/testDigits'
    testList = os.listdir(path)
    total = len(testList)
    errCnt = 0
    for index in range(total):
        name = testList[index]
        label = name.split('.')[0][0]
        tmp = path + '/' + name
        digits = fop.readMat(tmp, 32, 32)
        result = knnClass.classification(digits, digitsVec, labelVec, 3)
        if label != result:
            errCnt += 1.0
    err = errCnt / total
    return
Example #34
def main():
    classifier = kNN()

    dataset = get_dataset()
    dataset = list(form(dataset))
    random.shuffle(dataset)
    training = dataset[:100]
    testing = dataset[100:]

    classifier = train(classifier, training)
    before = classifier.size
    classifier.compress(3.0)
    after = classifier.size

    correct = 0
    for feature, label in testing:
        seems = classifier.classify(feature)
        if seems == label:
            correct += 1

    print(correct/len(testing), before, after)
Example #35
def predict(symbImgs, topology):
    # Flatten using knn first
    knn = kNN.kNN()
    flat_syms = knn.flatten_symbols(symbImgs)
    binary_symbs = preprocessor.binarize(flat_syms)
    flattened = logistic_regression.flatten(binary_symbs)
    knn.load_MNIST()
    m_data = logistic_regression.flatten(knn.mnist_data)
    m_labels = knn.mnist_labels
    nn = neural_network.NeuralNet(topology)
    print("> Fitting NN")
    m_labels = toArray(m_labels[:10000])
    nn.fit(m_data[:10000], m_labels)

    results = []
    print("> Predicting NN")
    for i in range(1, len(flattened) + 1):
        if i % 1000 == 0:
            print(i)
        results.append(nn.predict(flattened[i - 1:i]))
    return results
Example #36
File: app.py Project: TeamSirius/Utilities
def APS():
    #Takes a posted parameter of format:
    #{"lid":location_id, "APS":[ {"MAC":MAC, "strength":STRENGTH, "std": STD},... ]}

    #TODO: CHANGE NAME OF THIS OR /aps
    lid = -1
    try:
        data = request.get_json(force=True) #TODO: REMOVE FORCE IF POSSIBLE
        lid = int(data['lid'])
        if lid < 1:
            from kNN import kNN, AccessPoint
            from datetime import datetime
            knnData = {}
            APS = {}
            for item in data["APS"]:
                if 'std' in item:
                    APS[item['MAC']] = AccessPoint( (item['MAC'], float(item['strength']), float(item['std']), datetime.now(), 10) )
                else:
                    APS[item['MAC']] = AccessPoint( (item['MAC'], float(item['strength']), 0, datetime.now(), 10) )
            (x, y,floor) = kNN(APS)
            # cur.execute("""INSERT into demhoes (x,y, recorded)
            #         VALUES ( %s, %s, NOW() )""", [x,y]) #UTC TIME
            return json.dumps({'success': {"x" : x, "y" : y, "floor":floor}})
        else:
            cur.execute("""SELECT count(*) from accesspoint where location_id=%s""",[lid])
            count = cur.fetchone()[0]
            if not count or int(count) == 0: #Will only log new data -- if already logged will ignore
                for item in data["APS"]:
                    if 'std' in item:
                        cur.execute("""INSERT into accesspoint (MAC, strength, location_id, std_dev, recorded)
                            VALUES ( %s, %s, %s,%s, NOW() )""",
                            [item['MAC'], float(item['strength']),lid, float(item['std'])] ) #UTC TIME
                    else:
                        cur.execute("""INSERT into accesspoint (MAC, strength, location_id, std_dev, recorded)
                            VALUES ( %s, %s, %s,%s, NOW() )""",
                            [item['MAC'], float(item['strength']),lid, -1] ) #UTC TIME
    except Exception, e:
        handle_exception(e)
        return ERROR_RETURN
Example #37
def main():

    ckNN = kNN(enableLoggingTime=True)
    #load data
    ckNN.loadData(fileName='../data/creditcard.csv', feaRowEnd=30808)
    #ckNN.dataConvertToNumpy()
    #split train and test by 80-20 ratio
    ckNN.testTrainSplit(ckNN.feature, ckNN.Class, test_size=0.3)
    #load model
    ckNN.loadModel(n_neighbors=3)
    print(ckNN.toString())
    #train model
    ckNN.trainModel(featureTrain=ckNN.fTrain, classTrain=ckNN.cTrain)
    #test model
    ckNN.classPred = ckNN.testModel(featureTest=ckNN.fTest)
    #metrices
    accuracy, matConf, matCohenKappa, \
    strClassificationReport = ckNN.getMetrics( classTest = ckNN.cTest,
                                                 classPred = ckNN.classPred,
                                                 boolPrint = True)
    # cmap figure generation for confusion matrix
    ckNN.printConfusionMatrix(matConf)
Example #38
def LEM(X, ndim=2, k=4):
    N = X.shape[0]  # Number of data vectors
    d = X.shape[1]  # Number of dimensions

    # Check if ndim is larger or equal to current dimensions
    if ndim >= d:
        ndim = d - 1

    # Create adjacency matrix with weights
    W = np.zeros((N, N))
    for i in range(N):
        idx, eucdst = kNN(X[i], X,
                          k)  # Get k nearest neighbours and distance array
        for j in range(k):
            # Weight with heat kernel e**(-||x1-x2||**2/t) where t = 200
            heat = np.exp(-eucdst[idx[j]]**2 / 200)
            W[i, idx[j]] = heat
            W[idx[j], i] = heat
    # #  Alternative: weight with {0,1}
    #  W[i, idx[k]] = 1
    #  W[idx[k], i] = 1

    # Create diagonal weight matrix (with column sums of W)
    D = np.diag(W.sum(axis=1))

    # Create laplacian matrix
    L = D - W

    # Get eigenvalues and eigenvectors of laplacian matrix (use linalg.eigh since L is symmetric)
    eigval, eigvec = np.linalg.eigh(L)
    eigval = np.real(eigval)
    eigvec = np.real(eigvec)

    # Get array of eigenvalue indices sorted by smallest values in eigenvalue array
    index = eigval.argsort()

    # Return embedded matrix (ignore first eigenvector since its constant)
    transformed = eigvec[:, index[1:ndim + 1]]
    return transformed
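
LEM above builds a heat-kernel k-NN graph, forms the graph Laplacian L = D - W, and embeds the data with the eigenvectors belonging to the smallest non-trivial eigenvalues (Laplacian Eigenmaps). For reference, a self-contained sketch of the same idea via scikit-learn's SpectralEmbedding; the parameter names here are scikit-learn's, not LEM()'s, and the random matrix is only toy data:

import numpy as np
from sklearn.manifold import SpectralEmbedding

X = np.random.RandomState(0).rand(50, 5)  # toy data
embedding = SpectralEmbedding(n_components=2, affinity="nearest_neighbors", n_neighbors=4)
X_2d = embedding.fit_transform(X)
print(X_2d.shape)  # (50, 2)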
Example #39
def tune_knn_parallel_worker(q, data_set: str, k_value: int, delta_value: int,
                             bin_value: int):
    # print('inside function', data_set, k_value)
    data_dimension = tuning_data_dict[data_set].shape[1] - 1
    if feature_data_types[data_set] != "mixed":
        alpha = 1
        beta = 1
    else:
        alpha = 1
        beta = alpha * delta_value
    # print("this far", data_set)
    knn = kNN.kNN(
        #k value
        k_value,
        # supply mixed, real, categorical nature of features
        feature_data_types[data_set],
        #Feed in the categorical attribute indices stored in a global array
        categorical_attribute_indices[data_set],
        # whether this data set is a regression data set
        regression_data_set[data_set],
        # weight for real distance
        alpha,
        # weight for categorical distance
        beta,
        # kernel window size
        bin_value,
        #Set the dimensionality of the data set in KNN
        data_dimension)

    classifications = knn.classify(tuning_data_dict[data_set],
                                   tuning_data_dict[data_set])
    metadata = [data_set, k_value, beta / alpha, bin_value]
    results_set = results.LossFunctionPerformance(
        regression_data_set[data_set], classifications)
    data_point = metadata + results_set
    data_point_string = ','.join([str(x) for x in data_point])
    # put the result on the multiprocessing queue
    q.put(data_point_string)
Example #40
def handwritingClassTest():
    hwLabels = []
    trainingFileList = os.listdir('2.KNN/trainingDigits')
    m = len(trainingFileList)
    trainingMat = zeros((m, 1024))
    for i in range(m):
        fileNameStr = trainingFileList[i]
        classNumStr = int(fileNameStr.split('.')[0].split('_')[0])
        hwLabels.append(classNumStr)
        trainingMat[i, :] = img2vector('2.KNN/trainingDigits/%s' % fileNameStr)
    testFileList = os.listdir('2.KNN/testDigits')
    errorCount = 0.0
    mTest = len(testFileList)
    for i in range(mTest):
        fileNameStr = testFileList[i]
        classNumStr = int(fileNameStr.split('.')[0].split('_')[0])
        vector = img2vector('2.KNN/testDigits/%s' % fileNameStr)
        predict = kNN(trainingMat, hwLabels, vector, 3)
        print "the classifier came back with: %d, the real answer is: %d" % (
            predict, classNumStr)

        if (predict != classNumStr): errorCount += 1.0
    print "\nthe total number of errors is: %d" % errorCount
    print "\nthe total error rate is: %f" % (errorCount / float(mTest))
Example #41
def handwriting():
    dataset_path = '../../../datasets/Digits/'
    training_list = os.listdir(os.path.join(dataset_path, "trainingDigits"))
    test_list = os.listdir(os.path.join(dataset_path, "testDigits"))

    train_len = len(training_list)
    test_len = len(test_list)

    trainingMat = np.zeros((train_len, 1024))

    train_labels = []
    test_labels = []

    for i in range(train_len):
        splited_path = training_list[i].split('.')[0]
        label = int(splited_path.split('_')[0])
        train_labels.append(label)

        train_path = os.path.join(dataset_path, "trainingDigits",
                                  training_list[i])
        trainingMat[i, :] = get_np_image(train_path).reshape(-1)

    for i in range(test_len):
        splited_path = test_list[i].split('.')[0]
        label = int(splited_path.split('_')[0])
        test_labels.append(label)

        test_path = os.path.join(dataset_path, "testDigits", test_list[i])
        #print(len(img2vector(test_path)[0]))

        testVector = get_np_image(test_path).reshape(-1)
        label_pre = kNN.kNN(trainingMat, testVector, 3, train_labels)
        print(label_pre, test_labels[i])

    print(train_labels)
    print(test_labels)
Example #42
File: kNN_hwPredict.py Project: RayYoh/KNN
def hwClassTrain():
    '''
    Training function for handwritten digit recognition
    '''
    # path to the training set
    trainDir = r'D:\VScodePython\机器学习算法\k近邻\手写数字识别\trainingDigits'
    # read all file names in the training set
    trainFileList = listdir(trainDir)
    # number of training samples
    numOfTrain = len(trainFileList)
    # initialize the training labels
    hwLabels = []
    # initialize the training data matrix
    trainMat = np.zeros((numOfTrain, 1024))
    for i in range(numOfTrain):
        fileName = trainFileList[i]
        # read the data
        trainMat[i, :] = img2vec((trainDir + '\\' + fileName))
        # use the first field of the file name as the label, e.g. 0_0.txt
        hwLabels.append(int(fileName.split('.')[0].split('_')[0]))
    # global variable
    global model
    # use the self-written kNN
    model = kNN(trainMat, hwLabels)
    """Read the first 32 characters of the first 32 rows of an image file.

    @return <ndarray>: a 1x(1024+1) numpy array with data and label, while the
                       label is defaults to 0.
    """
    img = ""
    for line in open(img_fn).readlines()[:32]:
        img += line[:32]

    # labels are always attached at the last position
    itera = [_ for _ in img + str(label)]
    return numpy.fromiter(itera, "f4")


if __name__ == "__main__":
    training_set_files = os.listdir(r"./trainingDigits")

    # initiate a matrix, don't forget to allocate the space for the label
    # 32 row x 32 col + label
    training_set = numpy.zeros((len(training_set_files), 32*32+1))

    for i in xrange(len(training_set_files)):
        # e.g. with filename 0_1.txt label is 0
        image_file = r"./trainingDigits/" + training_set_files[i]
        label = training_set_files[i].split('_')[0]
        training_set[i, :] = img_to_vector(image_file, label)

    knn = kNN.kNN(3, training_set, False)
    for fn in os.listdir(r"./testDigits"):
        print knn.classify(img_to_vector(r"./testDigits/%s" % fn)), ", correct number is %s" % fn.split('_')[0]
Example #44
xTrain = xDat[train_obs]
yTrain = yDat[train_obs]

# create val set
xVal = xDat[val_obs]
yVal = yDat[val_obs]

# create test set
xTest = xDat[test_obs]
yTest = yDat[test_obs]


# find hyperparameters that work best on validation set
val_accuracies = []
for k in [1,2,3,5,10,20,50,100]:
    pred = kn.kNN(xTrain,xVal,yTrain,yVal,k)
    acc = np.mean(pred == yVal.T)
    print("accuracy: %f" % (acc,))
    val_accuracies.append((k, acc))
val_accuracies
# tie between 2 and 3, will use 2 (simpler model is better)


# test with best working hyperparameters
# best is with k = 2
prediction = kn.kNN(xTrain, xTest, yTrain, yTest, k = 5)
np.mean(prediction == yTest.T)

# attempt at line profiling 
%load_ext line_profiler
%lprun -s -f kn.kNN -T lp_results.txt kn.kNN(xTrain, xTest, yTrain, yTest, k = 2)
Example #45
def main():
    # ================================================
    # Load pre-trained model and remove higher level layers
    # ================================================
    print("Loading VGG19 pre-trained model...")
    base_model = VGG19(weights='imagenet')
    model = Model(input=base_model.input,
                  output=base_model.get_layer('block4_pool').output)

    # ================================================
    # Read images and convert them to feature vectors
    # ================================================
    imgs, filename_heads, X = [], [], []
    path = "db"
    print("Reading images from '{}' directory...\n".format(path))
    for f in os.listdir(path):

        # Process filename
        filename = os.path.splitext(f)  # filename in directory
        filename_full = os.path.join(path,f)  # full path filename
        head, ext = filename[0], filename[1]
        if ext.lower() not in [".jpg", ".jpeg"]:
            continue

        # Read image file
        img = image.load_img(filename_full, target_size=(224, 224))  # load
        imgs.append(np.array(img))  # image
        filename_heads.append(head)  # filename head

        # Pre-process for model input
        img = image.img_to_array(img)  # convert to array
        img = np.expand_dims(img, axis=0)
        img = preprocess_input(img)
        features = model.predict(img).flatten()  # features
        X.append(features)  # append feature extractor

    X = np.array(X)  # feature vectors
    imgs = np.array(imgs)  # images
    print("imgs.shape = {}".format(imgs.shape))
    print("X_features.shape = {}\n".format(X.shape))

    # ===========================
    # Find k-nearest images to each image
    # ===========================
    n_neighbours = 5 + 1  # +1 as itself is most similar
    knn = kNN()  # kNN model
    knn.compile(n_neighbors=n_neighbours, algorithm="brute", metric="cosine")
    knn.fit(X)

    # ==================================================
    # Plot recommendations for each image in database
    # ==================================================
    output_rec_dir = os.path.join("output", "rec")
    if not os.path.exists(output_rec_dir):
        os.makedirs(output_rec_dir)
    n_imgs = len(imgs)
    ypixels, xpixels = imgs[0].shape[0], imgs[0].shape[1]
    for ind_query in range(n_imgs):

        # Find top-k closest image feature vectors to each vector
        print("[{}/{}] Plotting similar image recommendations for: {}".format(ind_query+1, n_imgs, filename_heads[ind_query]))
        distances, indices = knn.predict(np.array([X[ind_query]]))
        distances = distances.flatten()
        indices = indices.flatten()
        indices, distances = find_topk_unique(indices, distances, n_neighbours)

        # Plot recommendations
        rec_filename = os.path.join(output_rec_dir, "{}_rec.png".format(filename_heads[ind_query]))
        x_query_plot = imgs[ind_query].reshape((-1, ypixels, xpixels, 3))
        x_answer_plot = imgs[indices].reshape((-1, ypixels, xpixels, 3))
        plot_query_answer(x_query=x_query_plot,
                          x_answer=x_answer_plot[1:],  # remove itself
                          filename=rec_filename)

    # ===========================
    # Plot tSNE
    # ===========================
    output_tsne_dir = os.path.join("output")
    if not os.path.exists(output_tsne_dir):
        os.makedirs(output_tsne_dir)
    tsne_filename = os.path.join(output_tsne_dir, "tsne.png")
    print("Plotting tSNE to {}...".format(tsne_filename))
    plot_tsne(imgs, X, tsne_filename)
Example #46
File: demo.py Project: vikasrtr/kNN
# lets take only first two columns
X = data.iloc[:, :2].values
y = data.iloc[:, -1]

# convert to floats 0,1,2
y = y.apply(lambda x: 0 if x == 'Iris-setosa' else x)
y = y.apply(lambda x: 1 if x == 'Iris-versicolor' else x)
y = y.apply(lambda x: 2 if x == 'Iris-virginica' else x)

y = y.values

n_neighbors = 10

# ======================================
# my kNN
cl = kNN(n_neighbors)
cl.fit(X, y)


# ======================================
# scikit-learn
clf = neighbors.KNeighborsClassifier(n_neighbors, weights='uniform')
clf.fit(X, y)


# Plot decision boundary
x_min, x_max = X[:, 0].min() - 1, X[:, 0].max() + 1
y_min, y_max = X[:, 1].min() - 1, X[:, 1].max() + 1
xx, yy = np.meshgrid(np.arange(x_min, x_max, h),
                     np.arange(y_min, y_max, h))
Example #47
				arrIndex[1] = n
				return arrIndex

# Run kNN algorithm
k = 1
predictedDigits = zeros(testData.shape[0])
digits = zeros(testData.shape[0])
Label_start_stop = zeros((testData.shape[0],2))
Predicted_start_stop = zeros((testData.shape[0],2))
Error_start_stop = zeros((testData.shape[0],2))

for i in range(testData.shape[0]):
    print "Current Test Instance: " + str(i+1)
    print "test data " + str(testData[i]) + "\nLabels " + str(testLabels[i])
    digits[i] = i
    predictedDigits[i] = kNN(k, trainingData, trainingLabels, testData[i,:])
    print "Predicted " + str(predictedDigits[i])
    arrayIndex_Label = time_startstop(testLabels[i])
    Label_start_stop[i,0] = arrayIndex_Label[0]
    Label_start_stop[i,1] = arrayIndex_Label[1]
    arrayIndex_Predicted = time_startstop(predictedDigits[i])
    Predicted_start_stop[i,0] = arrayIndex_Predicted[0]
    Predicted_start_stop[i,1] = arrayIndex_Predicted[1]
    Error_start_stop[i,0] = abs(arrayIndex_Label[0] - arrayIndex_Predicted[0])
    Error_start_stop[i,1] = abs(arrayIndex_Label[1] - arrayIndex_Predicted[1])
    print "start " + str(arrayIndex_Predicted[0]) + " stop " + str(arrayIndex_Predicted[1])
    #print str(i) + " " + str(arrayIndex_Label[0]) + " " + str(arrayIndex_Label[1]) + " " + str(arrayIndex_Predicted[0]) + " " + str(arrayIndex_Predicted[1]) + " " + str(abs(arrayIndex_Label[0] - arrayIndex_Predicted[0])/2.0) + " " + str(abs(arrayIndex_Label[1] - arrayIndex_Predicted[1])/2.0)
    
#plot the Predicted label along with error
f1 = plt.figure()
f2 = plt.figure()