def __init__(self,
        # set an error value
        error: float,
        # set the k number of neighbors
        k: int,
        # pass in the data type
        data_type: str,
        # set the categorical features
        categorical_features: list,
        # flag: True if this is a regression data set
        regression_data_set: bool,
        # set the alpha float
        alpha: float,
        # set the beta float
        beta: float,
        # set the width value
        h: float,
        # set the dimensionality
        d: int):
    # initialize a kNN object
    self.knn = kNN.kNN(k, data_type, categorical_features, regression_data_set, alpha, beta, h, d)
    self.nn = kNN.kNN(1, data_type, categorical_features, regression_data_set, alpha, beta, h, d)
    # error threshold for regression classification
    self.error = error
    # store if this data set is a regression data set (True) or not (False)
    self.regression_data_set = regression_data_set
    self.results = Results.Results()
def __init__(self,
        # number of neighbors in knn
        kNeighbors: int,
        # number of clusters
        kValue: int,
        # data to cluster
        dataSet: np.ndarray,
        # 'mixed', 'categorical', or 'real' data set
        data_type: str,
        # list of integers representing categorical feature column indices
        categorical_features: list,
        # True if the data set is a regression data set
        regression_data_set: bool,
        # weight for real value in distance metric
        alpha: int,
        # weight for categorical value in distance metric
        beta: int,
        # bin width for gaussian kernel smoother
        h: float,
        # dimensionality of data set (# features)
        d: int,
        name: str,
        Testdata: np.ndarray):
    # create nearest neighbor objects that find the single nearest neighbor
    # and the k nearest neighbors of an input data point
    self.nn = kNN.kNN(1, data_type, [], regression_data_set, alpha, beta, h, d)
    self.knn = kNN.kNN(kNeighbors, data_type, [], regression_data_set, alpha, beta, h, d)
    self.categorical_features = []
    self.itermax = 5
    self.kValue = kValue
    # convert the test data such that no categorical values remain
    for j in range(len(Testdata)):
        Testdata[j] = self.ConvertData(Testdata[j], name)
    # set the test set value
    self.Testdata = Testdata
    # convert the data such that no categorical values remain
    for j in range(len(dataSet)):
        dataSet[j] = self.ConvertData(dataSet[j], name)
    # set the data set value
    self.dataSet = dataSet
    # set the dimensionality
    self.d = d
    # if the data set is machine, drop its two identifier columns
    if name == "machine":
        self.dataSet = self.dataSet[:, 2:]
        self.Testdata = self.Testdata[:, 2:]
        self.d = d - 2  # dimensionality of data set
    # save which features are real by deleting categorical indices from a new list
    real_features = list(range(d))
    # remove all of the categorical indices
    for i in categorical_features:
        real_features.remove(i)
    # set the real features object variable
    self.real_features = real_features
def addToGraph(L1_concept, depth, language='en'):
    lang_set = {"en", "sp"}
    L2_name = min(lang_set - {language})  # get the L2 name
    L1_vertex = g.get_vertex(L1_concept, all_vertices)
    if not isinstance(L1_vertex, Vertex):
        sys.exit("Fatal Error: instance is not a vertex")
    # Translate L1 concept to get L2 concept and add to L2 evoking list
    if language == 'en':
        L2_concept = (en2sp(L1_concept[0]), L1_concept[1])
        updateConcepts(L2_concept, evoked_sp, evoking_sp)
    else:
        L2_concept = (sp2en(L1_concept[0]), L1_concept[1])
        updateConcepts(L2_concept, evoked_en, evoking_en)
    # BASE CASE (no more levels to be added)
    if depth == 0:
        if language == 'en':
            L2_concept = (en2sp(L1_concept[0]), L1_concept[1])
        else:
            L2_concept = (sp2en(L1_concept[0]), L1_concept[1])
        addTranslation(L2_concept, L1_vertex, L2_name)
        return
    else:
        if language == 'en':
            # List of tuples of (name, distance_to_parent_concept)
            L1_evocations = kNN(L1_concept[0], evoked_en, evoking_en, k)
            L2_evocations = kNN(L2_concept[0], evoked_sp, evoking_sp, k)
        else:
            L1_evocations = kNN(L1_concept[0], evoked_sp, evoking_sp, k)
            L2_evocations = kNN(L2_concept[0], evoked_en, evoking_en, k)
        # Add L1 and L2 lists of most similar words to the graph
        addEvocations(L1_vertex, L1_evocations, language)
        L2_vertex = addTranslation(L2_concept, L1_vertex, L2_name)
        addEvocations(L2_vertex, L2_evocations, L2_name)
        for evocation in L1_evocations:
            # Add evocations of the given concept to the concept set
            if language == 'en':
                updateConcepts(evocation, evoked_en, evoking_en)
                addToGraph(evocation, depth - 1, 'en')
            else:
                updateConcepts(evocation, evoked_sp, evoking_sp)
                addToGraph(evocation, depth - 1, 'sp')
        for evocation in L2_evocations:
            if language == 'en':
                updateConcepts(evocation, evoked_sp, evoking_sp)
                addToGraph(evocation, depth - 1, 'sp')
            else:
                updateConcepts(evocation, evoked_en, evoking_en)
                addToGraph(evocation, depth - 1, 'en')
def handWritingTest():
    # define the labels
    labels = []
    # read the file directory
    fileList = os.listdir('trainingDigits')
    size = len(fileList)
    hwMat = np.zeros((size, 1024))
    for i in range(size):
        # parse the label from the file name
        fileName = fileList[i]
        labels.append(fileName.split('_')[0])
        hwMat[i, :] = img2vector('trainingDigits/%s' % fileName)
    testFileList = os.listdir('testDigits')
    m = len(testFileList)
    errorCount = 0.0
    for i in range(m):
        fileName = testFileList[i]
        label = fileName.split('_')[0]
        inputData = img2vector('testDigits/%s' % fileName)
        result = knn.kNN(inputData, hwMat, labels, 3)
        # compare by value; `is` tests identity and misclassifies equal strings
        if label == result:
            print("its Ok")
        else:
            errorCount += 1
            print("keep Going")
    print(errorCount / m)
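# The img2vector helper used above is not shown in this snippet. A minimal
# sketch, assuming each digit file stores a 32x32 grid of '0'/'1' characters
# (the layout the loops above imply):
def img2vector(filename):
    vec = np.zeros(1024)
    with open(filename) as fh:
        for i in range(32):
            line = fh.readline()
            for j in range(32):
                vec[32 * i + j] = int(line[j])
    return vec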
def main():
    ckNN = kNN(enableLoggingTime=True)
    # load data
    ckNN.loadData(fileName='../data/creditcard.csv', feaRowEnd=284808)
    # feature scaling
    ckNN.scaleFeature(minm=0, maxm=1)
    # feature reduction (loading previously saved data)
    feaSelecData = ckNN.loadVariables('featureExtractAll')
    ckNN.selectImportantFeatures(feaSelecData['selectedIndices'])
    ckNN.kSweep = [1, 3, 5]
    # do double cross-validation
    ValScoreList, \
    ValScoreStdList, \
    TestScoreList, \
    bestParamList, \
    allData = ckNN.doubleCrossValidate(ckNN.featureNumpy, ckNN.ClassNumpy,
                                       nFoldOuter=5, nFoldInner=4,
                                       scoring=scoring.MCC,
                                       isStratified=True,
                                       fileName='kNN/kNNData')
    print "Validation Avg. Score for outer folds with best param: \n"
    print ValScoreList
    print "Validation Score Std. for outer folds with best param: \n"
    print ValScoreStdList
    print "Test Avg. Score for outer folds with best param: \n"
    print TestScoreList
    print "Best Param list for outer folds: \n"
    print bestParamList
def predict(symbImgs):
    # symbImgs = symbols.getSymbolImages(binaryImages, symbolCenters, 28, 28)
    # Flatten using knn first
    knn = kNN()
    print "knn"
    flat_syms = knn.flatten_symbols(symbImgs)
    print "flat"
    binary_symbs = preprocessor.binarize(np.array(flat_syms))
    print "bin"
    knn.load_MNIST()
    print "loaded"
    m_data = flatten(knn.mnist_data)
    m_labels = knn.mnist_labels
    print "Dat"
    f_symbs = flatten(binary_symbs)
    fix_data_len(f_symbs)
    print "fb"
    LR = linear_model.LogisticRegression()
    inst = LR.fit(m_data[4800:], m_labels[4800:])
    results = []
    for i in range(1, len(f_symbs) + 1):
        if i % 1000 == 0:
            print i
        results.append(inst.predict(f_symbs[i - 1:i]))
    return results
def main():
    # ================================================
    # Load pre-trained model and remove higher level layers
    # ================================================
    base_model = VGG19(weights='imagenet')
    model = Model(input=base_model.input, output=base_model.get_layer('block4_pool').output)
    source_path = "db"
    source_paths = np.array(list(filter(
        lambda path: os.path.splitext(path)[1] in ['.jpg', '.jpeg'],
        np.array(os.listdir(source_path)))))
    # ================================================
    # Read images and convert them to feature vectors
    # ================================================
    imgs, filename_heads, X = [], [], []
    for f in [sys.argv[1]]:
        # Process filename
        filename = os.path.splitext(f)  # filename in directory
        head, ext = filename[0], filename[1]
        if ext.lower() not in [".jpg", ".jpeg"]:
            continue
        # Read image file
        img = image.load_img(f, target_size=(224, 224))  # load
        imgs.append(np.array(img))  # image
        filename_heads.append(head)  # filename head
        # Pre-process for model input
        img = image.img_to_array(img)  # convert to array
        img = np.expand_dims(img, axis=0)
        img = preprocess_input(img)
        features = model.predict(img).flatten()  # features
        X.append(features)  # append feature vector
    X = np.array(X)  # feature vectors
    imgs = np.array(imgs)  # images
    # ===========================
    # Find k-nearest images to each image
    # ===========================
    n_neighbours = 3
    pre_X = np.load('feature_matrix.npy')
    knn = kNN()  # kNN model
    knn.compile(n_neighbors=n_neighbours, algorithm="brute", metric="cosine")
    knn.fit(pre_X)
    n_imgs = len(imgs)
    ypixels, xpixels = imgs[0].shape[0], imgs[0].shape[1]
    for ind_query in range(n_imgs):
        # Find top-k closest image feature vectors to each vector
        distances, indices = knn.predict(np.array([X[ind_query]]))
        distances = distances.flatten()
        indices = indices.flatten()
        indices, distances = find_topk_unique(indices, distances, n_neighbours)
        print(json.dumps(source_paths[indices].flatten().tolist()))
        print(distances)
def main():
    kernel = c.COSINE
    # training parameters
    result_path = 'results/PB2_spam.acc'
    model_name = 'digits_' + kernel
    tr_data_path = 'data\\digits\\tr_f_l_10.pickle'
    te_data_path = 'data\\digits\\te_f_l_10.pickle'
    # load and preprocess training data
    tr_data = loader.load_pickle_file(tr_data_path)
    te_data = loader.load_pickle_file(te_data_path)
    # transpose labels
    tr_data[1] = np.transpose(tr_data[1])[0]
    te_data[1] = np.transpose(te_data[1])[0]
    Preprocess.normalize_features_all(Preprocess.zero_mean_unit_var, tr_data[0])
    Preprocess.normalize_features_all(Preprocess.zero_mean_unit_var, te_data[0])
    # start training
    st = time.time()
    print('{:.2f} Start training.'.format(time.time() - st))
    for r in (0.15, 0.1):
        clf = kNN.kNN(kernel=kernel, dataset=c.DS_DIGITS)
        clf.fit(tr_data[0], tr_data[1])
        tr_pred = clf.predict(tr_data[0], r=r)
        te_pred = clf.predict(te_data[0], r=r)
        tr_acc = (tr_data[1] == tr_pred).sum() / tr_data[0].shape[0]
        te_acc = (te_data[1] == te_pred).sum() / te_data[0].shape[0]
        print('{} Final results with kernel {} and r={}. Train acc: {}, Test acc: {}'.format(
            time.time() - st, kernel, r, tr_acc, te_acc))
def partC(k=10):
    train_titles, test_titles = load_titles()
    # use the k argument rather than a hard-coded neighbor count
    knn = kNN.kNN(k, kNN.cosine_similarity)
    knn.train("PS1_data/books.train")
    knn.load_test_file("PS1_data/books.test")
    results = [knn.choose_label(i) for i in knn.test_data]
    return results
def predict_missing_elements(data, k=5):
    import kNN
    # first, find the columns that have missing values
    isNaN = np.isnan(data)
    colHasNan = np.any(isNaN, axis=0)
    # print(colHasNan)
    # now, do column-wise imputation
    model = kNN.kNN(k, method='regression')
    for i, col in enumerate(colHasNan):
        if col:
            # print('predicting col', i, data[:, i], isNaN[:, i])
            # delete rows that have NaN in column i
            train = np.delete(data, isNaN[:, i], axis=0)
            # fit a kNN model to the data;
            # delete column i - the column to predict
            model.fit(np.delete(train, i, axis=1), train[:, i])
            # this is inefficient but much easier to code
            p = model.predict(np.delete(data, i, axis=1))
            # keep original values where present, use predictions where NaN
            data[:, i] = np.where(isNaN[:, i], p, data[:, i])
            # print('predicted:', data[:, i])
    return data
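# A small usage sketch for predict_missing_elements with hypothetical data;
# it assumes the kNN module above exposes the regression-mode constructor
# used inside the function.
demo = np.array([[1.0, 2.0], [2.0, np.nan], [3.0, 6.0], [4.0, 8.0]])
filled = predict_missing_elements(demo, k=2)
print(filled)  # the NaN in column 1 is replaced by a kNN estimate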
def partB():
    knn = kNN.kNN(1, kNN.cosine_similarity)
    knn.train("PS1_data/books.train")
    knn.load_test_file("PS1_data/books.test")
    correct_pred, predict, true = [0] * 5, [0] * 5, [0] * 5
    precision, recall = [0] * 5, [0] * 5
    genre_centroid = kNN.get_genre_centroid(knn.training_data, knn.training_labels)
    for i in range(len(knn.test_data)):
        sim_max, label = -1, -1
        for j in range(len(genre_centroid)):
            sim = kNN.cosine_similarity(knn.test_data[i], genre_centroid[j])
            if sim > sim_max:
                sim_max = sim
                label = j
        if label == int(knn.test_labels[i]):
            correct_pred[label] += 1
        predict[label] += 1
        true[int(numpy.asscalar(knn.test_labels[i]))] += 1
    accuracy = float(sum(correct_pred)) / len(knn.test_data)
    for i in range(5):
        if predict[i] != 0:
            precision[i] = float(correct_pred[i]) / predict[i]
        if true[i] != 0:
            recall[i] = float(correct_pred[i]) / true[i]
    return accuracy, precision, recall
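# get_genre_centroid is not shown in this file. A minimal sketch, assuming it
# returns one mean feature vector per genre label (0-4), which is what the
# cosine-similarity comparison above requires:
def get_genre_centroid(training_data, training_labels):
    centroids = []
    for genre in range(5):
        members = [training_data[i] for i in range(len(training_data))
                   if int(training_labels[i]) == genre]
        centroids.append(numpy.mean(members, axis=0))
    return centroids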
def kNearest():
    print('\nPart 3: k-Nearest Neighbor')
    k_test = [1, 5, 11, 15, 21]
    for k in k_test:
        clf = kNN(X_train, y_train, k=k)
        preds = clf.predict(X_test)
        print('k =', k, ';', 'accuracy on test set =', np.mean(preds == y_test))
def classify(trainSet, trainLabels, testSet, k=50):
    predictedLabels = zeros(testSet.shape[0])
    # use the kNN algorithm to predict a label for each test point
    for i in range(testSet.shape[0]):
        predictedLabels[i] = kNN(k, trainSet, trainLabels, testSet[i])
    return predictedLabels
def task1(param_set, runs=20):
    """
    This function produces basic results for the k-NN algorithm.

    Input:
        param_set: arr or list -- the parameters (k values) of the
            k-Nearest Neighbour algorithm
        runs: int -- number of runs to average the 'best' k over

    Returns 4 lists containing the train error means, train error stds,
    test error means, and test error stds of all parameters.
    """
    # save the means and stds of all parameters
    train_means = []
    train_stds = []
    test_means = []
    test_stds = []
    # loop through the parameters, running multiple iterations per param
    for k in tqdm(param_set, 'k'):
        # save errors of this parameter
        all_train_errors = np.zeros(runs)
        all_test_errors = np.zeros(runs)
        # run multiple iterations for averaging
        for this_run in tqdm(range(runs), 'run'):
            # split the dataset into train and test
            train_set, test_set = train_test_split(data, test_size=0.2)
            train_set = LabelledDataset(train_set)
            test_set = LabelledDataset(test_set)
            # initialize kNN
            knn = kNN(train_set, test_set, k)
            # calculate and store errors
            train_error = knn.accuracy(train_set.data, train_set.labels)
            test_error = knn.accuracy(test_set.data, test_set.labels)
            all_train_errors[this_run] = train_error
            all_test_errors[this_run] = test_error
        # calculate means and standard deviations of errors
        train_error_mean = np.mean(all_train_errors)
        train_error_std = np.std(all_train_errors)
        test_error_mean = np.mean(all_test_errors)
        test_error_std = np.std(all_test_errors)
        print('k =', k)
        print('train error: %f ± %f' % (train_error_mean, train_error_std))
        print('test error: %f ± %f\n' % (test_error_mean, test_error_std))
        train_means.append(train_error_mean)
        train_stds.append(train_error_std)
        test_means.append(test_error_mean)
        test_stds.append(test_error_std)
    return train_means, train_stds, test_means, test_stds
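# A brief usage sketch for task1; it assumes the module-level `data` array and
# the helpers (train_test_split, LabelledDataset, kNN, tqdm) this script imports.
train_means, train_stds, test_means, test_stds = task1([1, 3, 5, 9, 15], runs=20)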
def __init__(self,
        # number of neighbors in knn
        kNeighbors: int,
        # number of clusters
        kValue: int,
        # data to cluster
        dataSet: np.ndarray,
        # 'mixed', 'categorical', or 'real' data set
        data_type: str,
        # list of integers representing categorical feature column indices
        categorical_features: list,
        # True if the data set is a regression data set
        regression_data_set: bool,
        # weight for real value in distance metric
        alpha: int,
        # weight for categorical value in distance metric
        beta: int,
        # bin width for gaussian kernel smoother
        h: float,
        # dimensionality of data set (# features)
        d: int,
        # pass in the test data set at init
        Testdata: np.ndarray):
    # create nearest neighbor objects that find the single nearest neighbor
    # and the k nearest neighbors of an input data point
    self.nn = kNN.kNN(1, data_type, categorical_features, regression_data_set, alpha, beta, h, d)
    self.knn = kNN.kNN(kNeighbors, data_type, categorical_features, regression_data_set, alpha, beta, h, d)
    self.categorical_features = categorical_features
    # save which features are real by deleting categorical indices from a new list
    real_features = list(range(d))
    for i in categorical_features:
        real_features.remove(i)
    self.real_features = real_features
    self.kValue = kValue
    self.dataSet = dataSet
    # dimensionality of data set
    self.d = d
    self.itermax = 10
    self.Testdata = Testdata
    self.initial_medoids = self.choose_random_medoids()
    self.assignments = []
def main():
    dataframe = data.getData()
    dataframe = dataframe.get_values()
    mask = np.random.rand(len(dataframe)) < 0.8
    trainingData, testingData = dataframe[mask], dataframe[~mask]
    model = kNN.kNN(8, euclideanDistance, trainingData)
    columns = [x for x in range(3, 11)]
    predictions = model.getPredictions(testingData, columns)
    print(getAccuracy(testingData, predictions))
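# euclideanDistance is passed to the kNN constructor above but not defined in
# this snippet. A minimal sketch, assuming it takes two equal-length numeric
# vectors:
def euclideanDistance(a, b):
    return np.sqrt(np.sum((np.asarray(a) - np.asarray(b)) ** 2))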
def main(train_path="PS1_data/faces.train", test_path="PS1_data/faces.test", k=1):
    knn = kNN.kNN(k=k, fun=kNN.inverse_euclidean_distance)
    knn.train(train_path)
    knn.load_test_file(test_path)
    for i in knn.test_data:
        neighbors = knn.test(i)
        x = complete_face(knn, neighbors)
        matplotlib.pyplot.gray()
        matplotlib.pyplot.imshow(x.reshape((64, 64)))
        matplotlib.pyplot.show()
def createModel():
    trainingData = loadDataset.loadTrainingData()
    data = []
    labels = []
    for dataDict in trainingData:
        data.extend(dataDict[b"data"])
        labels.extend(dataDict[b"labels"])
    dataNP = np.asarray(data)
    labelsNP = np.asarray(labels)
    return kNN.kNN(dataNP, labelsNP)
def datingClassTest(X, Y):
    testRatio = 0.10
    normMat, minVal, diff = autoNorm(X)
    m = len(X)
    numTestData = int(m * testRatio)
    errorCount = 0.0
    # 900 training | 100 testing
    for i in range(numTestData):
        predict = kNN(normMat[numTestData:m, :], Y[numTestData:m], normMat[i, :], 3)
        print "the classifier came back with: %d, the real answer is: %d" \
            % (predict, Y[i])
        if predict != Y[i]:
            errorCount += 1
    print "error rate is: %f" % (errorCount / float(numTestData))
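# autoNorm is assumed above but not shown. A minimal sketch, min-max scaling
# each feature column to [0, 1] and returning the values needed to rescale
# (matching the (normMat, minVal, diff) unpacking above):
def autoNorm(X):
    minVal = X.min(axis=0)
    diff = X.max(axis=0) - minVal
    return (X - minVal) / diff, minVal, diff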
def main():
    is_sklearn = False
    # kernel = c.COSINE
    # kernel = c.GAUSSIAN
    kernel = c.POLY
    # training parameters
    result_path = 'results/PB2_spam.acc'
    model_name = 'digits_' + kernel
    model_path = 'data/PB1_B_digits_sk_Gaussian_1.model'
    # tr_data_path = 'data\\digits\\tr_f_l.pickle'
    # te_data_path = 'data\\digits\\te_f_l.pickle'
    tr_data_path = 'data\\digits\\tr_f_l_10.pickle'
    te_data_path = 'data\\digits\\te_f_l_10.pickle'
    # load and preprocess training data
    tr_data = loader.load_pickle_file(tr_data_path)
    te_data = loader.load_pickle_file(te_data_path)
    # transpose labels
    tr_data[1] = np.transpose(tr_data[1])[0]
    te_data[1] = np.transpose(te_data[1])[0]
    Preprocess.normalize_features_all(Preprocess.zero_mean_unit_var, tr_data[0])
    Preprocess.normalize_features_all(Preprocess.zero_mean_unit_var, te_data[0])
    # start training
    models = []
    st = time.time()
    print('{:.2f} Start training.'.format(time.time() - st))
    for k in (1, 3, 7):
        if not is_sklearn:
            clf = kNN.kNN(kernel=kernel)
            clf.fit(tr_data[0], tr_data[1])
            tr_pred = clf.predict(tr_data[0], k=k)
            te_pred = clf.predict(te_data[0], k=k)
        else:
            clf = KNeighborsClassifier(n_neighbors=k, metric=cosine_distances)
            clf.fit(tr_data[0], tr_data[1])
            tr_pred = clf.predict(tr_data[0])
            te_pred = clf.predict(te_data[0])
        tr_acc = (tr_data[1] == tr_pred).sum() / tr_data[0].shape[0]
        te_acc = (te_data[1] == te_pred).sum() / te_data[0].shape[0]
        models.append(clf)
        print('{} Final results with kernel {} and k={}. Train acc: {}, Test acc: {}'.format(
            time.time() - st, kernel, k, tr_acc, te_acc))
def __init__(self, N_d, N_o, threshold=5, f=None, f_rwd=rwd_basic):
    # self.h = MLPpf(N_i=len(self.g.get_weight_vector()), N_h=20, N_o=1)
    # self.h = RLS(N_i=len(self.g.get_weight_vector()), N_o=1)
    self.h = kNN(N_i=N_d, N_o=N_o, k=5, n=100)
    # self.h = MLPbp(N_i=N_d, N_h=5, N_o=N_o, f=sigmoid, fo=linear)
    self.f = f  # non-linearity (optional)
    # self.h = RLS(N_d, N_o)
    self.learn_rate = 0.1
    self.w = zeros(N_d)  # random.randn(N_d) * 0.1 # the state (actually a vector of weights)
    self.t = 0  # temperature
    self.threshold = threshold  # desperation threshold
    self._y = -100.0  # old reward/error/target
    self.f_rwd = f_rwd
    self.burn_in = 5
def kNN_Validate(dataName, grpName, folds, k=3, d=2, trans=None):
    """
    params:
        dataName := file with the data set
        grpName := file with the different groupings
        folds := number of folds
        k := number of neighbors to base the classification on (default 3)
        d := the Minkowski distance order to use (default 2)
        trans := transformation function to be applied to the data set
    objective:
        performs cross-validation using kNN as the classifier
    returns:
        ROC statistics built from tuples of (test_predicted, test_groundTruth)
    """
    valid = vd.Validate(grpName, folds)
    data, labels = bd(dataName)
    results = []  # stores tuples: (list_predicted, list_groundTruth)
    for i in range(valid.getFoldCount()):
        print("kNN iteration %d" % i)
        # get the train and test indices of the data set
        testIndex, trainIndex = valid.getTest(i), valid.getTrain(i)
        # build the test set and test labels
        testSet, testLabels = data[testIndex, :], labels[testIndex]
        # build the train set and training labels
        trainSet, trainLabels = data[trainIndex, :], labels[trainIndex]
        # if the data is to be transformed
        if trans is not None:
            if trans is fld:
                tmp = trans(trainSet, trainLabels)
                trainSet = np.matmul(trainSet, tmp)
                trainSet = trainSet.reshape(-1, 1).astype(np.float64)
                testSet = np.matmul(testSet, tmp)
                testSet = testSet.reshape(-1, 1).astype(np.float64)
            else:
                tmp = trans(trainSet).transpose()
                trainSet = np.matmul(trainSet, tmp)
                testSet = np.matmul(testSet, tmp)
        # standardize the training and test set
        trainSet, testSet = standard(trainSet, testSet)
        # classify the test set and add it to the results list
        results.append((knn.kNN(trainSet, testSet, trainLabels, k, d), testLabels))
    results = ev.buildConfusionMatrices(results)
    results = ev.normalizeConfMat(results)
    results = ev.getAvgProbMatrix(results)
    print("knn results", results)
    results = ev.rocData(results)
    print("%d-NN Accuracy: %f" % (k, results["Acc"]))
    return results
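# A hypothetical invocation of kNN_Validate; the file names are placeholders
# for whatever data/grouping files the surrounding project uses.
stats = kNN_Validate("iris.data", "iris.groups", folds=10, k=3, d=2)
print(stats["Acc"])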
def partA():
    TRAIN_TITLES, toss = load_titles()
    knn = kNN.kNN(10, kNN.cosine_similarity)
    knn.train("PS1_data/books.train")
    knn.load_test_file("PS1_data/books.train")
    n = TRAIN_TITLES.index(
        'title-Fifty Shades of Grey: Book One of the Fifty Shades Trilogy\n')
    m = TRAIN_TITLES.index('title-Brains: A Zombie Memoir\n')
    x = knn.test_num(n)
    y = knn.test_num(m)
    a = [TRAIN_TITLES[n[0]] for n in x]
    b = [TRAIN_TITLES[n[0]] for n in y]
    print "Fifty Shades of Grey is most similar to:"
    print string.join(a)
    print "Brains: A Zombie Memoir is most similar to:"
    print string.join(b)
def recommend_songs(self, songnumber):
    if use_pca:
        # get reduced (2-dimensional) data using PCA
        # start_time()
        self.transformed = PCA(self.X)
        # stop_time("PCA")
    elif use_lem:
        # get reduced (2-dimensional) data using LEM
        # start_time()
        self.transformed = LEM(self.X)
        # stop_time("LEM")
    # get the seed data point
    self.p = self.transformed[songnumber]
    # get the 20 nearest neighbors of the seed
    self.idx = kNN(self.transformed, self.p, 20)[0]
def knn_worker(q, fold, data_set):
    # get the ten folds
    tenFolds = experimental_data_sets[data_set][3]
    # pick the fold according to this run
    test = copy.deepcopy(tenFolds[fold])
    # append all remaining folds to the training data set
    remaining_folds = [x for i, x in enumerate(tenFolds) if i != fold]
    training = np.concatenate(remaining_folds)
    data_dimension = len(experimental_data_sets[data_set][0]) - 1
    if feature_data_types[data_set] != "mixed":
        alpha = 1
        beta = 1
    else:
        alpha = 1
        beta = alpha * tuned_delta_value[data_set]
    knn = kNN.kNN(
        # feed in the tuned k value
        tuned_k[data_set],
        # supply mixed, real, categorical nature of features
        feature_data_types[data_set],
        # feed in the categorical attribute indices stored in a global array
        categorical_attribute_indices[data_set],
        # store whether the data set is a regression data set
        regression_data_set[data_set],
        # weight for real distance
        alpha=alpha,
        # weight for categorical distance
        beta=beta,
        # kernel window size
        h=tuned_bin_value[data_set],
        # set the dimensionality of the data set in kNN
        d=data_dimension)
    # classify the fold against the remaining training data
    classifications = knn.classify(training, test)
    metadata = ["KNN", data_set]
    # calculate the performance
    results_set = results.LossFunctionPerformance(
        regression_data_set[data_set], classifications)
    data_point = metadata + results_set
    data_point_string = ','.join([str(x) for x in data_point])
    # queue the results and return them
    q.put(data_point_string)
    return data_point_string
def cNN(dataPool, labelPool):
    prototypePool = list()
    protoLabelPool = list()
    # put the first data point in the STORE
    prototypePool.append(dataPool.pop())
    protoLabelPool.append(labelPool.pop())
    for index, data in enumerate(dataPool):
        dataLabel = labelPool[index]
        nearestPrototypeLabel = kNN(data, prototypePool, protoLabelPool, 1)
        # absorb only the points the current prototypes misclassify
        if nearestPrototypeLabel != dataLabel:
            prototypePool.append(dataPool[index])
            protoLabelPool.append(labelPool[index])
    return prototypePool, protoLabelPool
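# A small usage sketch for cNN (condensed nearest neighbor). The lists are
# hypothetical, and note that cNN mutates them via pop(); kNN here is the
# 1-NN classifier used inside cNN above.
dataPool = [[0.0, 0.0], [0.1, 0.0], [5.0, 5.0], [5.1, 5.0]]
labelPool = [0, 0, 1, 1]
prototypes, protoLabels = cNN(dataPool, labelPool)
# redundant interior points are dropped; the prototype set still classifies
# the pool the same way 1-NN on the full pool would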
def main():
    # training parameters
    is_sklearn = True
    k = 10  # fold
    result_path = 'results/PB2_spam.acc'
    model_name = 'spam_' + str(k) + 'fold'
    data_path = 'data/spam/data.pickle'
    # load and preprocess training data
    training_data = loader.load_pickle_file(data_path)
    # TODO convert labels from {0, 1} to {-1, 1}
    # util.replace_zero_label_with_neg_one(training_data)
    # Preprocess.normalize_features_all(Preprocess.zero_mean_unit_var, training_data[0])
    # training_data[0] = preprocessing.scale(training_data[0])
    # start training
    training_errs = []
    testing_errs = []
    print('Preparing k fold data.')
    k_folds = Preprocess.prepare_k_folds(training_data, k)
    for i in (0,):
        st = time.time()
        tr_data, te_data = Preprocess.get_i_fold(k_folds, i)
        # start training
        print('{:.2f} Start training.'.format(time.time() - st))
        kernel = c.EUCLIDEAN
        # kernel = c.GAUSSIAN
        f_select = True
        best_features_num = 5
        clf = kNN.kNN(kernel=kernel)
        clf.fit(tr_data[0], tr_data[1], f_select=f_select, best_f=best_features_num)
        print("Best features: {}".format(clf.best_f_indices))
        for kk in (1, 2, 3, 7):
            tr_pred = clf.predict(tr_data[0], k=kk)
            te_pred = clf.predict(te_data[0], k=kk)
            tr_acc = (tr_data[1] == tr_pred).sum() / tr_data[0].shape[0]
            te_acc = (te_data[1] == te_pred).sum() / te_data[0].shape[0]
            print('{} Final results with kernel {}, k={}. Train acc: {}, Test acc: {}'.format(
                time.time() - st, kernel, kk, tr_acc, te_acc))
def main():
    # training parameters
    k = 8  # fold
    result_path = 'results/PB2_spam.acc'
    model_name = 'spam_' + str(k) + 'fold'
    data_path = 'data/spam/data.pickle'
    # load and preprocess training data
    training_data = loader.load_pickle_file(data_path)
    # TODO convert labels from {0, 1} to {-1, 1}
    # util.replace_zero_label_with_neg_one(training_data)
    Preprocess.normalize_features_all(Preprocess.zero_mean_unit_var, training_data[0])
    # Preprocess.normalize_features_all(Preprocess.shifiat_and_scale, training_data[0])
    # start training
    training_accs = []
    testing_accs = []
    print('Preparing k fold data.')
    k_folds = Preprocess.prepare_k_folds(training_data, k)
    kernel = c.EUCLIDEAN
    sst = time.time()
    for i in (1,):
        st = time.time()
        tr_data, te_data = Preprocess.get_i_fold(k_folds, i)
        # start training
        print('{:.2f} Start training.'.format(time.time() - st))
        for r in (2.5, 2.7):
            clf = kNN.kNN(kernel=kernel)
            # clf.fit(training_data[0], training_data[1])
            clf.fit(tr_data[0], tr_data[1])
            # tr_pred = clf.predict(training_data[0], r=r)
            tr_pred = clf.predict(tr_data[0], r=r)
            te_pred = clf.predict(te_data[0], r=r)
            # tr_acc = (training_data[1] == tr_pred).sum() / training_data[0].shape[0]
            tr_acc = (tr_data[1] == tr_pred).sum() / tr_data[0].shape[0]
            te_acc = (te_data[1] == te_pred).sum() / te_data[0].shape[0]
            testing_accs.append(te_acc)
            print('{} {}-fold results with kernel {}, r={}. Train acc: {}, Test acc: {}'.format(
                time.time() - st, i, kernel, r, tr_acc, te_acc))
def main():
    fop = Files()
    # fetch the training data
    ret, digitsVec, labelVec = fop.txt2mat('digits/trainingDigits')
    knnClass = kNN()
    path = 'digits/testDigits'
    testList = os.listdir(path)
    total = len(testList)
    errCnt = 0
    for index in range(total):
        name = testList[index]
        label = name.split('.')[0][0]
        tmp = path + '/' + name
        digits = fop.readMat(tmp, 32, 32)
        result = knnClass.classification(digits, digitsVec, labelVec, 3)
        if label != result:
            errCnt += 1.0
    err = errCnt / total
    return err
def main():
    classifier = kNN()
    dataset = get_dataset()
    dataset = list(form(dataset))
    random.shuffle(dataset)
    training = dataset[:100]
    testing = dataset[100:]
    classifier = train(classifier, training)
    before = classifier.size
    classifier.compress(3.0)
    after = classifier.size
    correct = 0
    for feature, label in testing:
        seems = classifier.classify(feature)
        if seems == label:
            correct += 1
    print(correct / len(testing), before, after)
def predict(symbImgs, topology):
    # flatten using knn first
    knn = kNN.kNN()
    flat_syms = knn.flatten_symbols(symbImgs)
    binary_symbs = preprocessor.binarize(flat_syms)
    flattened = logistic_regression.flatten(binary_symbs)
    knn.load_MNIST()
    m_data = logistic_regression.flatten(knn.mnist_data)
    m_labels = knn.mnist_labels
    nn = neural_network.NeuralNet(topology)
    print("> Fitting NN")
    m_labels = toArray(m_labels[:10000])
    nn.fit(m_data[:10000], m_labels)
    results = []
    print("> Predicting NN")
    for i in range(1, len(flattened) + 1):
        if i % 1000 == 0:
            print(i)
        results.append(nn.predict(flattened[i - 1:i]))
    return results
def APS():
    # Takes a posted parameter of format:
    # {"lid": location_id, "APS": [{"MAC": MAC, "strength": STRENGTH, "std": STD}, ...]}
    # TODO: CHANGE NAME OF THIS OR /aps
    lid = -1
    try:
        data = request.get_json(force=True)  # TODO: remove force if possible
        lid = int(data['lid'])
        if lid < 1:
            from kNN import kNN, AccessPoint
            from datetime import datetime
            knnData = {}
            APS = {}
            for item in data["APS"]:
                if 'std' in item:
                    APS[item['MAC']] = AccessPoint((item['MAC'], float(item['strength']),
                                                    float(item['std']), datetime.now(), 10))
                else:
                    APS[item['MAC']] = AccessPoint((item['MAC'], float(item['strength']),
                                                    0, datetime.now(), 10))
            (x, y, floor) = kNN(APS)
            # cur.execute("""INSERT into demhoes (x, y, recorded)
            #                VALUES ( %s, %s, NOW() )""", [x, y])  # UTC TIME
            return json.dumps({'success': {"x": x, "y": y, "floor": floor}})
        else:
            cur.execute("""SELECT count(*) from accesspoint where location_id=%s""", [lid])
            count = cur.fetchone()[0]
            if not count or int(count) == 0:
                # will only log new data -- if already logged, ignore
                for item in data["APS"]:
                    if 'std' in item:
                        cur.execute("""INSERT into accesspoint (MAC, strength, location_id, std_dev, recorded)
                                       VALUES ( %s, %s, %s, %s, NOW() )""",
                                    [item['MAC'], float(item['strength']), lid, float(item['std'])])  # UTC TIME
                    else:
                        cur.execute("""INSERT into accesspoint (MAC, strength, location_id, std_dev, recorded)
                                       VALUES ( %s, %s, %s, %s, NOW() )""",
                                    [item['MAC'], float(item['strength']), lid, -1])  # UTC TIME
    except Exception, e:
        handle_exception(e)
        return ERROR_RETURN
def main():
    ckNN = kNN(enableLoggingTime=True)
    # load data
    ckNN.loadData(fileName='../data/creditcard.csv', feaRowEnd=30808)
    # ckNN.dataConvertToNumpy()
    # split train and test by a 70-30 ratio
    ckNN.testTrainSplit(ckNN.feature, ckNN.Class, test_size=0.3)
    # load model
    ckNN.loadModel(n_neighbors=3)
    print(ckNN.toString())
    # train model
    ckNN.trainModel(featureTrain=ckNN.fTrain, classTrain=ckNN.cTrain)
    # test model
    ckNN.classPred = ckNN.testModel(featureTest=ckNN.fTest)
    # metrics
    accuracy, matConf, matCohenKappa, \
        strClassificationReport = ckNN.getMetrics(classTest=ckNN.cTest,
                                                  classPred=ckNN.classPred,
                                                  boolPrint=True)
    # cmap figure generation for the confusion matrix
    ckNN.printConfusionMatrix(matConf)
def LEM(X, ndim=2, k=4):
    N = X.shape[0]  # number of data vectors
    d = X.shape[1]  # number of dimensions
    # check if ndim is larger than or equal to the current dimensionality
    if ndim >= d:
        ndim = d - 1
    # create adjacency matrix with weights
    W = np.zeros((N, N))
    for i in range(N):
        idx, eucdst = kNN(X[i], X, k)  # get k nearest neighbours and distance array
        for j in range(k):
            # weight with heat kernel e**(-||x1-x2||**2/t) where t = 200
            heat = np.exp(-eucdst[idx[j]]**2 / 200)
            W[i, idx[j]] = heat
            W[idx[j], i] = heat
            # # alternative: weight with {0, 1}
            # W[i, idx[j]] = 1
            # W[idx[j], i] = 1
    # create diagonal weight matrix (with column sums of W)
    D = np.diag(W.sum(axis=1))
    # create the graph Laplacian matrix
    L = D - W
    # get eigenvalues and eigenvectors of the Laplacian
    # (use linalg.eigh since L is symmetric)
    eigval, eigvec = np.linalg.eigh(L)
    eigval = np.real(eigval)
    eigvec = np.real(eigvec)
    # get eigenvalue indices sorted by smallest eigenvalue
    index = eigval.argsort()
    # return the embedded matrix (skip the first eigenvector since it is constant)
    transformed = eigvec[:, index[1:ndim + 1]]
    return transformed
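# A quick smoke test for LEM on synthetic data; it assumes the kNN helper used
# above returns (neighbor_indices, distance_array) for a query row against X.
X_demo = np.random.rand(50, 10)
emb = LEM(X_demo, ndim=2, k=4)
print(emb.shape)  # (50, 2)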
def tune_knn_parallel_worker(q, data_set: str, k_value: int, delta_value: int, bin_value: int):
    # print('inside function', data_set, k_value)
    data_dimension = tuning_data_dict[data_set].shape[1] - 1
    if feature_data_types[data_set] != "mixed":
        alpha = 1
        beta = 1
    else:
        alpha = 1
        beta = alpha * delta_value
    # print("this far", data_set)
    knn = kNN.kNN(
        # k value
        k_value,
        # supply mixed, real, categorical nature of features
        feature_data_types[data_set],
        # feed in the categorical attribute indices stored in a global array
        categorical_attribute_indices[data_set],
        # store whether the data set is a regression data set
        regression_data_set[data_set],
        # weight for real distance
        alpha,
        # weight for categorical distance
        beta,
        # kernel window size
        bin_value,
        # set the dimensionality of the data set in kNN
        data_dimension)
    classifications = knn.classify(tuning_data_dict[data_set], tuning_data_dict[data_set])
    metadata = [data_set, k_value, beta / alpha, bin_value]
    results_set = results.LossFunctionPerformance(
        regression_data_set[data_set], classifications)
    data_point = metadata + results_set
    data_point_string = ','.join([str(x) for x in data_point])
    # put the result on the multiprocessing queue
    q.put(data_point_string)
def handwritingClassTest():
    hwLabels = []
    trainingFileList = os.listdir('2.KNN/trainingDigits')
    m = len(trainingFileList)
    trainingMat = zeros((m, 1024))
    for i in range(m):
        fileNameStr = trainingFileList[i]
        classNumStr = int(fileNameStr.split('.')[0].split('_')[0])
        hwLabels.append(classNumStr)
        trainingMat[i, :] = img2vector('2.KNN/trainingDigits/%s' % fileNameStr)
    testFileList = os.listdir('2.KNN/testDigits')
    errorCount = 0.0
    mTest = len(testFileList)
    for i in range(mTest):
        fileNameStr = testFileList[i]
        classNumStr = int(fileNameStr.split('.')[0].split('_')[0])
        vector = img2vector('2.KNN/testDigits/%s' % fileNameStr)
        predict = kNN(trainingMat, hwLabels, vector, 3)
        print "the classifier came back with: %d, the real answer is: %d" % (
            predict, classNumStr)
        if predict != classNumStr:
            errorCount += 1.0
    print "\nthe total number of errors is: %d" % errorCount
    print "\nthe total error rate is: %f" % (errorCount / float(mTest))
def handwriting():
    dataset_path = '../../../datasets/Digits/'
    training_list = os.listdir(os.path.join(dataset_path, "trainingDigits"))
    test_list = os.listdir(os.path.join(dataset_path, "testDigits"))
    train_len = len(training_list)
    test_len = len(test_list)
    trainingMat = np.zeros((train_len, 1024))
    train_labels = []
    test_labels = []
    for i in range(train_len):
        splited_path = training_list[i].split('.')[0]
        label = int(splited_path.split('_')[0])
        train_labels.append(label)
        train_path = os.path.join(dataset_path, "trainingDigits", training_list[i])
        trainingMat[i, :] = get_np_image(train_path).reshape(-1)
    for i in range(test_len):
        splited_path = test_list[i].split('.')[0]
        label = int(splited_path.split('_')[0])
        test_labels.append(label)
        test_path = os.path.join(dataset_path, "testDigits", test_list[i])
        # print(len(img2vector(test_path)[0]))
        testVector = get_np_image(test_path).reshape(-1)
        label_pre = kNN.kNN(trainingMat, testVector, 3, train_labels)
        print(label_pre, test_labels[i])
    print(train_labels)
    print(test_labels)
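# get_np_image is not shown here. A minimal sketch, assuming each digit file
# is a 32x32 text grid of '0'/'1' characters like the trainingDigits format:
def get_np_image(path):
    with open(path) as fh:
        rows = [[int(ch) for ch in line.strip()[:32]] for line in fh.readlines()[:32]]
    return np.array(rows)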
def hwClassTrain():
    '''
    Training function for handwritten digit recognition.
    '''
    # path to the training set
    trainDir = r'D:\VScodePython\机器学习算法\k近邻\手写数字识别\trainingDigits'
    # read all file names in the training set
    trainFileList = listdir(trainDir)
    # number of training samples
    numOfTrain = len(trainFileList)
    # initialize the training labels
    hwLabels = []
    # initialize the training data
    trainMat = np.zeros((numOfTrain, 1024))
    for i in range(numOfTrain):
        fileName = trainFileList[i]
        # read the data
        trainMat[i, :] = img2vec(trainDir + '\\' + fileName)
        # the first field of the file name is the label, e.g. 0_0.txt
        hwLabels.append(int(fileName.split('.')[0].split('_')[0]))
    # global variable
    global model
    # use the hand-written kNN implementation
    model = kNN(trainMat, hwLabels)
"""Read the first 32 characters of the first 32 rows of an image file. @return <ndarray>: a 1x(1024+1) numpy array with data and label, while the label is defaults to 0. """ img = "" for line in open(img_fn).readlines()[:32]: img += line[:32] # labels are always attached at the last position itera = [_ for _ in img + str(label)] return numpy.fromiter(itera, "f4") if __name__ == "__main__": training_set_files = os.listdir(r"./trainingDigits") # initiate a matrix, don't forget to allocate the space for the label # 32 row x 32 col + label training_set = numpy.zeros((len(training_set_files), 32*32+1)) for i in xrange(len(training_set_files)): # e.g. with filename 0_1.txt label is 0 image_file = r"./trainingDigits/" + training_set_files[i] label = training_set_files[i].split('_')[0] training_set[i, :] = img_to_vector(image_file, label) knn = kNN.kNN(3, training_set, False) for fn in os.listdir(r"./testDigits"): print knn.classify(img_to_vector(r"./testDigits/%s" % fn)), ", correct number is %s" % fn.split('_')[0]
xTrain = xDat[train_obs]
yTrain = yDat[train_obs]
# create val set
xVal = xDat[val_obs]
yVal = yDat[val_obs]
# create test set
xTest = xDat[test_obs]
yTest = yDat[test_obs]

# find hyperparameters that work best on the validation set
val_accuracies = []
for k in [1, 2, 3, 5, 10, 20, 50, 100]:
    pred = kn.kNN(xTrain, xVal, yTrain, yVal, k)
    acc = np.mean(pred == yVal.T)
    print("accuracy: %f" % (acc,))
    val_accuracies.append((k, acc))
val_accuracies
# tie between 2 and 3, will use 2 (the simpler model is better)

# test with the best working hyperparameters
# best is with k = 2
prediction = kn.kNN(xTrain, xTest, yTrain, yTest, k=2)
np.mean(prediction == yTest.T)

# attempt at line profiling
%load_ext line_profiler
%lprun -s -f kn.kNN -T lp_results.txt kn.kNN(xTrain, xTest, yTrain, yTest, k=2)
def main():
    # ================================================
    # Load pre-trained model and remove higher level layers
    # ================================================
    print("Loading VGG19 pre-trained model...")
    base_model = VGG19(weights='imagenet')
    model = Model(input=base_model.input, output=base_model.get_layer('block4_pool').output)
    # ================================================
    # Read images and convert them to feature vectors
    # ================================================
    imgs, filename_heads, X = [], [], []
    path = "db"
    print("Reading images from '{}' directory...\n".format(path))
    for f in os.listdir(path):
        # Process filename
        filename = os.path.splitext(f)  # filename in directory
        filename_full = os.path.join(path, f)  # full path filename
        head, ext = filename[0], filename[1]
        if ext.lower() not in [".jpg", ".jpeg"]:
            continue
        # Read image file
        img = image.load_img(filename_full, target_size=(224, 224))  # load
        imgs.append(np.array(img))  # image
        filename_heads.append(head)  # filename head
        # Pre-process for model input
        img = image.img_to_array(img)  # convert to array
        img = np.expand_dims(img, axis=0)
        img = preprocess_input(img)
        features = model.predict(img).flatten()  # features
        X.append(features)  # append feature vector
    X = np.array(X)  # feature vectors
    imgs = np.array(imgs)  # images
    print("imgs.shape = {}".format(imgs.shape))
    print("X_features.shape = {}\n".format(X.shape))
    # ===========================
    # Find k-nearest images to each image
    # ===========================
    n_neighbours = 5 + 1  # +1 as each image is most similar to itself
    knn = kNN()  # kNN model
    knn.compile(n_neighbors=n_neighbours, algorithm="brute", metric="cosine")
    knn.fit(X)
    # ==================================================
    # Plot recommendations for each image in database
    # ==================================================
    output_rec_dir = os.path.join("output", "rec")
    if not os.path.exists(output_rec_dir):
        os.makedirs(output_rec_dir)
    n_imgs = len(imgs)
    ypixels, xpixels = imgs[0].shape[0], imgs[0].shape[1]
    for ind_query in range(n_imgs):
        # Find top-k closest image feature vectors to each vector
        print("[{}/{}] Plotting similar image recommendations for: {}".format(
            ind_query + 1, n_imgs, filename_heads[ind_query]))
        distances, indices = knn.predict(np.array([X[ind_query]]))
        distances = distances.flatten()
        indices = indices.flatten()
        indices, distances = find_topk_unique(indices, distances, n_neighbours)
        # Plot recommendations
        rec_filename = os.path.join(output_rec_dir, "{}_rec.png".format(filename_heads[ind_query]))
        x_query_plot = imgs[ind_query].reshape((-1, ypixels, xpixels, 3))
        x_answer_plot = imgs[indices].reshape((-1, ypixels, xpixels, 3))
        plot_query_answer(x_query=x_query_plot,
                          x_answer=x_answer_plot[1:],  # remove itself
                          filename=rec_filename)
    # ===========================
    # Plot tSNE
    # ===========================
    output_tsne_dir = os.path.join("output")
    if not os.path.exists(output_tsne_dir):
        os.makedirs(output_tsne_dir)
    tsne_filename = os.path.join(output_tsne_dir, "tsne.png")
    print("Plotting tSNE to {}...".format(tsne_filename))
    plot_tsne(imgs, X, tsne_filename)
# let's take only the first two columns
X = data.iloc[:, :2].values
y = data.iloc[:, -1]

# convert labels to the integers 0, 1, 2
y = y.apply(lambda x: 0 if x == 'Iris-setosa' else x)
y = y.apply(lambda x: 1 if x == 'Iris-versicolor' else x)
y = y.apply(lambda x: 2 if x == 'Iris-virginica' else x)
y = y.values

n_neighbors = 10

# ======================================
# my kNN
cl = kNN(n_neighbors)
cl.fit(X, y)

# ======================================
# scikit-learn
clf = neighbors.KNeighborsClassifier(n_neighbors, weights='uniform')
clf.fit(X, y)

# Plot decision boundary
h = 0.02  # mesh step size (assumed value; h is not set elsewhere in this snippet)
x_min, x_max = X[:, 0].min() - 1, X[:, 0].max() + 1
y_min, y_max = X[:, 1].min() - 1, X[:, 1].max() + 1
xx, yy = np.meshgrid(np.arange(x_min, x_max, h),
                     np.arange(y_min, y_max, h))
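# A sketch of the usual next step (not in the original snippet): evaluate the
# scikit-learn classifier over the mesh and draw the decision regions.
import matplotlib.pyplot as plt
Z = clf.predict(np.c_[xx.ravel(), yy.ravel()]).reshape(xx.shape)
plt.contourf(xx, yy, Z, alpha=0.3)
plt.scatter(X[:, 0], X[:, 1], c=y, edgecolor='k')
plt.title("kNN decision boundary (k = %d)" % n_neighbors)
plt.show()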
    arrIndex[1] = n
    return arrIndex


# Run kNN algorithm
k = 1
predictedDigits = zeros(testData.shape[0])
digits = zeros(testData.shape[0])
Label_start_stop = zeros((testData.shape[0], 2))
Predicted_start_stop = zeros((testData.shape[0], 2))
Error_start_stop = zeros((testData.shape[0], 2))
for i in range(testData.shape[0]):
    print "Current Test Instance: " + str(i + 1)
    print "test data " + str(testData[i]) + "\nLabels " + str(testLabels[i])
    digits[i] = i
    predictedDigits[i] = kNN(k, trainingData, trainingLabels, testData[i, :])
    print "Predicted " + str(predictedDigits[i])
    arrayIndex_Label = time_startstop(testLabels[i])
    Label_start_stop[i, 0] = arrayIndex_Label[0]
    Label_start_stop[i, 1] = arrayIndex_Label[1]
    arrayIndex_Predicted = time_startstop(predictedDigits[i])
    Predicted_start_stop[i, 0] = arrayIndex_Predicted[0]
    Predicted_start_stop[i, 1] = arrayIndex_Predicted[1]
    Error_start_stop[i, 0] = abs(arrayIndex_Label[0] - arrayIndex_Predicted[0])
    Error_start_stop[i, 1] = abs(arrayIndex_Label[1] - arrayIndex_Predicted[1])
    print "start " + str(arrayIndex_Predicted[0]) + " stop " + str(arrayIndex_Predicted[1])
    # print str(i) + " " + str(arrayIndex_Label[0]) + " " + str(arrayIndex_Label[1]) + " "
    #     + str(arrayIndex_Predicted[0]) + " " + str(arrayIndex_Predicted[1]) + " "
    #     + str(abs(arrayIndex_Label[0] - arrayIndex_Predicted[0]) / 2.0) + " "
    #     + str(abs(arrayIndex_Label[1] - arrayIndex_Predicted[1]) / 2.0)

# plot the predicted label along with the error
f1 = plt.figure()
f2 = plt.figure()