def recommend_movies(movie_query, k_recommendations):
    raw_movies_data = []
    with open('Movie_Recommender/movies_recommendation_data.csv', 'r') as md:
        # Discard the first line (headings)
        next(md)
        # Read the data into memory
        for line in md.readlines():
            data_row = line.strip().split(',')
            raw_movies_data.append(data_row)

    # Prepare the data for use in the knn algorithm by picking
    # the relevant columns and converting the numeric columns
    # to numbers, since they were read in as strings
    movies_recommendation_data = []
    for row in raw_movies_data:
        data_row = list(map(float, row[2:]))
        movies_recommendation_data.append(data_row)

    # Use the KNN algorithm to get the k_recommendations movies
    # that are most similar to the query movie
    recommendation_indices, _ = knn(
        movies_recommendation_data, movie_query,
        k=k_recommendations, distance_fn=euclidean_distance,
        choice_fn=lambda x: None)

    movie_recommendations = []
    for _, index in recommendation_indices:
        movie_recommendations.append(raw_movies_data[index])

    return movie_recommendations
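# ---------------------------------------------------------------------------
# The recommender above consumes a free-standing knn() helper together with a
# euclidean_distance() function, neither of which is shown. The sketch below
# is a minimal assumed implementation, consistent only with how the call site
# uses the return value (a list of (distance, index) pairs plus the result of
# choice_fn); what exactly gets handed to choice_fn is an assumption.
import math


def euclidean_distance(point1, point2):
    # Straight-line distance between two equal-length feature vectors
    return math.sqrt(sum((a - b) ** 2 for a, b in zip(point1, point2)))


def knn(data, query, k, distance_fn, choice_fn):
    # Distance from the query to every example, remembering each index
    distances_and_indices = [
        (distance_fn(example, query), index)
        for index, example in enumerate(data)
    ]
    # Keep the k closest examples
    k_nearest = sorted(distances_and_indices)[:k]
    # Hand the k nearest examples to choice_fn (a no-op lambda above)
    return k_nearest, choice_fn([data[index] for _, index in k_nearest])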
def main():
    x, y = preprocess('student-mat.csv')
    print(x.shape, y.shape)

    # Use 70% of the data for training
    num_training = int(0.7 * len(x))
    num_test = len(x) - num_training
    x_train = x[:num_training]
    y_train = y[:num_training]
    x_test = x[num_training:]
    y_test = y[num_training:]

    # Binarize the labels to 0/1
    # normalizex(x_train, num_training)
    normalizey(y_train, num_training)
    # normalizex(x_test, num_test)
    normalizey(y_test, num_test)

    '''
    import matplotlib.pyplot as plt
    temp0x0 = []
    temp0x1 = []
    temp1x0 = []
    temp1x1 = []
    for i in range(len(x)):
        if y[i]:
            temp1x0.append(x[i][0])
            temp1x1.append(x[i][1])
        else:
            temp0x0.append(x[i][0])
            temp0x1.append(x[i][1])
    plt.plot(temp1x0, temp1x1, 'g^', temp0x0, temp0x1, 'bs')
    plt.axis([0, 21, 0, 21])
    plt.xlabel('x0')
    plt.ylabel('x1')
    plt.show()
    '''

    # Choose the algorithm from the command line
    if sys.argv[1] == 'knn':
        print('KNN')
        for k in range(1, 21):
            hit_cnt, f1score, y_test_pre = knn(k, 2, x_test, y_test, x_train, y_train)
            print('K = ', k, ', hit rate = ', hit_cnt / num_test, 'f1score = ', f1score)
    elif sys.argv[1] == 'svm':
        print('SVM')
        normalize0(y_train)
        normalize0(y_test)
        for C in range(8):
            hit_cnt, f1score, y_test_pre = testRbf(200, x_train, y_train, x_test, y_test)
            print('C = ', C, ', hit rate = ', hit_cnt / num_test, 'f1score = ', f1score)
    else:
        print('Naive Bayes')
        hit_cnt, f1score, y_test_pre = bayes(x_train, y_train, x_test, y_test)
        print('hit rate = ', hit_cnt / num_test, 'f1score = ', f1score)
def main():
    train_file = 'train.txt'
    test_file = 'test.txt'

    bayes_accuracy = naive_bayes(train_file, test_file)
    knn_accuracy = knn(train_file, test_file, k=5)
    dt_accuracy, tree = decision_tree(train_file, test_file)

    with open('output.txt', 'w') as f:
        print_tree(tree, f)
        f.write('\n{}\t{}\t{}\n'.format(round(dt_accuracy, 2),
                                        round(knn_accuracy, 2),
                                        round(bayes_accuracy, 2)))
# Creating model objects
model = args.model
if model == "baseline":
    model_obj = BaseLine(reviews, categories)
elif model == "logreg":
    model_obj = LogReg(reviews)
elif model == "multinomialNB":
    model_obj = NaiveBayes(reviews, "multinomial")
elif model == "lda":
    model_obj = TopicModel(reviews)
elif model == "kNearestNeighbors":
    model_obj = knn(reviews, target)
else:  # put additional models here.
    print("Argument Error: invalid model specified")
    sys.exit()

model_classified = []  # classifications stored here
reviews = []  # resetting reviews list to save memory

# Reading test data into reviews list
if args.invert == "False":
    for classifier in categories:
        with open("spring-" + classifier + ".json") as json_file:
            for line in json_file:
                json_obj = json.loads(line)
                reviews += [(classifier, json_obj)]
KNN_TRAIN = "_SUBMIT_KNN.csv"
KNN_TRAIN_BN = "_SUBMIT_KNN_BN.csv"
RESULTADO = "Prediccion.csv"

bn_transformar(TRAIN, TRAIN_BN)
bn_transformar(TEST, TEST_BN)
ampliar_set(TRAIN, TRAIN_AMPLIADO)
ampliar_set(TRAIN_BN, TRAIN_AMPLIADO_BN)
rf(TRAIN_AMPLIADO, TEST, RF_TRAIN_AMPLIADO)
rf(TRAIN_AMPLIADO_BN, TEST_BN, RF_TRAIN_AMPLIADO_BN)
knn(TRAIN, TEST, KNN_TRAIN)
knn(TRAIN_BN, TEST_BN, KNN_TRAIN_BN)

submits = [KNN_TRAIN, KNN_TRAIN_BN, RF_TRAIN_AMPLIADO_BN, RF_TRAIN_AMPLIADO]
i_mejorPredictor = 0
democratizar(submits, RESULTADO, i_mejorPredictor)

# Report elapsed time as minutes:seconds (Python 2 print statement)
b = timeit.default_timer()
secs = b - a
m, s = divmod(secs, 60)
m = int(m)
s = int(s)
print "Fin digit_recognizer (" + str(m) + ":" + str(s) + ")"
# Set k, tolerance and max iterations
k = 6
tolerance = 0.0001  # only for k-means
max_iterations = 300  # only for k-means

# k-means clustering: get the optimal amount of clusters
dataset = np.delete(dataset, -1, axis=1)  # delete feature column to run k-means clustering
# dataset = dataset[:, :-2]
km = K_Means(k, tolerance, max_iterations)
km.fit(dataset)
create_kmeans_csv(km.classes.items())
print(len(km.classes.items()),
      "clusters created from the k-means algorithm. \n"
      "Check k-means.csv for more information")

# KNN
flag = False
while not flag:
    k = input_k("Enter k to run the KNN algorithm!").response
    if k == 0 or k == "":  # use ==, not 'is', to compare against literals
        flag = True
        print("Didn't execute algorithm")
    else:
        k = int(k)
        print("returned value is:", k)
        predictions = knn(X_train, X_test, k)
        create_knn_csv(X_test)
import numpy as np
from sklearn import datasets
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score

from KNN import knn

# Load the iris dataset and split it 75/25 into train and test sets
df = datasets.load_iris()
X, y = df.data, df.target
X_train, X_test, y_train, y_test = train_test_split(X, y, train_size=0.75, random_state=42)

model = knn()
model.fit(X_train, y_train)
y_preds = model.predict(X_test)

# Model accuracy score
print(f"model accuracy score is {accuracy_score(y_test, y_preds)}")
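# ---------------------------------------------------------------------------
# The contents of KNN.py are not shown. The class below is a hedged, minimal
# sketch of what a fit/predict-style knn class could look like; the default
# k=3, the Euclidean metric, and the majority vote are all assumptions.
from collections import Counter

import numpy as np


class knn:
    def __init__(self, k=3):
        self.k = k

    def fit(self, X, y):
        # k-NN is a lazy learner: training just memorizes the data
        self.X_train = np.asarray(X, dtype=float)
        self.y_train = np.asarray(y)

    def predict(self, X):
        return np.array([self._predict_one(x) for x in np.asarray(X, dtype=float)])

    def _predict_one(self, x):
        # Euclidean distance from x to every training point
        distances = np.linalg.norm(self.X_train - x, axis=1)
        nearest = np.argsort(distances)[:self.k]
        # Majority vote among the k nearest labels
        return Counter(self.y_train[nearest]).most_common(1)[0][0]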
for i in range(folderNum):
    random.shuffle(sequence)
    negFeatureFolders.append([negFeature[j] for j in sequence[:posNum]])
# print(np.array(negFeatureFolders).shape)

for i in range(folderNum):
    # Copy the negative fold so extend() does not mutate negFeatureFolders
    subTrainFeature = list(negFeatureFolders[i])
    subTrainFeature.extend(posFeature)
    subTrainFeature = np.array(subTrainFeature)
    subTrainLabel = list(np.zeros(posNum))
    subTrainLabel.extend(list(np.ones(posNum)))
    subTrainLabel = np.array(subTrainLabel)
    print("===== Bagging round %d =====" % (i + 1))
    print("Positive: %d, Negative: %d" % (list(subTrainLabel).count(1),
                                          list(subTrainLabel).count(0)))
    # print(subTrainFeature.shape)
    # print(subTrainLabel)
    predictedLabel_temp1 = knn(subTrainFeature, subTrainLabel, testFeature, 5)
    predictedLabel_temp2 = decision_Tree(subTrainFeature, subTrainLabel, testFeature)
    predictedLabel_temp3 = adboostDT(subTrainFeature, subTrainLabel, testFeature)
    predictedLabel_temp4 = RandomForest_Classifer(subTrainFeature, subTrainLabel, testFeature)
    predictedLabel_temp5 = svmclassifier(subTrainFeature, subTrainLabel, testFeature, 1.0, 0.015625)
    predictedLabel_temp6 = logistic_regression(subTrainFeature, subTrainLabel, testFeature)
    predictedLabel_voting1.append(predictedLabel_temp1)
    predictedLabel_voting2.append(predictedLabel_temp2)
    predictedLabel_voting3.append(predictedLabel_temp3)
    predictedLabel_voting4.append(predictedLabel_temp4)
    predictedLabel_voting5.append(predictedLabel_temp5)
    predictedLabel_voting6.append(predictedLabel_temp6)
    print("KNN ===== round %d predicted labels:" % (i + 1))
    print(predictedLabel_temp1)
    print("DT ===== round %d predicted labels:" % (i + 1))
def orient(name, filename, model_file, model):
    if name == 'train':
        if model == 'nearest' or model == 'best':
            # KNN is a lazy learner: the "model" is just the pickled training data
            train = pd.read_csv(filename, sep=' ', header=None)
            with open(model_file, 'wb') as file:
                pickle.dump(train, file)
        if model == 'nnet':
            train = pd.read_csv(filename, sep=' ', header=None)
            x_train = train.drop(columns=[0, 1], axis=1)
            y_train = pd.get_dummies(train[1])
            y_columns = y_train.columns
            x_train = x_train.to_numpy()
            y_train = y_train.to_numpy()
            print(x_train.shape[0], 'train samples')
            a = nn(25, 0.001, 0.9)
            (w1, w2, w3, b1, b2, b3) = a.fit(x_train, y_train)
            weights = {
                'w1': w1, 'w2': w2, 'w3': w3,
                'b1': b1, 'b2': b2, 'b3': b3,
                'y_columns': y_columns
            }
            with open(model_file, 'wb') as file:
                pickle.dump(weights, file)
        if model == 'tree':
            dtreemain(name, filename, model_file)

    if name == 'test':
        if model == 'nearest' or model == 'best':
            with open(model_file, 'rb') as file:
                train = pickle.load(file)
            test = pd.read_csv(filename, sep=' ', header=None)
            X_test = test.drop(columns=[0, 1], axis=1).to_numpy()
            y_filenames = test[0]
            y_test = test[1].to_numpy()
            obj = knn(10)
            ypred = obj.predict(train, X_test)
            # Open the output file once instead of reopening it per row
            with open('output.txt', 'w') as f:
                for i in range(len(X_test)):
                    f.write(str(y_filenames[i]) + ' ' + str(ypred[i]) + '\n')
        if model == 'tree':
            dtreemain(name, filename, model_file)
        if model == 'nnet':
            test = pd.read_csv(filename, sep=' ', header=None)
            x_test = test.drop(columns=[0, 1], axis=1)
            y_filenames = test[0]
            y_test = pd.get_dummies(test[1])
            x_test = x_test.to_numpy()
            y_test = y_test.to_numpy()
            print(x_test.shape[0], 'test samples')
            with open(model_file, 'rb') as file:
                new_weights = pickle.load(file)
            w1f = new_weights['w1']
            w2f = new_weights['w2']
            w3f = new_weights['w3']
            b1f = new_weights['b1']
            b2f = new_weights['b2']
            b3f = new_weights['b3']
            y_columns = new_weights['y_columns']
            a = nn(25, 0.001, 0.9)
            y_test_predicted = a.predict(x_test, w1f, w2f, w3f, b1f, b2f, b3f)
            # One-hot encode the winning output unit, then count exact row matches
            zero_one = (y_test_predicted == y_test_predicted.max(axis=1)[:, None]).astype(int)
            diff = (y_test == zero_one).sum(axis=1)
            accuracy = np.count_nonzero(diff == y_test.shape[1])
            print('accuracy ', accuracy / diff.shape[0] * 100)
            with open('Output.txt', 'w') as f:
                for i in range(len(x_test)):
                    f.write(str(y_filenames[i]) + ' ' +
                            str(y_columns[np.argmax(y_test_predicted[i])]) + '\n')
from KNN import knn

trainData = <path to train data>
testData = <path to test data>
categoricalIndices = <list of indices of any non-numerical rows>

test = knn()
test.trainDataProcess(trainData, categoricalIndices)
test.testDataProcess(testData, True)
test.predict(5)
from KNN import knn
from RandomForest import randomforest
from SupportVector import supportvector

# Use a distinct instance name so the imported knn class is not shadowed
knn_model = knn()
knn_model.info()
a = knn_model.kn()

rf = randomforest()
b = rf.rf()

sv = supportvector()
c = sv.svc()

print("Accuracy of KNN is: " + str(a))
print("Accuracy of RandomForest is: " + str(b))
print("Accuracy of SupportVector is: " + str(c))
def get_recommended_movies(self, data, fav_movie):
    """Run the k-nearest neighbors algorithm"""
    recommended_movies = knn(data, fav_movie, k=5)
    return recommended_movies
data_y.append(row[0])

data_x = hp.normalize(data_x)  # Normalize

# ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
# The THING itself
# ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
ratio = 0.40  # Known base %
k = 3  # K
distance_func = 'euclidean'
bound = int(ratio * len(data_y))

# The classification algorithm
my_output_labels = knn(x_train=data_x[:bound],
                       y_train=data_y[:bound],
                       x_test=data_x[bound + 1:],
                       distance=distance_func,
                       k=k)

print('Objective: ' + str(data_y[bound + 1:]))
print('Obtained: ' + str(my_output_labels))

# Confusion matrix
i = 0
tp, tn, fp, fn = 0, 0, 0, 0
for label in data_y[bound + 1:]:
    if label == my_output_labels[i] and label == 'Iris-setosa':  # True Positive
        tp += 1
    elif label == my_output_labels[i] and label == 'Iris-versicolor':  # True Negative
        tn += 1
while True:
    imageResp = urlopen(url)
    imageNp = np.array(bytearray(imageResp.read()), dtype=np.uint8)
    frame = cv2.imdecode(imageNp, -1)

    # Convert frame to grayscale
    gray = cv2.cvtColor(frame, cv2.COLOR_BGR2GRAY)

    # Detect multiple faces in the image
    faces = face_cascade.detectMultiScale(gray, 1.3, 5)

    for face in faces:
        x, y, w, h = face
        # Get the face ROI
        offset = 7
        face_section = frame[y - offset:y + h + offset, x - offset:x + w + offset]
        face_section = cv2.resize(face_section, (100, 100))

        out = knn(trainset, face_section.flatten())

        # Draw the name and rectangle on the original image
        # (cv2.LINE_AA replaces the old OpenCV 2 constant cv2.CV_AA)
        cv2.putText(frame, names[int(out)], (x, y - 10), font, 1, (255, 0, 0), 2,
                    lineType=cv2.LINE_AA)
        cv2.rectangle(frame, (x, y), (x + w, y + h), (0, 255, 0), 2)

    cv2.imshow("Faces", frame)
    if cv2.waitKey(1) & 0xFF == ord('q'):
        break

cv2.destroyAllWindows()
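# ---------------------------------------------------------------------------
# The knn(trainset, sample) helper used in the loop above is not shown. A
# hedged sketch follows; it assumes trainset is an ndarray whose last column
# is an integer person id and whose remaining columns hold a flattened
# 100x100 face, and the choice of k=5 is likewise an assumption.
from collections import Counter

import numpy as np


def knn(trainset, sample, k=5):
    features, labels = trainset[:, :-1], trainset[:, -1]
    # Euclidean distance from the query face to every stored face
    distances = np.linalg.norm(features - sample, axis=1)
    # Majority label among the k nearest stored faces
    nearest_labels = labels[np.argsort(distances)[:k]]
    return Counter(nearest_labels).most_common(1)[0][0]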
import pandas as pd

from DMC import dmc
from KNN import knn
from NN import nn

data = pd.read_csv("iris.csv")
testSet = [[7.2, 3.6, 5.1, 2.5]]
test = pd.DataFrame(testSet)
k = 5

result1, neighbor1 = nn(data, test)
result2, neighbor2 = knn(data, test, k)
result3, neighbor3 = dmc(data, test)

print("\nResults: ")
print("NN\n\tResult: {} - Neighbor: {}".format(result1, neighbor1))
print("KNN\n\tResult: {} - Neighbor: {}".format(result2, neighbor2))
print("DMC\n\tResult: {} - Neighbor (centroid): {}".format(result3, neighbor3))
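# ---------------------------------------------------------------------------
# None of the imported classifiers (nn, knn, dmc) are shown. As one example,
# here is a hedged sketch of a 1-nearest-neighbor nn(data, test): it assumes
# iris.csv keeps the class label in its last column and that the helper
# returns (predicted_label, nearest_feature_vector), matching how the
# results are printed above.
import numpy as np


def nn(data, test):
    features = data.iloc[:, :-1].to_numpy(dtype=float)
    labels = data.iloc[:, -1]
    query = test.to_numpy(dtype=float)[0]
    # Euclidean distance from the query to every stored sample
    distances = np.linalg.norm(features - query, axis=1)
    nearest = int(np.argmin(distances))
    return labels.iloc[nearest], features[nearest]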