class KNeighborsRegressorImpl():
    """Thin wrapper exposing a fit/predict interface over SKLModel.

    Hyperparameters are captured at construction time and only handed to
    the underlying sklearn estimator when fit() is called.
    """

    def __init__(self, n_neighbors=5, weights='uniform', algorithm='auto',
                 leaf_size=30, p=2, metric='minkowski', metric_params=None,
                 n_jobs=None):
        # Stash every constructor argument; the real model is built lazily.
        self._hyperparams = {
            'n_neighbors': n_neighbors,
            'weights': weights,
            'algorithm': algorithm,
            'leaf_size': leaf_size,
            'p': p,
            'metric': metric,
            'metric_params': metric_params,
            'n_jobs': n_jobs}

    def fit(self, X, y=None):
        """Instantiate SKLModel with the stored hyperparameters and fit it.

        Returns self so calls can be chained.
        """
        self._sklearn_model = SKLModel(**self._hyperparams)
        if y is None:
            self._sklearn_model.fit(X)
        else:
            self._sklearn_model.fit(X, y)
        return self

    def predict(self, X):
        """Delegate prediction to the fitted underlying model."""
        return self._sklearn_model.predict(X)
def skcv_knn(coordinates, Xdata, Ydata, number_of_folds, dzradii, k_neighbors, visualization): print("Starting skcv-knn analysis...") # Calculate sorted pairwise distance matrix and indexes performanceTable = np.zeros([len(dzradii),2]) data_distances, data_distance_indexes = distanceMatrix(coordinates) folds = random_folds(len(Ydata), number_of_folds) for rind, dzradius in enumerate(dzradii): print("Analysis ongoing, dead zone radius: " + str(dzradius) + "m / " + str(dzradii[len(dzradii)-1]) + "m") # Calculate dead zone folds dz_folds = dzfolds(dzradius, folds, data_distances, data_distance_indexes) # Initialize performance variables P = np.zeros(Ydata.shape) for fold_id, dz_fold in enumerate(dz_folds): X_tr = np.delete(Xdata, dz_fold, axis=0) Y_tr = np.delete(Ydata, dz_fold, axis=0) learner = KNeighborsRegressor(n_neighbors=k_neighbors) learner.fit(X_tr, Y_tr) preds = learner.predict(Xdata[dz_fold]) if preds.ndim == 0: P[folds[fold_id]] = preds else: P[folds[fold_id]] = preds[0:len(folds[fold_id])] if visualization: # Check for visualization testcoords = coordinates[folds[fold_id],:] dzcoords = coordinates[dz_fold, :] visualize_skcv(coordinates, testcoords, dzcoords, dzradius) perf = cindex(Ydata, P) performanceTable[rind,0] = dzradius performanceTable[rind, 1] = perf plotRes_skcv(performanceTable, rind, number_of_folds, "K-nn") print("Analysis done.") return performanceTable
def evalOne(parameters):
    """Evaluate one k-NN hyperparameter setting with leave-one-location-out CV.

    parameters: dict with keys "weights", "neighbors" and "p", passed to
    KNeighborsRegressor.

    Relies on module-level globals: locations, data, all_features, plus the
    helpers splitDataForXValidation and rmseEval.

    Returns the RMSE (second element of rmseEval's result) computed over
    the predictions pooled across all held-out locations.
    """
    all_obs = []
    all_pred = []
    for location in locations:
        trainX, testX, trainY, testY = splitDataForXValidation(
            location, "location", data, all_features, "target")
        model = KNeighborsRegressor(weights=parameters["weights"],
                                    n_neighbors=parameters["neighbors"],
                                    p=parameters["p"])
        model.fit(trainX, trainY)
        prediction = model.predict(testX)
        all_obs.extend(testY)
        all_pred.extend(prediction)
    return rmseEval(all_obs, all_pred)[1]
def trainNearestNeighbors(data, columns, targetColumn, parameters):
    """Fit a KNeighborsRegressor on all columns except the target.

    data is column-oriented (column name -> list of values); parameters
    supplies the "weights", "neighbors" and "p" hyperparameters.

    Returns a NearestNeighborsModel wrapping the fitted estimator and the
    list of feature columns used.
    """
    modelColumns = [column for column in columns if column != targetColumn]
    # Transpose the column-oriented data into one feature row per sample.
    sample_count = len(data[targetColumn])
    modelData = [[data[column][i] for column in modelColumns]
                 for i in range(sample_count)]
    model = KNeighborsRegressor(weights=parameters["weights"],
                                n_neighbors=parameters["neighbors"],
                                p=parameters["p"])
    model.fit(modelData, data[targetColumn])
    return NearestNeighborsModel(model, modelColumns)
def lposcv_knn(coordinates, Xdata, Ydata, dzradii, k, visualization): print("Starting lposcv-knn analysis...") # Calculate sorted pairwise distance matrix and indexes performanceTable = np.zeros([len(dzradii), 2]) data_distances, data_distance_indexes = distanceMatrix(coordinates) negcount = len(np.where(Ydata == 0)[0]) folds = lpo_folds(Ydata, negcount) for rind, dzradius in enumerate(dzradii): print("Analysis ongoing, dead zone radius: " + str(dzradius) + "m / " + str(dzradii[len(dzradii) - 1]) + "m") # Calculate dead zone folds dz_folds = dzfolds(dzradius, folds, data_distances, data_distance_indexes) perfs = [] for dz_fold in dz_folds: X_tr = np.delete(Xdata, dz_fold, axis=0) Y_tr = np.delete(Ydata, dz_fold, axis=0) learner = KNeighborsRegressor(n_neighbors=k) learner.fit(X_tr, Y_tr) preds = learner.predict(Xdata[dz_fold]) if preds[0] > preds[1]: perfs.append(1.) elif preds[0] == preds[1]: perfs.append(0.5) else: perfs.append(0.) if visualization: # Check for visualization testcoords = coordinates[dz_fold[0:2], :] dzcoords = coordinates[dz_fold, :] visualize_lposcv(coordinates, testcoords, dzcoords, dzradius) perf = np.mean(perfs) performanceTable[rind, 0] = dzradius performanceTable[rind, 1] = perf plotRes_lposcv(performanceTable, rind, len(folds), 'knn') print("Analysis done.") return performanceTable
# Leave-one-location-out k-NN predictions, written to OUTPUT_DATA_FILE as CSV.
# Relies on module-level names: loadData, locations, splitDataForXValidation,
# KNeighborsRegressor, deepcopy, OUTPUT_DATA_FILE.

# load the data
data = {}
columns = []
loadData("/data/york_hour_2013.csv", ["timestamp", "atc"], data, columns)

# Feature set = every loaded column except the prediction target and the
# location identifier (which is used as the cross-validation key).
all_features = deepcopy(columns)
all_features.remove("target")
all_features.remove("location")

# 'with' guarantees the output file is closed even if a model fit fails
# (the original left the handle open on any exception).
with open(OUTPUT_DATA_FILE, 'w') as output:
    output.write("location,observation,prediction\n")
    for location in locations:
        trainX, testX, trainY, testY = splitDataForXValidation(
            location, "location", data, all_features, "target")
        model = KNeighborsRegressor(weights="distance", n_neighbors=23, p=2.0)
        model.fit(trainX, trainY)
        prediction = model.predict(testX)
        # One CSV row per held-out observation.
        for observed, predicted in zip(testY, prediction):
            output.write(str(location))
            output.write(",")
            output.write(str(observed))
            output.write(",")
            output.write(str(predicted))
            output.write("\n")
def predict_personality(test_path):
    """Predict Big-Five personality scores for the users under test_path.

    Reads training profiles (/Profile/Profile.csv) and per-user status
    texts (/text/<userid>.txt) from the hard-coded train folder, builds
    TF-IDF features, and fits one KNeighborsRegressor per trait
    (ope, con, ext, agr, neu) on the 50 best features (SelectKBest with
    f_regression).

    Returns a pandas DataFrame with columns
    ['userid', 'ope', 'con', 'ext', 'agr', 'neu'].
    """
    train_path = '/data/train/'
    # Fixed: the original used Python 2 print statements, which are syntax
    # errors under Python 3 (the rest of this file already uses print()).
    print('train folder is: ', train_path)

    # Map userid -> [ope, con, ext, agr, neu] from the training profile CSV.
    personality_dic = {}
    profile_path = train_path + '/Profile/Profile.csv'
    with open(profile_path) as csvfile:
        reader = csv.DictReader(csvfile)
        for row in reader:
            personality_dic[row['userid']] = [
                float(row['ope']), float(row['con']), float(row['ext']),
                float(row['agr']), float(row['neu'])]

    # Collect training texts, skipping users without personality data.
    train_userids = []
    train_texts = []
    train_personalities = []
    train_filepath = train_path + '/text'
    train_files = [f for f in listdir(train_filepath) if f.endswith(".txt")]
    for name in train_files:
        # 'with' closes the handle even if read() fails.
        with io.open(train_filepath + '/' + name, 'r', encoding='latin-1') as f:
            text = f.read()
        userid = name.replace('.txt', '')
        if userid in personality_dic:
            train_userids.append(userid)
            train_texts.append(text)
            train_personalities.append(personality_dic[userid])

    # Collect testing texts (all users under test_path).
    test_userids = []
    test_texts = []
    test_filepath = test_path + '/text'
    test_files = [f for f in listdir(test_filepath) if f.endswith(".txt")]
    for name in test_files:
        with io.open(test_filepath + '/' + name, 'r', encoding='latin-1') as f:
            text = f.read()
        test_userids.append(name.replace('.txt', ''))
        test_texts.append(text)

    # TF-IDF features: fit on the training corpus, reuse for test.
    tfidf_vectorizer = TfidfVectorizer(min_df=0.005, max_df=0.3,
                                       ngram_range=(1, 3),
                                       stop_words='english', use_idf=True,
                                       tokenizer=tokenize)
    trainX = tfidf_vectorizer.fit_transform(train_texts)
    print("Training data: n_samples: %d, n_features: %d" % trainX.shape)
    testX = tfidf_vectorizer.transform(test_texts)
    print("Testing data: n_samples: %d, n_features: %d" % testX.shape)

    # get feature names extracted by tfidf_vectorizer
    # NOTE(review): get_feature_names() was removed in scikit-learn 1.2 in
    # favour of get_feature_names_out() — confirm the pinned sklearn version.
    feature_names = tfidf_vectorizer.get_feature_names()
    print(feature_names)

    result = {}
    result['userid'] = test_userids
    traits = ['ope', 'con', 'ext', 'agr', 'neu']
    for x, trait in enumerate(traits):
        # Column x of the personality matrix is the target for this trait.
        y_train = np.array(train_personalities)[:, x]
        # Reduce to the 50 most predictive features for this trait.
        nkb_features = 50
        skb = SelectKBest(score_func=f_regression, k=nkb_features)
        X_train = skb.fit_transform(trainX, y_train)
        X_test = skb.transform(testX)
        # train using KNNRegressor
        knn = KNeighborsRegressor(n_neighbors=500)
        knn.fit(X_train, y_train)
        result[trait] = knn.predict(X_test)

    result_df = pd.DataFrame(result,
                             columns=['userid', 'ope', 'con', 'ext', 'agr', 'neu'])
    return result_df
from ex30.ex30_lib_graph import plot2
# Fixed: sklearn.neighbors.regression was a private module path removed in
# scikit-learn 0.24; import from the public package instead.
from sklearn.neighbors import KNeighborsRegressor

OUTPUT_PNG_FILE = '/experiments/ex30/ex30_knn.png'

# One observed pollution level per hour of the day (24 samples).
X = [[float(x)] for x in range(0, 24)]
Y = [
    12.0, 13.0, 13.0, 13.0, 28.0, 31.0, 38.0, 60.0, 85.0, 80.0, 64.0, 60.0,
    59.0, 58.0, 65.0, 70.0, 80.0, 90.0, 110.0, 100.0, 85.0, 65.0, 45.0, 20.0
]

# Dense query grid 0.0, 0.1, ..., 23.0 for a finer prediction curve.
X2 = [[float(x) / 10.0] for x in range(0, 231)]

# 1-nearest-neighbour regression with Manhattan distance (p=1).
model = KNeighborsRegressor(n_neighbors=1, p=1)
model.fit(X, Y)
Y_pred = model.predict(X2)
print(str(Y_pred))

plot2(Y, Y_pred, OUTPUT_PNG_FILE,
      "Observed pollution concentration levels",
      "Predicted pollution concentration levels by NNR")