def fit_KNeighbors(features_train, labels_train, features_pred, n_neighbors=5):
    """Fit a K-nearest-neighbours regressor and predict for features_pred.

    Args:
        features_train: training feature matrix.
        labels_train: training targets.
        features_pred: feature matrix to produce predictions for.
        n_neighbors: number of neighbours used by the regressor.

    Returns:
        Array of predictions for features_pred.
    """
    model = KNeighborsRegressor(n_neighbors=n_neighbors)
    model.fit(features_train, labels_train)
    labels_pred = model.predict(features_pred)
    # R^2 computed on the *training* data — an optimistic estimate, kept for
    # parity with the original behaviour.
    score = model.score(features_train, labels_train)
    # Fix: the original used a Python 2 print statement, inconsistent with the
    # print() calls used elsewhere in this file.
    print("KNeighbors - coefficient of determination R^2 of the prediction: ", score)
    return labels_pred
def fill_income(df):
    """Impute missing `monthly_income` values with a 2-NN regression on the
    loan-count features.

    Args:
        df: frame with columns `monthly_income`,
            `number_real_estate_loans_or_lines` and
            `number_of_open_credit_lines_and_loans`.

    Returns:
        A new frame: rows with known income first, imputed rows appended.
    """
    income_imputer = KNeighborsRegressor(n_neighbors=2)
    # notnull()/isnull() instead of the `== False` / `== True` anti-idiom.
    df_w_monthly_income = df[df.monthly_income.notnull()].copy()
    df_w_null_monthly_income = df[df.monthly_income.isnull()].copy()
    cols = ["number_real_estate_loans_or_lines", "number_of_open_credit_lines_and_loans"]
    income_imputer.fit(df_w_monthly_income[cols], df_w_monthly_income.monthly_income)
    new_values = income_imputer.predict(df_w_null_monthly_income[cols])
    df_w_null_monthly_income.loc[:, "monthly_income"] = new_values
    # Fix: DataFrame.append was deprecated and removed in pandas 2.0;
    # pd.concat produces the same stacked result.
    return pd.concat([df_w_monthly_income, df_w_null_monthly_income])
def knnPredictor(df):
    # Search k = 1..199 for the KNN regressor whose test-set predictions have
    # the highest Pearson correlation with the observed values, then refit at
    # the best k and plot predicted vs. measured.
    dataTrainX, dataTrainY, dataTestX, dataTestY = sample(df)
    corelationCoefficiantDictionary = {}
    corelationCoefficiantArray = []
    for k in range(1, 200, 1):
        knnModel = KNeighborsRegressor(n_neighbors=k)
        knnModel.fit(dataTrainX, dataTrainY)
        knnpredicted = knnModel.predict(dataTestX)
        # pearsonr returns (r, p-value); only r is kept.
        corelationCoefficient = pearsonr(dataTestY, knnpredicted)
        corelationCoefficiantDictionary[k] = corelationCoefficient[0]
        corelationCoefficiantArray.append(corelationCoefficient[0])
    # plotter.plot(corelationCoefficiantArray)
    # k whose correlation coefficient is largest.
    bestK = max(corelationCoefficiantDictionary, key=corelationCoefficiantDictionary.get)
    knnModelBest = KNeighborsRegressor(n_neighbors=bestK)
    knnModelBest.fit(dataTrainX, dataTrainY)
    print("K = ")
    print(bestK)
    print("Corelation Coeff:")
    print(corelationCoefficiantDictionary[bestK])
    knnpredictedBest = knnModelBest.predict(dataTestX)
    fig, ax = plotter.subplots()
    # Recompute r for the refitted model (should match the stored value).
    corelationCoefficient = pearsonr(dataTestY, knnpredictedBest)
    print(corelationCoefficient[0])
    ax.set_ylabel('Predicted KNN Weekly')
    ax.scatter(dataTestY, knnpredictedBest)
    ax.set_xlabel('Measured')
    plotter.show()
def predictKnn(data, priceToPredict):
    # Pick the k in [1, 100) whose KNN closing-price predictions correlate
    # best (Pearson r) with the held-out closing prices, then predict the
    # closing price for the supplied opening price(s).
    corelationCoefficiantDictionary = {}
    corelationCoefficiantArray = []
    openingPriceTrain, openingPriceTest, closingPriceTrain, closingPriceTest = \
        data["openingPriceTrain"], data["openingPriceTest"], data["closingPriceTrain"], data["closingPriceTest"]
    for k in range( 1 , 100 , 1):
        neigh = KNeighborsRegressor(n_neighbors=k) #n = 7 best fits
        neigh.fit(openingPriceTrain, closingPriceTrain)
        # Flatten both series so pearsonr gets 1-D inputs.
        closingPriceTestArray = np.reshape(closingPriceTest,-1)
        knnpr = neigh.predict(openingPriceTest)
        predictedArray = np.reshape(knnpr,-1)
        corelationCoefficient = pearsonr(closingPriceTestArray,predictedArray)
        corelationCoefficiantDictionary[k] = corelationCoefficient[0]
        corelationCoefficiantArray.append(corelationCoefficient[0])
    plotter.plot(corelationCoefficiantArray)
    # plotter.show()
    bestK = max(corelationCoefficiantDictionary, key=corelationCoefficiantDictionary.get)
    neighBest = KNeighborsRegressor(n_neighbors=bestK)
    neighBest.fit(openingPriceTrain, closingPriceTrain)
    # NOTE(review): predict() expects a 2-D (n_samples, n_features) array;
    # np.array([priceToPredict]) is only 2-D if priceToPredict is itself a
    # sequence — confirm what callers pass.
    openingPriceToPredict = np.array([priceToPredict])
    print("K = ")
    print(bestK)
    print(neighBest.predict(openingPriceToPredict))
class PersonalityPredictor(object):
    """Predicts Big-Five (OCEAN) trait scores from user feature vectors using
    one KNeighborsRegressor per trait."""

    def __init__(self, nn):
        # nn: number of neighbours shared by every per-trait regressor.
        self.nn = nn
        self.o_clf = KNeighborsRegressor(n_neighbors=self.nn)
        self.c_clf = KNeighborsRegressor(n_neighbors=self.nn)
        self.e_clf = KNeighborsRegressor(n_neighbors=self.nn)
        self.a_clf = KNeighborsRegressor(n_neighbors=self.nn)
        self.n_clf = KNeighborsRegressor(n_neighbors=self.nn)
        self.features = []
        self.o_value = []
        self.c_value = []
        self.e_value = []
        self.a_value = []
        self.n_value = []

    def register(self, data):
        """Collect feature vectors and trait scores from `data`, a mapping of
        user_id -> record with keys 'f' (features) and 'o'/'c'/'e'/'a'/'n'.

        NOTE(review): the original breaks out of the loop at the first record
        lacking an 'f' key, skipping all remaining users; behaviour preserved
        — confirm whether `continue` was intended.
        """
        for user_id in data:
            if 'f' in data[user_id]:
                self.o_value.append(self.make_float(data[user_id]['o']))
                self.c_value.append(self.make_float(data[user_id]['c']))
                self.e_value.append(self.make_float(data[user_id]['e']))
                self.a_value.append(self.make_float(data[user_id]['a']))
                self.n_value.append(self.make_float(data[user_id]['n']))
                self.features.append(data[user_id]['f'])
            else:
                break

    def make_float(self, value):
        """Coerce a score to float, stripping non-numeric characters
        (currency symbols, commas, ...) from strings first."""
        # Fix: `basestring` exists only in Python 2; `str` is the Python 3
        # equivalent for this check.
        if isinstance(value, str):
            return float(re.sub("[^0-9.]", "", value))
        return float(value)

    def train(self):
        """Normalize the collected features and fit all five regressors."""
        self.features = normalize(self.features)
        self.o_clf.fit(self.features, self.o_value)
        self.c_clf.fit(self.features, self.c_value)
        self.e_clf.fit(self.features, self.e_value)
        self.a_clf.fit(self.features, self.a_value)
        self.n_clf.fit(self.features, self.n_value)

    def predict(self, features):
        """Return a dict of predicted trait scores for one feature vector."""
        o = self.o_clf.predict([features]).tolist()[0]
        c = self.c_clf.predict([features]).tolist()[0]
        e = self.e_clf.predict([features]).tolist()[0]
        a = self.a_clf.predict([features]).tolist()[0]
        n = self.n_clf.predict([features]).tolist()[0]
        return { 'o': o, 'c': c, 'e': e, 'a': a, 'n': n }
def add_geo(self):
    """Attach ZIP-derived features to the train/test frames by looking up the
    nearest (Lat, Long) ZIP centroid with a 1-NN regressor and merging the
    interpolated columns in."""
    derived = ['TaxReturnsFiled', 'EstimatedPopulation', 'EstWages', 'DependencyRatio']
    lookup = KNeighborsRegressor(n_neighbors=1)
    lookup.fit(self.zip[['Lat', 'Long']].values, self.zip[derived].values)
    for frame in (self.train, self.test):
        feats = lookup.predict(frame[['latitude', 'longitude']].values)
        self.merge(pandas.DataFrame(feats, columns=derived), frame)
def predict_missing_data_for_column(features,missing_column,params,num_boost_round,test_size,train_file_name,test_file_name):
    # Fill NaNs in `missing_column` of the module-level `train`/`test` frames
    # by fitting a KNN regressor (default params) on all rows — from both
    # frames — where the column is present.
    # NOTE(review): params/num_boost_round/test_size and the file-name args are
    # leftovers from the commented-out XGBoost variant below and are unused.
    print("## Train a XGBoost model for filling missing column : " + str(missing_column))
    X_missing_data_train = train[train[missing_column].isnull()]
    X_missing_data_test = test[test[missing_column].isnull()]
    X_data_train = train[np.isfinite(train[missing_column])]
    X_data_test = test[np.isfinite(test[missing_column])]
    X_data = pd.concat([X_data_train,X_data_test])
    # Shuffle the combined complete rows.
    X_data = X_data.iloc[np.random.permutation(len(X_data))]
    #print(X_missing_data[missing_column])
    #print(X_data[missing_column])
    y_data = X_data[missing_column]
    # -- Replacing with KNN --
    # dtrain = xgb.DMatrix(X_train[features], y_train)
    # dvalid = xgb.DMatrix(X_valid[features], y_valid)
    #
    # watchlist = [(dtrain, 'train'),(dvalid, 'eval')]
    # fgbm = xgb.train(params, dtrain, num_boost_round, evals=watchlist, early_stopping_rounds=50, verbose_eval=True)
    #
    # print("## Predicting missing data for column : " + str(missing_column))
    #
    # if not X_missing_data_train.empty:
    #     fpreds = fgbm.predict(xgb.DMatrix(X_missing_data_train[features]),ntree_limit=fgbm.best_ntree_limit)
    #     train.loc[train[missing_column].isnull(),missing_column] = fpreds
    #
    # if not X_missing_data_test.empty:
    #     fpreds = fgbm.predict(xgb.DMatrix(X_missing_data_test[features]),ntree_limit=fgbm.best_ntree_limit)
    #     test.loc[test[missing_column].isnull(),missing_column] = fpreds
    # --------------------------
    ngbr = KNeighborsRegressor()
    ngbr.fit(X_data[features], y_data)
    # print("## Predicting missing data for column : " + str(missing_column))
    # Write predictions back into the frames in place.
    if not X_missing_data_train.empty:
        fpreds = ngbr.predict(X_missing_data_train[features])
        train.loc[train[missing_column].isnull(),missing_column] = fpreds
    if not X_missing_data_test.empty:
        fpreds = ngbr.predict(X_missing_data_test[features])
        test.loc[test[missing_column].isnull(),missing_column] = fpreds
    # train.to_csv(train_file_name, index=False)
    # test.to_csv(test_file_name, index=False)
# Visual separator between sections of console output.
print("##########################################################################################################################")
def neighbors_model(x_train, y_train, x_test, x_valid, cache_name, use_cache=False):
    """25-NN regressor trained on log targets; returns exp'd predictions for
    the test and validation sets, caching the results with pickle.

    Args:
        x_train, y_train: training data (y must be strictly positive).
        x_test, x_valid: feature matrices to predict for.
        cache_name: path of the pickle cache file.
        use_cache: if True, load and return previously cached predictions.

    Returns:
        (test_pred, valid_pred)
    """
    if use_cache:
        # Fix: pickle requires binary file mode ('rb'/'wb'); text mode breaks
        # under Python 3. Context managers also guarantee the handles close.
        with open(cache_name, 'rb') as fhand:
            data_dict = pickle.load(fhand)
        return data_dict['test_pred'], data_dict['valid_pred']
    model = KNeighborsRegressor(n_neighbors=25)
    # Kept for parity with the original, although KNN fitting is deterministic.
    np.random.seed(seed=123)
    model.fit(x_train, np.log(y_train))
    test_pred = np.exp(model.predict(x_test))
    valid_pred = np.exp(model.predict(x_valid))
    data_dict = {'test_pred': test_pred, 'valid_pred': valid_pred}
    with open(cache_name, 'wb') as fhand:
        pickle.dump(data_dict, fhand)
    return test_pred, valid_pred
def kNN(X_train, y_train, X_test, y_test, uselog=False):
    '''
    Standard-scale the features, fit a 550-NN regressor and predict X_test.

    :param X_train: training features
    :param y_train: training targets
    :param X_test: test features
    :param y_test: unused; kept for signature compatibility
    :param uselog: if True, return log(1 + p) of each prediction
    :return: predictions (array, or list when uselog is True)
    '''
    scaler = StandardScaler()
    # Fix: Python 3 print functions (original used Python 2 print statements).
    print(X_train.shape)
    print(X_test.shape)
    X = scaler.fit_transform(X_train)
    test = scaler.transform(X_test)
    clf = KNeighborsRegressor(n_neighbors=550)
    clf.fit(X, y_train)
    result = clf.predict(test)
    if uselog:
        # Fix: a bare map() is a lazy iterator on Python 3, which would
        # silently change the return type; the comprehension keeps the
        # Python 2 list behaviour.
        result = [math.log(1 + x) for x in result]
    return result
def transform(self, X, y=None):
    """Impute NaNs in X in place with per-row KNN regression.

    For every row containing NaNs, each missing dimension is predicted by a
    self.k-NN regressor fit on the complete rows, using that row's observed
    dimensions as features.

    :param X: 2-D numpy array, possibly containing NaNs.
    :param y: ignored (scikit-learn transformer signature).
    :return: X with every NaN replaced by an estimate.
    """
    rows, features = X.shape
    # One flag per row: does the row contain at least one NaN?
    row_has_nan = np.isnan(X).any(axis=1)
    criteria_for_bad = np.where(row_has_nan)[0]
    criteria_for_good = np.where(~row_has_nan)[0]
    X_bad = X[criteria_for_bad]
    X_good = X[criteria_for_good]
    knn = KNeighborsRegressor(n_neighbors=self.k)
    for idx, x_bad in zip(criteria_for_bad.tolist(), X_bad):
        missing = np.isnan(x_bad)
        bad_dim = np.where(missing)[0]
        good_dim = np.where(~missing)[0]
        for d in bad_dim:
            knn.fit(X_good[:, good_dim], X_good[:, d])
            # Fix: predict() expects a 2-D (n_samples, n_features) array; the
            # original passed the 1-D x_bad[good_dim], which modern
            # scikit-learn rejects. [0] unwraps the single prediction.
            X[idx, d] = knn.predict(x_bad[good_dim].reshape(1, -1))[0]
    return X
def __init__(self,dataFrame):
    """Build normalized KNN features from a daily price DataFrame, fit a
    distance-weighted 3-NN model on rows 100:400, and print de-normalized
    predictions vs. actuals for rows 400:450.

    :param dataFrame: frame with 'High Price', 'Low Price', 'Average Price',
        'Close Price', 'Prev Close', 'Deliverable Qty' and
        'Turnover in Lacs' columns.
    """
    self.dataFrameKNN = {}
    # Per-feature weightage applied by normalizeKNNModel (defined elsewhere).
    self.KNNWeightage = {'Avg-High Ratio':100,'Avg-Low Ratio':100,'Deliverable Qty':300,'Turnover':100,'Growth':150,'Trend':100,'Output':100}
    self.valid = True
    self.KNNModelHash = {}
    self.dataFrameKNN = pd.DataFrame()
    # Derived features; row 0 is dropped because 'Prev Close'-based deltas
    # need a prior day.
    self.dataFrameKNN['Avg-High Ratio'] = dataFrame['High Price'][1:] - dataFrame['Average Price'][1:]
    self.dataFrameKNN['Avg-Low Ratio'] = dataFrame['Average Price'][1:] - dataFrame['Low Price'][1:]
    self.dataFrameKNN['Deliverable Qty'] = dataFrame['Deliverable Qty'][1:]
    self.dataFrameKNN['Turnover'] = dataFrame['Turnover in Lacs'][1:]
    self.dataFrameKNN['Growth'] = dataFrame['Close Price'][1:]-dataFrame['Prev Close'][1:]
    self.dataFrameKNN['Trend'] = dataFrame['Turnover in Lacs'][1:]
    self.dataFrameKNN['Output'] = dataFrame['High Price'][1:]-dataFrame['Prev Close'][1:]
    # Remember the output statistics so predictions can be de-normalized.
    self.KNNModelHash['mean'] = self.dataFrameKNN['Output'].mean()
    self.KNNModelHash['std'] = self.dataFrameKNN['Output'].std()
    for key in self.dataFrameKNN:
        self.normalizeKNNModel(key)
    #trainData has the data to be trained, but the last data is the testData
    trainData = self.dataFrameKNN[['Avg-High Ratio','Avg-Low Ratio','Deliverable Qty','Growth']][:-1].values
    testData = self.dataFrameKNN[['Avg-High Ratio','Avg-Low Ratio','Deliverable Qty','Growth']][-1:].values
    #trainOutput contains the output corresponding to train Data but the first one is garbage
    trainOutput = self.dataFrameKNN['Output'][1:].values
    KNNModel = KNeighborsRegressor(n_neighbors=3,weights = 'distance')
    KNNModel.fit(trainData[100:400],trainOutput[100:400])
    prediction = KNNModel.predict(trainData[400:450])
    weightage = self.KNNWeightage['Output']
    for i in range(50):
        # De-normalize both prediction and actual for display.
        prediction[i] = ((prediction[i]*self.KNNModelHash['std'])+self.KNNModelHash['mean'])/weightage
        trainOutput[400+i] = ((trainOutput[400+i]*self.KNNModelHash['std'])+self.KNNModelHash['mean'])/weightage
        # Fix: Python 3 print function (original used a Python 2 statement).
        print("%-40s %-40s " %(prediction[i],trainOutput[400+i]))
def calc_linear_regression(reg_training_path):
    """Cross-validate a 10-NN rating regressor on the blended-feature file.

    Runs 5-fold (unshuffled) CV, printing each fold's predictions and the
    accumulated RMSE, and returns the accumulated RMSE divided by the number
    of folds.
    """
    dataset = read_reg_train_data(reg_training_path)
    n_folds = 5
    column_names = ['cf_item', 'cf_user', 'svd', 'content_item', 'actual_rating']
    accumulated_rmse = 0
    fold_no = 0
    for train_indices, test_indices in KFold(n=len(dataset), n_folds=n_folds, shuffle=False):
        fold_no += 1
        test_records = [dataset[i] for i in test_indices]
        train_frame = get_data_frame([dataset[i] for i in train_indices])
        test_frame = get_data_frame(test_records)
        train_frame.columns = column_names
        test_frame.columns = column_names
        # Target column is split off before fitting.
        train_targets = train_frame['actual_rating']
        train_frame = train_frame.drop('actual_rating', axis=1)
        test_frame = test_frame.drop('actual_rating', axis=1)
        model = KNeighborsRegressor(n_neighbors=10)
        model.fit(train_frame, train_targets)
        predict_set = model.predict(test_frame)
        print(predict_set)
        # Record index 4 holds the actual rating.
        accumulated_rmse += mean_squared_error([rec[4] for rec in test_records],
                                               [rec for rec in predict_set]) ** 0.5
        print("Fold (%d) finished with accumulated RMSE of (%f) (%s)" % (fold_no, accumulated_rmse, time.strftime('%y_%m_%d_%H_%M_%S')))
    return accumulated_rmse / float(n_folds)
def smooth(self, X, y):
    # Smooth y over X by fitting a 20-nearest-neighbour regression and
    # evaluating it back at the same sample points.
    regressor = KNeighborsRegressor(n_neighbors=20)
    samples = X.reshape(-1, 1)
    regressor.fit(samples, y)
    return regressor.predict(samples)
def knn_model(train, y_train, test):
    """Distance-weighted 10-NN regression on all cores; negative predictions
    are clamped to zero before being returned."""
    regressor = KNeighborsRegressor(n_neighbors=10, weights='distance', n_jobs=-1)
    regressor.fit(train, y_train)
    predictions = regressor.predict(test)
    predictions[predictions < 0] = 0
    return predictions
def run_network(mdl=None, data=None):
    """Train (or reuse) a traffic model and predict on the held-out set.

    Returns (model, y_test, predictions); predictions is 0 when training is
    interrupted from the keyboard.
    """
    started_at = time.time()
    seq_len = 10
    if data is None:
        print('Loading data... ')
        X_train, y_train, X_test, y_test = train_test_traffic_data(15773, seq_len)
    else:
        X_train, y_train, X_test, y_test = data
    print('\nData Loaded...\n')
    if mdl is None:
        # Default model: distance-weighted 5-NN regression.
        mdl = KNeighborsRegressor(5, weights='distance')
    try:
        mdl.fit(X_train, y_train)
        predictions = mdl.predict(X_test)
    except KeyboardInterrupt:
        print('Training duration (s) : ', time.time() - started_at)
        return mdl, y_test, 0
    print('Training duration (s) : ', time.time() - started_at)
    return mdl, y_test, predictions
def fit(self, start_date, end_date):
    """Fit one KNN model per ticker, choosing n_neighbors by MSE on a
    chronological train/CV split of [start_date, end_date].

    Returns:
        self, with self.models[ticker] holding the best regressor found.
    """
    for ticker in self.tickers:
        self.stocks[ticker] = Stock(ticker)
    params_svr = [{ 'n_neighbors': [2, 5, 10, 15]}]
    params = ParameterGrid(params_svr)
    # Find the split for training and CV.
    mid_date = train_test_split(start_date, end_date)
    for ticker, stock in self.stocks.items():
        X_train, y_train = stock.get_data(start_date, mid_date, fit=True)
        X_cv, y_cv = stock.get_data(mid_date, end_date)
        lowest_mse = np.inf
        for i, param in enumerate(params):
            knn = KNeighborsRegressor(**param)
            knn.fit(X_train.values, y_train.values)
            mse = mean_squared_error(y_cv, knn.predict(X_cv.values))
            if mse <= lowest_mse:
                # Fix: the original never updated lowest_mse, so every
                # candidate passed the comparison and the *last* grid entry
                # always won regardless of its error.
                lowest_mse = mse
                self.models[ticker] = knn
    return self
def opt_ex1():
    """Photometric-redshift fit: distance-weighted 12-NN regression on the
    training magnitudes, then a fitted-vs-true scatter plot for the test set.

    Uses module-level mag_train/z_train/mag_test/z_test.
    """
    from sklearn.neighbors import KNeighborsRegressor
    model = KNeighborsRegressor(12, weights='distance')
    model.fit(mag_train, z_train)
    # Predictions for both samples.
    z_fit_train = model.predict(mag_train)
    z_fit = model.predict(mag_test)
    # Per-sample RMS deviations (training value computed but only the test
    # value is displayed, matching the original behaviour).
    rms_train = np.mean(np.sqrt((z_fit_train - z_train) ** 2))
    rms_test = np.mean(np.sqrt((z_fit - z_test) ** 2))
    plt.scatter(z_test, z_fit, color='k', s=0.1)
    plt.plot([-0.1, 6], [-0.1, 6], ':k')
    plt.text(0.04, 5, "rms = %.3f" % (rms_test))
    plt.xlabel('$z_{true}$')
    plt.ylabel('$z_{fit}$')
def main(featureFile, outputfolder):
    """Predict 'Correct First Attempt' with a 10-NN regressor and write the
    per-student results plus the RMSE to outputfolder.

    :param featureFile: tab-separated feature file readable by pandas.
    :param outputfolder: destination path for the tab-separated results.
    """
    with open(featureFile, 'r') as csvfile:
        my_data = pd.read_csv(csvfile, delimiter="\t", low_memory=False)
    random_indices = permutation(my_data.index)
    # how many rows do we want in our test set?
    test_cutoff = math.floor(len(my_data)/3)
    # NOTE(review): `test` is the *entire* dataset and test_cutoff only trims
    # the training rows, so train and test overlap — confirm this is intended.
    test = my_data
    # Generate the training set with the rest of the data.
    train = my_data.loc[random_indices[test_cutoff:]]
    # Fix: the original built x_columns from boolean expressions such as
    # "Row" == "1" (always False -> 0), so every "feature" was column 0.
    # Use the intended feature column names directly.
    # NOTE(review): several of these columns look categorical; they are
    # presumably encoded upstream — verify before relying on the fit.
    x_columns = ["Row", "Student ID", "Problem Hierarchy", "Problem Name",
                 "Problem View", "Step Name", "KC(Default)", "Opportunity (Default)"]
    # y column shows the predicted feature, in this case the correct first attempt.
    y_column = ["Correct First Attempt"]
    # Look at the ten closest neighbors, to offset potential noise in the data.
    knn = KNeighborsRegressor(n_neighbors=10)
    knn.fit(train[x_columns], train[y_column])
    # Make point predictions on the test set using the fit model.
    predictions = knn.predict(test[x_columns])
    actual = test[y_column]
    result = test[['Anon Student Id','Correct First Attempt']]
    result.to_csv(outputfolder, sep='\t')
    # Compute the root mean squared error of our predictions.
    rmse = math.sqrt((((predictions - actual) ** 2).sum()) / len(predictions))
    print('RMSE=')
    print(rmse)
def apply_knn():
    """Fit a default KNeighborsRegressor on the module-level (Xtr, Ytr) and
    score it on (Xte, Yte).

    Returns:
        (predictions, mse) for the test set.
    """
    model = KNeighborsRegressor()
    model.fit(Xtr, Ytr)
    predictions = model.predict(Xte)
    mse = mean_squared_error(Yte, predictions)
    return predictions, mse
def Round2(X, y):
    """Search a list of n_neighbors settings with 5-fold CV and return the
    best one.

    Returns:
        dict with 'neighbor' (best k) and 'scores' (its per-fold RMSEs).
    """
    min_score = {}
    for neigh in [5, 10, 20, 50, 100, 200, 500, 1000, 2000, 5000]:
        model = KNeighborsRegressor(n_neighbors=neigh)
        n = len(y)
        # Perform 5-fold cross validation.
        scores = []
        kf = KFold(n, n_folds=5, shuffle=True)
        # Calculate the RMSE for the train/test split of each fold.
        for train_idx, test_idx in kf:
            X_train, X_test = X[train_idx], X[test_idx]
            y_train, y_test = y[train_idx], y[test_idx]
            model.fit(X_train, y_train)
            prediction = model.predict(X_test)
            rmse = np.sqrt(mean_squared_error(y_test, prediction))
            # score = model.score(X_test, y_test)
            scores.append(rmse)
        # Keep the setting with the lowest mean RMSE seen so far.
        if len(min_score) == 0:
            min_score['neighbor'] = neigh
            min_score['scores'] = scores
        elif np.mean(scores) < np.mean(min_score['scores']):
            min_score['neighbor'] = neigh
            min_score['scores'] = scores
        # Fix: Python 3 print functions (original used Python 2 statements).
        print("Neighbors:", neigh)
        print(scores)
        print(np.mean(scores))
    return min_score
def run_kNeighbors(distances, loadings, test_vars,
                   weightings=('uniform',), k_list=(3,)):
    """
    Run K-nearest-neighbours using precomputed distances to create an
    ontological mapping.

    Args:
        distances: square distance matrix to pass to KNeighborsRegressors
        loadings: loading matrix for training
        test_vars: variables to reconstruct
        weightings: (optional) iterable of weightings to pass to KNeighbors
        k_list: iterable of k values to pass to KNeighbors as n_neighbors

    Returns:
        DataFrame of reconstructions, one group of rows per (weighting, k).
    """
    # Fix: the default was k_list=(3) — the plain int 3, not a tuple — so any
    # call relying on the default crashed in `for k in k_list`. (3,) is the
    # intended one-element tuple.
    train_distances = distances.loc[loadings.index, loadings.index]
    test_distances = distances.loc[test_vars, loadings.index]
    to_return = pd.DataFrame()
    for weighting in weightings:
        for k in k_list:
            clf = KNeighborsRegressor(metric='precomputed', n_neighbors=k, weights=weighting)
            clf.fit(train_distances, loadings)
            out = clf.predict(test_distances)
            out = pd.DataFrame(out, columns=loadings.columns)
            out['var'] = test_vars
            out['k'] = k
            out['weighting'] = weighting
            # add neighbors and distances
            neighbors = clf.kneighbors(test_distances)
            out['distances'] = tuple(neighbors[0])
            out['neighbors'] = tuple(test_distances.columns[neighbors[1]])
            to_return = pd.concat([to_return, out], sort=False)
    return to_return
def predictDayType(self, week, day):
    """Predict the day type for (week, day) with a 5-NN regressor fit on the
    stored raw observations.

    :param week: week number feature.
    :param day: day feature.
    :return: array of predictions (length 1).
    """
    knn = KNeighborsRegressor(n_neighbors=5)
    knn.fit(self.rawData, self.dayType)
    # Fix: predict() requires a 2-D (n_samples, n_features) array; the
    # original passed the 1-D [week, day], which scikit-learn rejects.
    X = np.array([[week, day]])
    predictions = knn.predict(X)
    return predictions
def nnVerify_2(city_data, x, y):
    """
    Using SKLearn's KNeighborsRegressor: fit a 2-NN regressor on the whole
    city dataset and print the prediction for x. (y is unused; kept for
    signature compatibility.)
    """
    features, target = city_data.data, city_data.target
    model = KNeighborsRegressor(n_neighbors=2)
    model.fit(features, target)
    y_pred = model.predict(x)
    print("KNeighborsRegressor")
    print("Y pred(KNN) : ", y_pred)
def main():
    """Colour transfer: map the palette of `name_from` onto `name_to` via
    optimal transport on subsampled pixels, extended to all pixels with a
    KNN regression, and save the result to `name_out`."""
    # read the images (scaled to [0, 1))
    image_from = io.imread(name_from) / 256
    image_to = io.imread(name_to) / 256
    # change to hsv domain (if requested)
    if args.use_hsv:
        image_from[:] = rgb2hsv(image_from)
        image_to[:] = rgb2hsv(image_to)
    # get shapes
    shape_from = image_from.shape
    shape_to = image_to.shape
    # flatten to (pixels, channels)
    X_from = im2mat(image_from)
    X_to = im2mat(image_to)
    # number of pixels
    n_pixels_from = X_from.shape[0]
    n_pixels_to = X_to.shape[0]
    # subsample n_pixels random pixels from each image
    X_from_ss = X_from[np.random.randint(0, n_pixels_from-1, n_pixels),:]
    X_to_ss = X_to[np.random.randint(0, n_pixels_to-1, n_pixels),:]
    if save_col_distribution:
        import matplotlib.pyplot as plt
        import seaborn as sns
        sns.set_style('white')
        fig, axes = plt.subplots(nrows=2, figsize=(5, 10))
        for ax, X in zip(axes, [X_from_ss, X_to_ss]):
            ax.scatter(X[:,0], X[:,1], color=X)
            # Fix: matplotlib Axes has no `set_xhsvel`/`set_yhsvel` — these
            # were a bad search-and-replace over set_xlabel/set_ylabel.
            if args.use_hsv:
                ax.set_xlabel('hue')
                ax.set_ylabel('value')
            else:
                ax.set_xlabel('red')
                ax.set_ylabel('green')
        axes[0].set_title('distr. from')
        axes[1].set_title('distr. to')
        fig.tight_layout()
        fig.savefig('color_distributions.png')
    # optimal tranportation between the two subsampled pixel clouds
    ot_color = OptimalTransport(X_to_ss, X_from_ss, lam=lam,
                                distance_metric=distance_metric)
    # model transfer: learn the barycentric mapping, then apply to all pixels
    transfer_model = KNeighborsRegressor(n_neighbors=n_neighbors)
    transfer_model.fit(X_to_ss, n_pixels * ot_color.P @ X_from_ss)
    X_transfered = transfer_model.predict(X_to)
    image_transferd = minmax(mat2im(X_transfered, shape_to))
    if args.use_hsv:
        image_transferd[:] = hsv2rgb(image_transferd)
    io.imsave(name_out, image_transferd)
class Knn(ContextEngineBase):
    """Context engine backed by scikit-learn's KNeighborsRegressor."""

    # Most recent prediction array (updated by execute()).
    y_Test = np.empty([0])
    # Knn object
    knnRegressor = None

    def __init__(self, numInputs, outputClassifier, inputClassifiers, appFieldsDict):
        ContextEngineBase.__init__(self, numInputs, outputClassifier, inputClassifiers, appFieldsDict)
        # Passed parameters
        self.n_neighbors = appFieldsDict['n_neighbors']
        self.weights = appFieldsDict['weights']
        self.algorithm = appFieldsDict['algorithm']
        self.n_jobs = appFieldsDict['n_jobs']
        # Defining a Knn object with given parameters
        self.knnRegressor = KNeighborsRegressor(n_neighbors = self.n_neighbors,
                                                weights = self.weights,
                                                algorithm = self.algorithm,
                                                n_jobs = self.n_jobs)

    # Add a set of training observations, with the newInputObsMatrix being a
    # matrix of doubles, where the row magnitude must match the number of inputs,
    # and the column magnitude must match the number of observations.
    # and newOutputVector being a column vector of doubles
    def addBatchObservations(self, newInputObsMatrix, newOutputVector):
        if(len(newInputObsMatrix.shape) == 2 and newInputObsMatrix.shape[1] == self.numInputs and newOutputVector.shape[0] == newInputObsMatrix.shape[0]):
            # print("All good!")
            newOutputVector = newOutputVector.ravel()
            i = 0
            # Delegate row-by-row to addSingleObservation (from the base class).
            for newInputVector in newInputObsMatrix:
                newOutputValue = newOutputVector[i]
                self.addSingleObservation(newInputVector, newOutputValue)
                i += 1
        else:
            print("Wrong dimensions!")

    # Train the coefficients on the existing observation matrix if there are
    # enough observations.
    def train(self):
        if (self.numObservations > 0):
            # print("Training started")
            self.knnRegressor.fit(self.observationMatrix, self.outputVector)
            return True
        else:
            print("Not enough observations to train!")
            return False

    # Execute the trained matrix against the given input observation
    # inputObsVector is a row vector of doubles
    def execute(self, inputObsVector):
        if(len(inputObsVector) == self.numInputs):
            # print("Begin execute")
            #x_Test = np.vstack((self.x_Test,inputObsVector))
            # Reshape into the (1, numInputs) sample matrix predict() expects.
            x_Test = np.reshape(inputObsVector,(1,self.numInputs))
            self.y_Test = self.knnRegressor.predict(x_Test)
            return self.y_Test[0]
        else:
            print("Wrong dimensions, fail to execute")
            return None
def impute_KNN(df, var, features, k):
    """Impute missing values of column `var` with a k-NN regression on the
    `features` columns.

    :param df: source frame.
    :param var: name of the column to impute.
    :param features: feature column names used to fit the regressor.
    :param k: number of neighbours.
    :return: a new frame — complete rows first, imputed rows appended.
    """
    var_imputer = KNeighborsRegressor(n_neighbors=k)
    # Fix: .copy() avoids pandas' SettingWithCopy trap — the original
    # assigned into a view of df, which may silently fail to stick.
    df_full = df[df[var].notnull()].copy()
    df_null = df[df[var].isnull()].copy()
    var_imputer.fit(df_full[features], df_full[var])
    df_null[var] = var_imputer.predict(df_null[features])
    # Fix: DataFrame.append was removed in pandas 2.0; concat is equivalent.
    return pd.concat([df_full, df_null])
class kNN():
    '''
    kNN classifier
    -------------
    Fixed-capacity sliding-window k-nearest-neighbour regressor: keeps the
    most recent `n` (x, y) examples in a ring buffer and predicts with a
    distance-weighted kNN fit on them.
    '''

    def __init__(self,N_i,N_o,k=5,n=20):
        # note: N_o=1 assumed for now
        self.N_i = N_i
        self.n = n
        self.i = 0      # next write position in the ring buffer
        self.k = k
        self.X = zeros((self.n,N_i))
        self.y = zeros((self.n))
        self.h = KNeighborsRegressor(n_neighbors=k,weights='distance')
        self.c = 0      # number of stored examples (saturates at n)

    def predict(self,x):
        '''
        Predict
        --------------
        Returns 0.0 with a warning when no examples are stored; while fewer
        than k examples are stored, falls back to the label of the most
        similar stored example.
        '''
        if self.c < 1.:
            # Fix: Python 3 print function (original used a Python 2
            # print statement, inconsistent with the rest of the file).
            print("[Warning!] No training examples!")
            return 0.0
        elif self.c <= self.k:
            dist,ind = self.h.kneighbors(self.X[0:self.c],n_neighbors=1)
            i_max = argmax(ind)
            return self.y[i_max]
        return self.h.predict(x)

    def update(self, x, y):
        '''
        Update
        --------------
        Insert (x, y) at the current ring-buffer slot and refit the model on
        all stored examples.
        '''
        self.X[self.i,:] = x
        self.y[self.i] = y
        self.i = (self.i + 1) % self.n
        if self.c < self.n:
            self.c = self.c + 1
        self.h.fit(self.X[0:self.c,:], self.y[0:self.c])
def nearest_neighbors_impute(df, coordinate_columns, data_columns, knr_params={}):
    """Fill missing entries of each data column in place by KNN regression
    over the coordinate columns, fit on the rows where that column is present.

    :param df: frame to impute (modified in place).
    :param coordinate_columns: feature columns used as regression inputs.
    :param data_columns: columns whose NaNs should be filled.
    :param knr_params: keyword arguments forwarded to KNeighborsRegressor.
    """
    from sklearn.neighbors import KNeighborsRegressor
    for column in data_columns:
        present = df[column].notnull()
        missing = ~present
        if missing.sum() == 0:
            continue
        model = KNeighborsRegressor(**knr_params)
        model.fit(df.loc[present, coordinate_columns], df.loc[present, [column]])
        df.loc[missing, [column]] = model.predict(df.loc[missing, coordinate_columns])
def addJKRegionLabels(self):
    """Assign jackknife region labels: KMeans-cluster the data positions into
    n_jackknife regions, then give each random point the label of its nearest
    data point via a 1-NN lookup."""
    # Fix: on Python 3, zip() is a one-shot iterator; KMeans/KNN need
    # materialized sequences that can be consumed more than once.
    data = list(zip(self.data['RA'], self.data['DEC']))
    randoms = list(zip(self.randoms['RA'], self.randoms['DEC']))
    finder = KMeans(n_clusters=self.config['n_jackknife'])
    self.data_jk_indices = finder.fit_predict(data)
    nbrs = KNeighborsRegressor(n_neighbors=1)
    nbrs.fit(data, self.data_jk_indices)
    self.random_jk_indices = nbrs.predict(randoms)
def compute_mse(regressor, horizon):
    """Return (model MSE, persistence MSE) for forecasting the tehachapi
    target turbine `horizon` steps ahead with 'linear' or 'knn'."""
    # get wind park and corresponding target. forecast is for the target
    # turbine
    park_id = NREL.park_id['tehachapi']
    windpark = NREL().get_windpark(park_id, 3, 2004, 2005)
    target = windpark.get_target()
    # use power mapping for pattern-label mapping. Feature window length
    # is 3 time steps and time horizon (forecast) is 3 time steps.
    feature_window = 3
    mapping = PowerMapping()
    X = mapping.get_features_park(windpark, feature_window, horizon)
    Y = mapping.get_labels_turbine(target, feature_window, horizon)
    # train roughly for the year 2004.
    train_to = int(math.floor(len(X) * 0.5))
    # test roughly for the year 2005.
    test_to = len(X)
    # train and test only every fifth pattern, for performance.
    train_step, test_step = 5, 5
    if(regressor == 'linear'):
        # fitting the pattern-label pairs
        reg = linear_model.LinearRegression()
        reg = reg.fit(X[0:train_to:train_step], Y[0:train_to:train_step])
        y_hat = reg.predict(X[train_to:test_to:test_step])
    elif(regressor == 'knn'):
        k_neighbors = 10
        # Positional args: (n_neighbors, weights).
        reg = KNeighborsRegressor(k_neighbors, 'uniform')
        # fitting the pattern-label pairs
        reg = reg.fit(X[0:train_to:train_step], Y[0:train_to:train_step])
        y_hat = reg.predict(X[train_to:test_to:test_step])
    else:
        raise Exception("No regressor set.")
    # naive is also known as persistance model.
    naive_hat = zeros(len(y_hat), dtype = float32)
    for i in range(0, len(y_hat)):
        # naive label is the label as horizon time steps before.
        # we have to consider to use only the fifth label here, too.
        naive_hat[i] = Y[train_to + (i * test_step) - horizon]
    # computing the mean squared errors of Linear and naive prediction.
    mse_y_hat, mse_naive_hat = 0, 0
    for i in range(0, len(y_hat)):
        y = Y[train_to + (i * test_step)]
        mse_y_hat += (y_hat[i] - y) ** 2
        mse_naive_hat += (naive_hat[i] - y) ** 2
    mse_y_hat /= float(len(y_hat))
    mse_naive_hat /= float(len(y_hat))
    return mse_y_hat, mse_naive_hat
# Importing the dataset dataset = pd.read_csv('hazelnut.csv') X = dataset.iloc[:, [0,1,3,4,6,7,8,9,10]] y = dataset.iloc[:, 11].values # Splitting the dataset into the Training set and Test set from sklearn.model_selection import train_test_split X_train, X_test, y_train, y_test = train_test_split(X, y, test_size = 0.3, random_state = 0) from sklearn.neighbors import KNeighborsRegressor knn = KNeighborsRegressor(n_neighbors=3) # Fit the classifier to the data knn.fit(X_train,y_train) y_pred = knn.predict(X_test) y_pred knn.score(X_test, y_test) df = pd.DataFrame({'Actual': y_test, 'Predicted': y_pred}) df plot = sns.distplot(y_test, hist=False, color="r", label="Actual Value") sns.distplot(y_pred, hist=False, color="b", label="Fitted Values" , ax=plot) from sklearn import metrics print('Mean Absolute Error:', metrics.mean_absolute_error(y_test, y_pred)) print('Mean Squared Error:', metrics.mean_squared_error(y_test, y_pred)) print('Root Mean Squared Error:', np.sqrt(metrics.mean_squared_error(y_test, y_pred))) plot = sns.distplot(y_test, hist=False, color="r", label="Actual Value")
# Two equal halves of the listings frame for two-way holdout validation.
split_one = dc_listings.iloc[0:1862]
split_two = dc_listings.iloc[1862:]

## 2. Holdout Validation ##
from sklearn.neighbors import KNeighborsRegressor
from sklearn.metrics import mean_squared_error
train_one = split_one
test_one = split_two
train_two = split_two
test_two = split_one
# First half: train on split one, evaluate on split two.
model = KNeighborsRegressor()
model.fit(train_one[["accommodates"]], train_one["price"])
# NOTE(review): assigning a new column on an iloc slice may trigger pandas'
# SettingWithCopyWarning — confirm these slices behave as independent copies.
test_one["predicted_price"] = model.predict(test_one[["accommodates"]])
iteration_one_rmse = mean_squared_error(test_one["price"], test_one["predicted_price"])**(1 / 2)
# Second half: swap the roles of the splits.
model.fit(train_two[["accommodates"]], train_two["price"])
test_two["predicted_price"] = model.predict(test_two[["accommodates"]])
iteration_two_rmse = mean_squared_error(test_two["price"], test_two["predicted_price"])**(1 / 2)
# Average RMSE over both directions.
avg_rmse = np.mean([iteration_two_rmse, iteration_one_rmse])
print(iteration_one_rmse, iteration_two_rmse, avg_rmse)

## 3. K-Fold Cross Validation ##
# Five independent KNN regressors, one per target series, each with its own
# tuned n_neighbors; all share the same training features.
clf_LB=KNeighborsRegressor(n_neighbors=80,weights='uniform',n_jobs=-1)
clf_HB=KNeighborsRegressor(n_neighbors=30,weights='uniform',n_jobs=-1)
clf_TRI=KNeighborsRegressor(n_neighbors=95,weights='uniform',n_jobs=-1)
clf_HDL=KNeighborsRegressor(n_neighbors=35,weights='uniform',n_jobs=-1)
clf_LDL=KNeighborsRegressor(n_neighbors=35,weights='uniform',n_jobs=-1)
clf_LB.fit(X_train,y_train_LB)
clf_HB.fit(X_train,y_train_HB)
clf_TRI.fit(X_train,y_train_TRI)
clf_HDL.fit(X_train,y_train_HDL)
clf_LDL.fit(X_train,y_train_LDL)
# Test-set predictions for each target...
y_pred_LB,y_pred_HB,y_pred_TRI,y_pred_HDL,y_pred_LDL=\
    clf_LB.predict(X_test),clf_HB.predict(X_test),clf_TRI.predict(X_test),\
    clf_HDL.predict(X_test),clf_LDL.predict(X_test)
# ...wrapped as DataFrames.
y_pred_LB,y_pred_HB,y_pred_TRI,y_pred_HDL,y_pred_LDL=\
    pd.DataFrame(y_pred_LB),pd.DataFrame(y_pred_HB),pd.DataFrame(y_pred_TRI),\
    pd.DataFrame(y_pred_HDL),pd.DataFrame(y_pred_LDL)
# Training-set predictions, used for the in-sample error report below.
y_LB,y_HB,y_TRI,y_HDL,y_LDL=\
    clf_LB.predict(X_train),clf_HB.predict(X_train),clf_TRI.predict(X_train),\
    clf_HDL.predict(X_train),clf_LDL.predict(X_train)
# NOTE(review): the labels say "MSE" but mean_squared_log_error is MSLE.
print("MSE_LB",mean_squared_log_error(y_train_LB,y_LB))
print("MSE_HB",mean_squared_log_error(y_train_HB,y_HB))
print("MSE_TRI",mean_squared_log_error(y_train_TRI,y_TRI))
print("MSE_HDL",mean_squared_log_error(y_train_HDL,y_HDL))
# NOTE(review): these first lines continue a scoring helper whose `def` lies
# outside this chunk (r accumulates [MSLE, root MSLE, R2, accuracy%]);
# indentation reconstructed — confirm against the full file.
    r.append(np.sqrt(r[0]))
    r.append(r2_score(y_test, y_pred))
    r.append(round(r2_score(y_test, y_pred) * 100, 4))
    return (r)

""" dataframe that store the performance of each model """
accu = pd.DataFrame(index=['MSLE', 'Root MSLE', 'R2 Score', 'Accuracy(%)'])
""" KNN METHODE @-@ """
#estimating MSLE for k=1-9
R_MSLE = []
for i in range(1, 10):
    KNN = KNeighborsRegressor(n_neighbors=i)
    KNN.fit(X_train, y_train)
    y_pred = KNN.predict(X_test)
    error = np.sqrt(mean_squared_log_error(y_test, y_pred))
    R_MSLE.append(error)
    print("K =", i, " , Root MSLE =", error)
""" plotting error """
curve = pd.DataFrame(R_MSLE) #elbow curve
plt.figure(figsize=(8, 4))
plt.xticks(list(range(1, 10)), list(range(1, 10)), rotation='horizontal')
plt.plot(list(range(1, 10)), R_MSLE)
plt.xlabel('K')
plt.ylabel('MSLE')
plt.title('Error Plot for Each K')
plt.savefig('KNN-Error-Plot.jpg')
plt.show()
""" model implementation """
# Ball-tree KNN with Manhattan distance (p=1), distance-weighted.
regr = KNeighborsRegressor(n_neighbors=5,
                           algorithm='ball_tree',
                           leaf_size=1000,
                           weights='distance',
                           p=1)
# Wrap so each output dimension gets its own fitted regressor.
regr = MultiOutputRegressor(estimator=regr)
# Time the fit.
t0 = time.time()
regr.fit(x_train, y_train)
regr_fit = time.time() - t0
print("Complexity and bandwidth selected and model fitted in %.6f s" % regr_fit)
# Time the prediction.
t0 = time.time()
y_regr = regr.predict(x_test)
regr_predict = time.time() - t0
print("Prediction for %d inputs in %.6f s" % (x_test.shape[0], regr_predict))
# Undo the feature/target scaling so the plot is in original units.
x_test_dim = sc_x.inverse_transform(x_test)
y_test_dim = sc_y.inverse_transform(y_test)
y_regr_dim = sc_y.inverse_transform(y_regr)
plt.scatter(x_test_dim, y_test_dim[:, 5], s=2, c='k', marker='o', label='Matlab')
# NOTE(review): this call is truncated in the source chunk; its remaining
# arguments continue outside this view.
plt.scatter(x_test_dim, y_regr_dim[:, 5],
# Seed both numpy and tensorflow for reproducibility.
numpy.random.seed(seed)
tf.set_random_seed(seed)

# 2. Load the data
print(os.getcwd())
dataset = numpy.loadtxt("./data/pima-indians-diabetes.csv", delimiter=",")
X = dataset[:, 0:8]
Y = dataset[:, 8]

# 3. Configure the model
# model = KNeighborsClassifier(n_neighbors=1)
model = KNeighborsRegressor(n_neighbors=1)
# model = SVC()

# model compile
# loss='binary_crossentropy' => used with a sigmoid output.

# 4. Run the model
model.fit(X, Y)

# 5. Print the results
x_test = X
y_test = Y
y_predict = model.predict(x_test)
# NOTE(review): evaluating on the training set with 1-NN reproduces the
# training labels exactly, so accuracy_score accepts the regressor's float
# output here — confirm this self-test is intended.
print(x_test, "의 예측결과 : ", y_predict)
print("acc = ", accuracy_score(y_test, y_predict))
def k_nearest(X_train, y_train, X_test, y_test, val):
    """Fit a KNN regressor with `val` neighbours and report its metrics.

    Returns the (r2, mse) pair produced by `show_metrics`.
    """
    model = KNeighborsRegressor(n_neighbors=val).fit(X_train, y_train)
    predictions = model.predict(X_test)
    r2, mse = show_metrics('k nearest neighbors regressor', y_test, predictions)
    return r2, mse
# print("------------------------------------------------------") # the best score was obtained with this model # X_new has less features, the least important ones are cut off X_new = SelectKBest(f_regression, k=9).fit_transform(X_total, Y_total) X_train_new, X_test_new, Y_train_new, Y_test_new = train_test_split( X_new, Y_total, test_size=0.25) reg10best = KNeighborsRegressor(n_neighbors=7, weights="distance").fit( X_train_new, Y_train_new) print("KNN regression train score with 10 best features: ", reg10best.score(X_train_new, Y_train_new)) print("KNN regression test score with 10 best features: ", reg10best.score(X_test_new, Y_test_new)) Y_predictions = reg10best.predict(X_test_new) x_ticks = numpy.arange(1., len(Y_predictions) + 1, 1) errors = [] for a, b in zip(Y_predictions, Y_test_new): errors.append(abs(a - b)) # plot true values versus predicted values predictions = mpatches.Patch(color="green", label="Predictions") actual_data = mpatches.Patch(color="cyan", label="Actual data") plt.plot(x_ticks, Y_predictions, "g", x_ticks, Y_test_new, "c") plt.legend(handles=[predictions, actual_data]) plt.title("Predictions compared to actual results") plt.show() # plot error for each point
def data_cleaning(df):
    """Clean the raw fight dataframe for modelling.

    Steps:
      - keep only the desired columns
      - fill missing ages with the column mean
      - impute missing reach values with a KNN model trained on height
      - drop unwanted weight classes and fighters over the 265 lbs limit
      - convert the winner to a boolean and build red-minus-blue deltas
      - one-hot encode the weight class via `dummies`

    Returns the cleaned dataframe.
    """
    import pandas as pd  # local import: needed for pd.concat below

    # Define wanted columns from the original data.
    desired_cols = [
        'Winner', 'weight_class', 'B_age', 'B_Height_cms', 'B_Reach_cms',
        'B_Weight_lbs', 'R_Height_cms', 'R_Reach_cms', 'R_Weight_lbs', 'R_age',
    ]
    # Fix: .copy() so the column assignments below act on a real frame instead
    # of a view (SettingWithCopyWarning / silently lost writes).
    df = df[desired_cols].copy()

    # Fill NaN age values with column means ¯\_(ツ)_/¯
    df["B_age"] = df["B_age"].fillna(df["B_age"].mean())
    df["R_age"] = df["R_age"].fillna(df["R_age"].mean())

    # Replace missing reach values using KNN:
    # combine all B and R (height, reach) pairs into a single master table.
    r_cols = ["R_Height_cms", "R_Reach_cms"]
    b_cols = ["B_Height_cms", "B_Reach_cms"]
    header = ["Height", "Reach"]
    R_heights_to_reach = df[r_cols].copy()
    R_heights_to_reach.columns = header
    B_heights_to_reach = df[b_cols].copy()
    B_heights_to_reach.columns = header
    # Fix: DataFrame.append was removed in pandas 2.0 — use pd.concat.
    MasterHR = pd.concat([R_heights_to_reach, B_heights_to_reach],
                         ignore_index=True)

    # Train the KNN model on rows where both height and reach are known.
    num_neighbors = 3
    trainer = MasterHR.dropna()
    X = np.array(list(trainer["Height"])).reshape(len(trainer), 1)
    y = np.array(list(trainer["Reach"])).reshape(len(trainer), 1)
    nay = KNeighborsRegressor(n_neighbors=num_neighbors).fit(X, y)

    # Replace missing reach values with KNN predictions from height.
    df["R_Reach_cms"] = df.apply(
        lambda x: nay.predict(np.array(x["R_Height_cms"]).reshape(1, 1))[0][0]
        if math.isnan(x["R_Reach_cms"]) else x["R_Reach_cms"],
        axis=1)
    df["B_Reach_cms"] = df.apply(
        lambda x: nay.predict(np.array(x["B_Height_cms"]).reshape(1, 1))[0][0]
        if math.isnan(x["B_Reach_cms"]) else x["B_Reach_cms"],
        axis=1)

    # Remove unwanted weight divisions (rows).
    to_drop = [
        'Open Weight', 'Catch Weight', "Women's Strawweight",
        "Women's Flyweight", "Women's Bantamweight", "Women's Featherweight"
    ]
    for each in to_drop:
        df = df[df["weight_class"] != each]

    # Convert binary winner to boolean (True == "Red" won).
    df["Winner"] = df["Winner"].apply(lambda x: True if x == "Red" else False)

    # Calculate delta values (red winner minus blue loser).
    df["Reach_Delta"] = df["R_Reach_cms"] - df["B_Reach_cms"]
    df["Height_Delta"] = df["R_Height_cms"] - df["B_Height_cms"]
    df["Weight_Delta"] = df["R_Weight_lbs"] - df["B_Weight_lbs"]
    df["age_Delta"] = df["R_age"] - df["B_age"]

    # Drop fighters over the 265 lbs heavyweight limit.
    df = df[df["R_Weight_lbs"] <= 265]
    df = df[df["B_Weight_lbs"] <= 265]

    # Drop the per-corner columns now that the deltas exist.
    cols = [
        "R_Reach_cms", "B_Reach_cms", "R_Height_cms", "B_Height_cms",
        "R_Weight_lbs", "B_Weight_lbs", "R_age", "B_age"
    ]
    df = df.drop(columns=cols)
    df = dummies(df, "weight_class")
    return df
# Error metrics for the earlier (linear) prediction.
print(f"Printing MAE error(avg abs residual): {metrics.mean_absolute_error(y_test, prediction)}")
print(f"Printing MSE error: {metrics.mean_squared_error(y_test, prediction)}")
print(f"Printing RMSE error: {np.sqrt(metrics.mean_squared_error(y_test, prediction))}")
print(f"Printing r2 score linear regression: {metrics.r2_score(y_test, prediction)}")

#########################################################################
# KNN regressor with default settings (5 neighbours, uniform weights).
kreg = KNeighborsRegressor()
kreg.fit(X_train, y_train)
# print(f"Intercept2: {linear.intercept_}\n")
# print(f"Coeficients2: {linear.coef_}\n")
# print(f"Named Coeficients2: {pd.DataFrame(linear.coef_, columns_names)}")
prediction2 = kreg.predict(X_test)

# Per-sample comparison of actual vs predicted values.
for (real, predicted) in list(zip(y_test, prediction2)):
    print(f"Value: {real:.2f}, pred: {predicted:.2f}, diff: {(real - predicted):.2f}")

# Scatter of actual vs predicted, with the y=x reference line.
sns.set(palette="inferno")
sns.scatterplot(y_test, prediction2)
plt.plot([0, 50], [0, 50], '--')
plt.title('(KNeighbors)')
plt.xlabel('Real Value')
plt.ylabel('Predicted Value')
plt.show()

residuals = y_test - prediction2
# NOTE(review): this first line is the tail of a call (presumably eval of a
# dynamically-chosen scaler) that begins above this chunk.
    'preprocessing.{}().fit_transform(XTransaction)'.format(scaler))
# Split both scaled datasets 70/30 into train/test.
XPS_train, XPS_test, yP_train, yP_test = train_test_split(
    XPricingS, yPricing, test_size=0.3)
XTS_train, XTS_test, yT_train, yT_test = train_test_split(
    XTransactionS, yTransaction, test_size=0.3)

# 20-NN regressor on the pricing data; accumulate its score per scaler.
knnP = KNeighborsRegressor(n_neighbors=20)
knnP.fit(XPS_train, yP_train)
# print(knnP.predict(XPS_test[:10]))
# print(np.array(yP_test[:10]))
# print(knnP.score(XPS_test, yP_test))
pricingScores[scalerIndex] += knnP.score(XPS_test, yP_test)

# Predicted vs actual scatter, annotated with the R^2 score, saved per scaler.
plt.scatter(knnP.predict(XPS_test), np.array(yP_test))
plt.title(
    'Prediction on Pricing using Normalization of {}'.format(scaler))
plt.xlabel('Predicted Pricing Price')
plt.ylabel('Actual Pricing Price')
plt.figtext(
    0.6, 0.8,
    'KNN Score: {}'.format(round(knnP.score(XPS_test, yP_test), 3)))
plt.savefig('scaler graphs/{} Prediction'.format(scaler))
plt.clf()

# Same model on the transaction data.
knnT = KNeighborsRegressor(n_neighbors=20)
knnT.fit(XTS_train, yT_train)
# print(knnT.predict(XTS_test[:10]))
# print(np.array(yT_test[:10]))
class CipPredictor(CipDatabase):
    """Extend the database with a predictor.

    Builds pairwise difference feature vectors between CIP graphs stored in
    the database and fits a KNN regressor to predict median scores.
    """

    def __init__(self):
        CipDatabase.__init__(self)
        # Number of scores seen at the time of the last cip_fit() call.
        self.num_scores_fitted = 0
        # Training matrix / targets built by create_features().
        self.X = []
        self.y = []
        #self.predictor = SGDRegressor()
        self.predictor = KNeighborsRegressor()

    def create_features(self):
        """Create feature vectors between database pairs.

        For each (start, end) graph pair the feature is the difference of
        their vectorized representations; the target is the median of the
        recorded scores. Pairs with an all-zero difference are skipped.
        """
        self.X = []
        self.y = []
        self.cip_fvs = self.vectorizer.transform_single(self.cip_graphs)
        for interface, core_start, core_end, scores in self.get_items():
            pos_start = self.graph2position[interface][core_start]
            pos_end = self.graph2position[interface][core_end]
            vector_start = self.cip_fvs[pos_start]
            vector_end = self.cip_fvs[pos_end]
            feature_vector = vector_start - vector_end
            if len(feature_vector.data):
                # Fix: removed unused local `max_drift` (computed but never read).
                score = median(scores)
                self.X.append(feature_vector)
                self.y.append(score)
        self.X = vstack(self.X)

    def cip_fit(self):
        """Fit the predictor on the features built by create_features()."""
        self.predictor.fit(self.X, self.y)
        self.num_scores_fitted = self.num_scores

    def _predicted_cips(self, original_cip, candidate_cips):
        """Return average scores of a list of candidate cips.

        Yields (predicted_score, candidate_cip) pairs, scoring each candidate
        by the difference between the original's and the candidate's vectors.
        """
        original_fv = self.vectorizer.transform_single(original_cip.graph)
        original_fv = original_fv[0]
        candidate_graphs = [candidate_cip.graph for candidate_cip in candidate_cips]
        candidate_fvs = self.vectorizer.transform_single(candidate_graphs)
        pairwise_fvs = [original_fv - candidate_fv for candidate_fv in candidate_fvs]
        pairwise_fvs = vstack(pairwise_fvs)
        y = self.predictor.predict(pairwise_fvs)
        return zip(y, candidate_cips)

    def save_cip_data(self):
        """Save database and feature vectors to files."""
        CipDatabase.save_cip_data(self)
        dump_svmlight_file(self.X, self.y, 'cip_rank_regression.data', zero_based=False)

    def load_cip_data(self):
        """Load database and feature vectors from files."""
        CipDatabase.load_cip_data(self)
        self.X, self.y = load_svmlight_file('cip_rank_regression.data',
                                            n_features=self.vectorizer.feature_size,
                                            zero_based=False)
mpg.dropna(inplace=True) #print(mpg.shape) mpg_target = mpg["mpg"] mpg_target = np.asarray(mpg_target) mpg_data = mpg.iloc[:, 1:7] mpg_train_data, mpg_test_data, mpg_train_target, mpg_test_target = train_test_split(mpg_data, mpg_target, train_size = .7, test_size = 0.3, random_state = 48, shuffle = True) std_scale = preprocessing.StandardScaler().fit(mpg_train_data) mpg_train_data_std = std_scale.transform(mpg_train_data) mpg_test_data_std = std_scale.transform(mpg_test_data) print() for i in range(3,9): regr = KNeighborsRegressor(n_neighbors=i) regr.fit(mpg_train_data_std, mpg_train_target) mpg_prediction = regr.predict(mpg_test_data_std) accuracy = regr.score(mpg_test_data_std, mpg_test_target) #print("When k = {} Accuracy: {:.2f}%".format(i, accuracy * 100)) #The Student Math Data binary = {"school" : {"GP":1, "MS":0}, "sex" : {"M":1, "F":0}, "address" : {"R":1, "U":0}, "famsize" : {"GT3":1, "LE3":0}, "Pstatus" : {"T":1, "A":0}, "schoolsup" : {"yes":1, "no":0}, "famsup" : {"yes":1, "no":0}, "paid" : {"yes":1, "no":0}, "activities" : {"yes":1, "no":0}, "nursery" : {"yes":1, "no":0},
# Load the Boston housing data.
data = load_boston()
# Split the data 75/25 into train/test.
train_x, test_x, train_y, test_y = train_test_split(data.data, data.target, test_size=0.25, random_state=33)

# AdaBoost regression model.
regressor = AdaBoostRegressor()
regressor.fit(train_x, train_y)
pred_y = regressor.predict(test_x)
mse = mean_squared_error(test_y, pred_y)
# Fix: the AdaBoost MSE was computed but never reported before being
# overwritten by the decision-tree result below.
print("AdaBoost 均方误差 = ", round(mse, 2))

# Decision-tree regression model.
dec_regressor = DecisionTreeRegressor()
dec_regressor.fit(train_x, train_y)
pred_y = dec_regressor.predict(test_x)
mse = mean_squared_error(test_y, pred_y)
print(" 决策树均方误差 = ", round(mse, 2))
'''
决策树均方误差 = 28.19
'''

# KNN regression model.
knn_regressor = KNeighborsRegressor()
knn_regressor.fit(train_x, train_y)
pred_y = knn_regressor.predict(test_x)
mse = mean_squared_error(test_y, pred_y)
print("KNN 均方误差 = ", round(mse, 2))
'''
KNN 均方误差 = 27.87
'''
# Create numpy arrays (also normalizes the 'forcast' typo to one name).
x = np.asarray(x, dtype=float)
y = np.asarray(y, dtype=float)
forecast = np.asarray(forecast, dtype=float)

# Give data the correct dimensions (column vectors).
x, y, forecast = x.reshape(len(x), 1), y.reshape(len(y), 1), forecast.reshape(len(forecast), 1)

# Train a 10-neighbour, uniformly weighted KNN regressor.
knnReg = KNeighborsRegressor(n_neighbors=10, weights='uniform').fit(x, y)

# Predict
prediction = knnReg.predict(forecast)

# Write to file
printlist = read.convert(prediction)
read.writeToFile("ForecastTemplate1-kNN.csv", dates, printlist)

# Calculate RMSE.
# Fix: the original used math.pow(2, diff) — i.e. 2**diff — instead of
# squaring the residual, so the reported RMSE was wrong.
sum_errors = 0
for i in range(len(prediction)):
    sum_errors += math.pow(float(prediction[i]) - float(solution[i]), 2)
rmse = math.sqrt(sum_errors / len(prediction))

print(" ")
print("Prediction done using K-Nearest Neighbor")
print("RMSE: " + str(rmse))
from sklearn.tree import DecisionTreeRegressor

# Dense grid of points on [0, 10] with a sine target.
N = 200
X = np.linspace(0, 10, N).reshape(N, 1)
Y = np.sin(X)

# Sample a small training subset.
Ntrain = 20
idx = np.random.choice(N, Ntrain)
Xtrain = X[idx]
Ytrain = Y[idx]

# It weights the neighbors by 'distance' instead
# of just averaging the neighbors.
knn = KNeighborsRegressor(n_neighbors=2, weights='distance')
knn.fit(Xtrain, Ytrain)
# Fix: the original wrote `knn = knn.predict(X)`, which clobbered the model
# and left `Yknn` (plotted below) undefined (NameError).
Yknn = knn.predict(X)

# Because we didn't set max_depth of the tree during
# the training it overfits the training data.
dt = DecisionTreeRegressor()
dt.fit(Xtrain, Ytrain)
Ydt = dt.predict(X)

plt.scatter(Xtrain, Ytrain)  # show the training points
plt.plot(X, Y)  # show the original data
plt.plot(X, Yknn, label='KNN')
plt.plot(X, Ydt, label='Decision Tree')
plt.legend()
plt.show()
#%% kNN Regression - Finding optimal k mse_mean_list = [] mse_var_list = [] r2_mean_list = [] r2_var_list = [] gamma = 25 k_space = [] kf = KFold(n_splits=10, shuffle=True) for k in range(2, 10, 1): mse_list = [] k_space.append(k) for train, test in kf.split(X): model_knn = KNeighborsRegressor(n_neighbors=k, weights='distance').fit( X[train], Y[train]) ypred_knn = model_knn.predict(X[test]) mse_list.append(mean_squared_error(Y[test], ypred_knn)) r2_list.append(r2_score(Y[test], ypred_knn)) mse_mean_list.append(np.mean(mse_list)) mse_var_list.append(np.var(mse_list)) r2_mean_list.append(np.mean(r2_list)) r2_var_list.append(np.var(r2_list)) #%% fig03 = plt.figure(figsize=(10, 5)) ax03 = fig03.add_subplot(1, 1, 1) Ctext03 = "Variation in Mean Squared Error with $k$ for kNN Regression, weighted by distance" ax03.set_title(Ctext03, fontweight="bold", fontsize=13) ax03.set_xlabel('Number of Nearest Neighbours ($k$)', fontweight="bold",
# print("The MSE is:", a, k)
#5 MSE = 33.529

# 75/25 train/test split.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.25, random_state=0)

# Standardize features: fit on the training data only.
# Fix: the original called fit_transform on X_test as well, re-fitting the
# scaler on test statistics (data leakage / inconsistent scaling).
scaler = StandardScaler()
X_train = scaler.fit_transform(X_train)
X_test = scaler.transform(X_test)

# 5-NN regressor with Euclidean distance (p=2).
knnr = KNeighborsRegressor(n_neighbors=5, p=2)
knnr.fit(X_train, y_train)
#y_pred = classifier.predict(X_test)
y_pred = knnr.predict(X_test)
a = mean_squared_error(y_test, y_pred)
print("The MSE is:", a)

# Vote share of incumbent president
fig = plt.figure(figsize=(12, 9))
ax = sns.regplot(y_test, y_pred, marker='o', color='blue')
ax.set_title('KNN Regression', fontsize=20)
ax.set_xlabel('Actual Democratic Vote Share', fontsize=20)
ax.set_ylabel('Predicted Democratic Candidate Vote Share', fontsize=20)
plt.show()

#>> Final Prediction <<
#Predict out of sample data
#mdl2 = KNeighborsRegressor(5); #mdl2.fit(X2,Y2); #print(mdl2.score(tX2,tY2)) #print(mdl2.score(X2,Y2)) #print(mdl.score(tX2,tY2)) #plt.scatter(T,P, color='blue') #plt.plot([0,40],[b,m*40+b],'r') #plt.title('Vasai Median Prices', fontsize = 15) #plt.xlabel('Quarter (0 = 2009-Q1, 40 = 2022-Q2)', fontsize = 15) #plt.ylabel('rs/sq.ft', fontsize = 15) #plt.show() tYY = tY.values hy = mdl.predict(tX) dtot = 0 tot = 0 mae_num = 0 mae_den = 0 mape = 0 for i in range(0, 1026): cu = 0 if (tYY[i][0] > hy[i][0]): cu = (tYY[i][0] - hy[i][0]) else: cu = -(tYY[i][0] - hy[i][0]) mae_num = mae_num + cu mae_den = mae_den + tYY[i][0]
#X_test, y_test = X[offset:], y[offset:] # We will change k from 1 to 30 k_range = arange(1, 30) train_err = zeros(len(k_range)) test_err = zeros(len(k_range)) for i, k in enumerate(k_range): # Set up a KNN model that regressors over k neighbors neigh = KNeighborsRegressor(n_neighbors=k) # Fit the learner to the training data neigh.fit(X_train, y_train) # Find the MSE on the training set train_err[i] = mean_squared_error(y_train, neigh.predict(X_train)) # Find the MSE on the testing set test_err[i] = mean_squared_error(y_test, neigh.predict(X_test)) # Plot training and test error as a function of k pl.figure() pl.title('kNN: Error as a function of k') pl.plot(k_range, test_err, lw=2, label = 'test error') pl.plot(k_range, train_err, lw=2, label = 'training error') pl.legend() pl.xlabel('k') pl.ylabel('RMS error') pl.show() # <headingcell level=3>
thirdAirline = input('-->')

# Setting up data and target values
data = dataset.iloc[:, [4, 5, 7, 8, 15]]
target = dataset.iloc[:, 10:12].values

# Unique test cases
predictX = [
    [35.220448, -80.94377, 40.77289, -73.868805, airlineMap[firstAirline]],
    [47.44359, -122.302505, 33.640545, -84.43341, airlineMap[secondAirline]],
    [40.69297, -74.17799, 37.616714, -122.38709, airlineMap[thirdAirline]],
]

# Here we are using a KNN Regression to deal with the multitarget output
# If there was not a multitarget output, a normal KNN Classifier would have worked fine
# Using 1 neighbor to guarantee that the resulting layover shows an actual airport location
knn = KNeighborsRegressor(n_neighbors=1)
knn.fit(data, target)
predictionResults = knn.predict(predictX)

# Demonstration of the results.
# Fix: the original condition was a syntax error ("== 'ZZZ' || :");
# 'ZZZ' is the sentinel meaning no layover airport was found.
for i in range(3):
    if airportMap[predictionResults[i][0]] == 'ZZZ':
        airportMap[predictionResults[i][0]] = 'no airport!'

print("Flying from CLT to LGA on", firstAirline, "would likely result in a layover in", airportMap[predictionResults[0][0]])
print("Flying from SEA to ATL on", secondAirline, "would likely result in a layover in", airportMap[predictionResults[1][0]])
# Fix: the third result corresponds to thirdAirline, not secondAirline.
print("Flying from EWR to SFO on", thirdAirline, "would likely result in a layover in", airportMap[predictionResults[2][0]])
# Feature Scaling from sklearn.preprocessing import StandardScaler sc = StandardScaler() X_train = sc.fit_transform(X_train) X_test = sc.transform(X_test) from sklearn.metrics import mean_squared_error from sklearn.metrics import r2_score #KNN REGRESSOR from sklearn.neighbors import KNeighborsRegressor neigh = KNeighborsRegressor(n_neighbors=10, metric='chebyshev') neigh.fit(X_train, y_train) knn_pred = neigh.predict(X_test) mean_squared_error(y_test, knn_pred) #random forest from sklearn.ensemble import RandomForestRegressor regressor1 = RandomForestRegressor(n_estimators=250, random_state=0) regressor1.fit(X_train, y_train) y_pred2 = regressor1.predict(X_test) mean_squared_error(y_test, y_pred2) from sklearn.linear_model import LinearRegression regressor2 = LinearRegression() regressor2.fit(X_train, y_train) y_pred3 = regressor2.predict(X_test)
# Load the dataset (decimal commas) and discard incomplete rows.
df = pd.read_csv('dataset.csv', delimiter=',', decimal=',').dropna()
print(df.columns)

# Target is the silica concentrate; features are everything else except the date.
Y = df['% Silica Concentrate']
X = df.drop(['% Silica Concentrate', 'date'], axis=1)
X_train, X_test, Y_train, Y_test = train_test_split(X, Y, test_size=0.3, random_state=42)

# Time the model fit.
start_time = time.time()
model = KNeighborsRegressor(n_neighbors=5).fit(X_train, Y_train)
end_time = time.time()
print("Time elapsed: ", end_time - start_time)

# Time the prediction step separately.
Y_pred = model.predict(X_test)
print("Time elapsed: ", time.time() - end_time)

# Report the test-set mean squared error.
error = mean_squared_error(Y_pred, Y_test)
print(error)
def run_models(grid_y, grid_x):
    """Train several regressors on one grid cell, pickle them, and return
    per-model MAE, RMSE, predictions and the held-out test table."""
    X, Y = create_training_and_testing_data(grid_x, grid_y)
    data = Table(X, Y)
    # print(data.Y)
    # np.savetxt('data/' + str(grid_x) + '_' + str(grid_y) + '.csv', np.array(data), delimiter=',', fmt='%10.5f')
    # print(out_data.domain)
    # print(out_data.Y)
    # feature_method = og.preprocess.score.UnivariateLinearRegression()
    # selector = og.preprocess.SelectBestFeatures(method=feature_method, k=10)
    # out_data2 = selector(data)
    # plot_input(out_data2.X, out_data2.Y)
    # print(out_data2.domain)
    # pca = PCA(n_components=5)
    # model = pca(out_data2)
    # out_data = model(out_data2)
    # print(out_data.domain)
    # Hold out 60 random rows for testing; everything else is training data.
    test = og.data.Table(data.domain, random.sample(data, 60))
    train = og.data.Table(data.domain, [d for d in data if d not in test])
    # Orange learners (called on the training table to produce regressors).
    lin = og.regression.linear.LinearRegressionLearner()
    rf = og.regression.random_forest.RandomForestRegressionLearner()
    nnr = og.regression.NNRegressionLearner()
    svm = og.regression.SVRLearner()
    # sklearn KNN is fitted separately on the raw arrays below.
    knn = KNeighborsRegressor(n_neighbors=3)
    learners = [lin, rf, nnr, svm]
    regressors = [learner(train) for learner in learners]
    knn.fit(train.X, train.Y)
    # NOTE(review): these pickles store the unfitted learners (lin, rf, ...),
    # not the fitted `regressors` — confirm this is intended.
    with open(
            "models1155_1683_1s/" + str(grid_x) + "_" + str(grid_y) +
            "_lin.pickle", "wb") as f:
        pickle.dump(lin, f)
    with open(
            "models1155_1683_1s/" + str(grid_x) + "_" + str(grid_y) +
            "_rf.pickle", "wb") as f:
        pickle.dump(rf, f)
    with open(
            "models1155_1683_1s/" + str(grid_x) + "_" + str(grid_y) +
            "_nnr.pickle", "wb") as f:
        pickle.dump(nnr, f)
    with open(
            "models1155_1683_1s/" + str(grid_x) + "_" + str(grid_y) +
            "_svm.pickle", "wb") as f:
        pickle.dump(svm, f)
    with open(
            "models1155_1683_1s/" + str(grid_x) + "_" + str(grid_y) +
            "_knn.pickle", "wb") as f:
        pickle.dump(knn, f)
    # print((r(test)[0] for r in regressors))
    # Collect predictions from each fitted regressor on the test table.
    linPredict = regressors[0](test)
    rfPredict = regressors[1](test)
    nnrPredict = regressors[2](test)
    svmPredict = regressors[3](test)
    knnPredict = knn.predict(test.X)
    predictions = []
    predictions.append(linPredict)
    predictions.append(rfPredict)
    predictions.append(nnrPredict)
    predictions.append(svmPredict)
    predictions.append(knnPredict)
    # print(knnPredict)
    # print("y ", " ".join("%5s" % l.name for l in regressors))
    # for d in test:
    #     print(("{:<5}" + " {:5.1f}" * len(regressors)).format(d.get_class(), *(r(d)[0] for r in regressors)))
    # res = og.evaluation.CrossValidation(test, learners, k=10)
    # rmse = og.evaluation.RMSE(res)
    # mae = og.evaluation.MAE(res)
    # r2 = og.evaluation.R2(res)
    # Per-model RMSE and MAE on the held-out rows (same order as predictions).
    rmse = []
    mae = []
    rmse.append(math.sqrt(mean_squared_error(test.Y, linPredict)))
    rmse.append(math.sqrt(mean_squared_error(test.Y, rfPredict)))
    rmse.append(math.sqrt(mean_squared_error(test.Y, nnrPredict)))
    rmse.append(math.sqrt(mean_squared_error(test.Y, svmPredict)))
    rmse.append(math.sqrt(mean_squared_error(test.Y, knnPredict)))
    mae.append(mean_absolute_error(test.Y, linPredict))
    mae.append(mean_absolute_error(test.Y, rfPredict))
    mae.append(mean_absolute_error(test.Y, nnrPredict))
    mae.append(mean_absolute_error(test.Y, svmPredict))
    mae.append(mean_absolute_error(test.Y, knnPredict))
    return np.array(mae), np.array(rmse), np.array(predictions), test
def run_baseline_model(epa_data, modis_means):
    """Nearest-station PM2.5 baseline combined with MODIS band means.

    Per date: predicts each test station's PM2.5 from its nearest training
    station (1-NN, haversine on lat/lon in radians). Those predictions are
    then combined with per-site MODIS green/blue means in a linear
    regression, and the overall test MSE is printed.
    """
    # Unique measurement dates in the EPA data.
    dates = set()
    for date in epa_data['Date']:
        dates.add(date)
    MSE = 0
    num_predictions = 0
    # Accumulators across all dates (flattened after the loop).
    all_date_y_train = []
    all_date_y_test = []
    all_date_y_pred = []
    all_date_y_train_preds = []
    all_date_epa_site_train_order = []
    all_date_epa_site_test_order = []
    all_dates_train = []
    all_dates_test = []
    # Goes date by date to get MSE
    # Each date has a list of stations that have measurements from that date
    for idx, date in enumerate(dates):
        if idx % 10 == 0:
            print("Processing date {}: {} ".format(idx, date))
        date_df = epa_data[epa_data['Date'] == date]
        # X info is latitude, longitude; y is PM2.5; epa_site_ids tracks corresponding site_ids
        X = []
        y = []
        epa_site_ids = []
        cur_date = []
        for i in range(len(date_df)):
            # Radians because the haversine metric below expects them.
            lat = np.radians(date_df['SITE_LATITUDE'][date_df.index[i]])
            long = np.radians(date_df['SITE_LONGITUDE'][date_df.index[i]])
            pm = date_df['Daily Mean PM2.5 Concentration'][date_df.index[i]]
            epa_site_id = date_df['Site ID'][date_df.index[i]]
            X.append([lat, long])
            y.append(pm)
            epa_site_ids.append(epa_site_id)
            cur_date.append(date)
        # Shuffle data and split into train/test sets.
        # The three splits share the same (shuffled) X, so they stay aligned.
        X, y, epa_site_ids = shuffle(X, y, epa_site_ids)
        X_train, X_test, y_train, y_test = train_test_split(
            X, y, test_size=0.3, shuffle=False)  # already shuffled
        X_train_, X_test_, epa_site_train, epa_site_test = train_test_split(
            X, epa_site_ids, test_size=0.3, shuffle=False)
        _, _, cur_date_train, cur_date_test = train_test_split(X,
                                                               cur_date,
                                                               test_size=0.3,
                                                               shuffle=False)
        # nearest neighbors, as determined by haversine (distance between latitude,longitude coordinate pairs)
        knn = KNeighborsRegressor(n_neighbors=1, metric="haversine")
        knn.fit(X_train, y_train)
        y_pred = knn.predict(X_test)
        # Combine PM prediction from nearest neighbor with 2x2 aod data in simple linear regression model
        # Get the nearest neighbors of train data
        # NOTE(review): kneighbors on the training points themselves returns
        # each point as its own nearest neighbor (distance 0) — confirm the
        # intent was to exclude self-matches.
        y_train_nn_indices = knn.kneighbors(X_train)[1]
        y_train_nn_indices = [y for x in y_train_nn_indices
                              for y in x]  # flatten the list
        y_train_preds = np.asarray(y_train)[y_train_nn_indices]
        all_date_y_train_preds.append(y_train_preds.tolist())
        all_date_y_pred.append(y_pred)
        all_date_y_train.append(y_train)
        all_date_y_test.append(y_test)
        all_date_epa_site_train_order.append(epa_site_train)
        all_date_epa_site_test_order.append(epa_site_test)
        all_dates_train.append(cur_date_train)
        all_dates_test.append(cur_date_test)
    # Flatten all per-date lists into flat lists over all dates.
    all_date_y_train_preds = flatten(all_date_y_train_preds)
    all_date_y_pred = flatten(all_date_y_pred)
    all_date_y_train = flatten(all_date_y_train)
    all_date_y_test = flatten(all_date_y_test)
    all_date_epa_site_train_order = flatten(all_date_epa_site_train_order)
    all_date_epa_site_test_order = flatten(all_date_epa_site_test_order)
    all_dates_train = flatten(all_dates_train)
    all_dates_test = flatten(all_dates_test)
    # First LR feature: the 1-NN PM2.5 prediction (as a column vector).
    X_aod_train = np.asarray(all_date_y_train_preds).reshape(-1, 1)
    X_aod_test = np.asarray(all_date_y_pred).reshape(-1, 1)
    num_sites_for_all_dates_train = len(all_date_epa_site_train_order)
    num_sites_for_all_dates_test = len(all_date_epa_site_test_order)
    green_means_train = np.zeros((num_sites_for_all_dates_train, 1))
    blue_means_train = np.zeros((num_sites_for_all_dates_train, 1))
    green_means_test = np.zeros((num_sites_for_all_dates_test, 1))
    blue_means_test = np.zeros((num_sites_for_all_dates_test, 1))
    print("Beginning mean lookup")
    # Look up the MODIS green/blue band means for every (date, site) pair.
    for idx, epa_site in enumerate(all_date_epa_site_train_order):
        modis_filename = epa_to_modis_file_name(all_dates_train[idx], epa_site)
        modis_row = modis_means[modis_means['Filename'] == modis_filename]
        green_mean = modis_row['Green mean'][modis_row.index[0]]
        blue_mean = modis_row['Blue mean'][modis_row.index[0]]
        green_means_train[idx] = green_mean
        blue_means_train[idx] = blue_mean
    for idx, epa_site in enumerate(all_date_epa_site_test_order):
        modis_filename = epa_to_modis_file_name(all_dates_test[idx], epa_site)
        modis_row = modis_means[modis_means['Filename'] == modis_filename]
        green_mean = modis_row['Green mean'][modis_row.index[0]]
        blue_mean = modis_row['Blue mean'][modis_row.index[0]]
        green_means_test[idx] = green_mean
        blue_means_test[idx] = blue_mean
    print("Finished mean lookup")
    # Final LR features: [1-NN prediction, green mean, blue mean].
    X_aod_train = np.concatenate(
        (X_aod_train, green_means_train, blue_means_train), axis=1)
    X_aod_test = np.concatenate(
        (X_aod_test, green_means_test, blue_means_test), axis=1)
    print("Training LR")
    reg = LinearRegression().fit(X_aod_train, all_date_y_train)
    r2_score_train = reg.score(X_aod_train, all_date_y_train)
    r2_score_test = reg.score(X_aod_test, all_date_y_test)
    print("R2 train: {}".format(r2_score_train))
    print("R2 test: {}".format(r2_score_test))
    y_pred_lr = reg.predict(X_aod_test)
    diff = np.square(np.asarray(y_pred_lr) - np.asarray(all_date_y_test))
    MSE = diff.sum()
    num_predictions = len(all_date_y_test)
    #print("Adding squared error of {} for date {}.".format(diff.sum()/len(y_test), date))
    MSE = MSE / num_predictions
    print("Mean squared error across all dates: {}".format(MSE))
# NOTE(review): these first statements are the tail of addFeatures(), whose
# `def` line is above this chunk; the +1 denominators avoid division by zero.
df['healsPerWalkDistance'].fillna(0, inplace=True)
df['healsAndBoostsPerWalkDistance'] = df['healsAndBoosts'] / (
    df['walkDistance'] + 1)
df['healsAndBoostsPerWalkDistance'].fillna(0, inplace=True)
df['killsPerWalkDistance'] = df['kills'] / (df['walkDistance'] + 1)
df['killsPerWalkDistance'].fillna(0, inplace=True)
return df

# Build the engineered feature sets for train and test.
train = addFeatures(pd.read_csv('inputs/train_V2.csv'))
test = addFeatures(pd.read_csv('inputs/test_V2.csv'))

from sklearn.neighbors import KNeighborsRegressor

# Fit 3-NN on the first 700k rows of the engineered features.
neigh = KNeighborsRegressor(n_neighbors=3)
neigh.fit(
    train[[
        'weaponsAcquired', 'killPlace', 'totalDistance',
        'killsPerWalkDistance', 'healsAndBoostsPerWalkDistance'
    ]][:700000], train['winPlacePerc'][:700000])

# Evaluate on a disjoint slice of the training data (rows 800k-890k).
predcited = neigh.predict(train[[
    'weaponsAcquired', 'killPlace', 'totalDistance', 'killsPerWalkDistance',
    'healsAndBoostsPerWalkDistance'
]][800000:890000])

from sklearn.metrics import explained_variance_score
EVS = explained_variance_score(train['winPlacePerc'][800000:890000], predcited)
print(EVS)
# NOTE(review): the matrix below is pasted console output (a confusion
# matrix), not valid Python — it should be inside a comment or removed.
[[10191 0 0 0 0 0]
 [ 1 11 0 0 0 0]
 [ 0 0 43 0 0 0]
 [ 3 0 2 2065 0 0]
 [ 0 0 0 0 731 0]
 [ 0 0 0 0 0 2]]

from sklearn.metrics import plot_confusion_matrix
plot_confusion_matrix(model_DT,x_test_std,y_test)
## for confusion matrix plot, refer to coding on Google Colab ##

#4) Prediction on unknown data
#4.1) Using Predict() function with Decision Trees
from sklearn.tree import DecisionTreeRegressor
model_DTR = DecisionTreeRegressor(max_depth=5).fit(x_train,y_train)
DT_predict = model_DTR.predict(x_test) #Predictions on Testing data
print(DT_predict)
## [1.0020645 4. 1.0020645 ... 1.0020645 1.0020645 1. ]

#4.2) Using Predict() function with KNN
from sklearn.neighbors import KNeighborsRegressor
KNN_model = KNeighborsRegressor(n_neighbors=3).fit(x_train,y_train)
## [1. 4. 1. ... 1. 1. 1.]
KNN_predict = KNN_model.predict(x_test) #Predictions on Testing data
print(KNN_predict)
'''
Created on 2017. 8. 6.
@author: jaehyeong
'''
import mglearn
from sklearn.neighbors import KNeighborsRegressor
from sklearn.model_selection import train_test_split

# dataset: mglearn's synthetic 1-D wave regression data.
X, y = mglearn.datasets.make_wave(n_samples=40)
# train & test split (fixed seed for reproducibility).
X_train, X_test, y_train, y_test = train_test_split(X, y, random_state = 0)

# Create the regressor with k = 3.
reg = KNeighborsRegressor(n_neighbors=3)
# Fit on the training set.
reg.fit(X_train, y_train)
# Predictions on the test set.
print('test set 예측 : ',reg.predict(X_test))
# Score (R^2: coefficient of determination).
print('test set R^2 : ', reg.score(X_test, y_test))
# 0.834417244625
def regression_data():
    """Demonstrate KNN regression on a synthetic 1-D problem.

    Shows: (1) the raw data, (2) predictions for K=1 and K=3 on a subsample,
    and (3) train/test R^2 for K in {1, 3, 7, 15, 55} on the full split.
    """
    cmap_bold = ListedColormap(['#FFFF00', '#00FF00', '#0000FF', '#000000'])
    # Plot the raw one-feature regression dataset.
    plt.figure(1)
    plt.title('Sample regression problem with one input variable')
    X_R1, y_R1 = make_regression(n_samples=100,
                                 n_features=1,
                                 n_informative=1,
                                 bias=150.0,
                                 noise=30,
                                 random_state=0)
    plt.scatter(X_R1, y_R1, marker='o', s=50)
    plt.show()

    # Baseline 5-NN fit and test score on the full data split.
    X_train, X_test, y_train, y_test = train_test_split(X_R1,
                                                        y_R1,
                                                        random_state=0)
    knnreg = KNeighborsRegressor(n_neighbors=5).fit(X_train, y_train)
    print(knnreg.predict(X_test))
    print('R-squared test score: {:.3f}'.format(knnreg.score(X_test, y_test)))

    # Compare K=1 vs K=3 predictions on a regular input grid, trained on a
    # 1-in-5 subsample of the data.
    fig, subaxes = plt.subplots(1, 2, figsize=(8, 4))
    X_predict_input = np.linspace(-3, 3, 50).reshape(-1, 1)
    X_train, X_test, y_train, y_test = train_test_split(X_R1[0::5],
                                                        y_R1[0::5],
                                                        random_state=0)
    for thisaxis, K in zip(subaxes, [1, 3]):
        knnreg = KNeighborsRegressor(n_neighbors=K).fit(X_train, y_train)
        y_predict_output = knnreg.predict(X_predict_input)
        thisaxis.set_xlim([-2.5, 0.75])
        thisaxis.plot(X_predict_input,
                      y_predict_output,
                      '^',
                      markersize=10,
                      label='Predicted',
                      alpha=0.8)
        thisaxis.plot(X_train, y_train, 'o', label='True Value', alpha=0.8)
        thisaxis.set_xlabel('Input feature')
        thisaxis.set_ylabel('Target value')
        thisaxis.set_title('KNN regression (K={})'.format(K))
        thisaxis.legend()
    plt.tight_layout()
    plt.show()

    # Show how train/test R^2 changes as K grows (under/over-smoothing).
    fig, subaxes = plt.subplots(5, 1, figsize=(5, 20))
    X_predict_input = np.linspace(-3, 3, 500).reshape(-1, 1)
    X_train, X_test, y_train, y_test = train_test_split(X_R1,
                                                        y_R1,
                                                        random_state=0)
    for thisaxis, K in zip(subaxes, [1, 3, 7, 15, 55]):
        knnreg = KNeighborsRegressor(n_neighbors=K).fit(X_train, y_train)
        y_predict_output = knnreg.predict(X_predict_input)
        train_score = knnreg.score(X_train, y_train)
        test_score = knnreg.score(X_test, y_test)
        thisaxis.plot(X_predict_input, y_predict_output)
        thisaxis.plot(X_train, y_train, 'o', alpha=0.9, label='Train')
        thisaxis.plot(X_test, y_test, '^', alpha=0.9, label='Test')
        thisaxis.set_xlabel('Input feature')
        thisaxis.set_ylabel('Target value')
        thisaxis.set_title(
            'KNN Regression (K={}) Train $R^2 = {:.3f}$, Test $R^2 = {:.3f}$'.
            format(K, train_score, test_score))
        thisaxis.legend()
    plt.tight_layout(pad=0.4, w_pad=0.5, h_pad=1.0)
    plt.show()
# NOTE(review): this first line is the tail of an apply(...) call started
# above this chunk — per-row Euclidean distance to LeBron's normalized stats.
    lambda row: distance.euclidean(row, lebron_normalized), axis=1)
# Sort by distance; index 0 is LeBron himself, so take the next closest row.
sort_distances = euclidean_distances.sort_values()
closest = sort_distances.iloc[1:2]
most_similar_to_lebron = nba.iloc[closest.index[0]]['player']
print(most_similar_to_lebron)

## 7. Using sklearn ##
# The columns that we will be making predictions with.
x_columns = [
    'age', 'g', 'gs', 'mp', 'fg', 'fga', 'fg.', 'x3p', 'x3pa', 'x3p.', 'x2p',
    'x2pa', 'x2p.', 'efg.', 'ft', 'fta', 'ft.', 'orb', 'drb', 'trb', 'ast',
    'stl', 'blk', 'tov', 'pf'
]
# The column that we want to predict.
y_column = ["pts"]

from sklearn.neighbors import KNeighborsRegressor
# Create the knn model.
knn = KNeighborsRegressor(n_neighbors=5)
# Fit the model on the training data.
knn.fit(train[x_columns], train[y_column])
# Make predictions on the test set using the fit model.
predictions = knn.predict(test[x_columns])

## 8. Computing error ##
# Mean squared error between predicted and actual points.
actual = test[y_column]
mse = (((predictions - actual)**2).sum()) / len(predictions)
print(mse)