def fit_KNeighbors(features_train, labels_train, features_pred, n_neighbors=5):
    """Fit a k-NN regressor on the training data and predict for features_pred.

    Also prints the R^2 score computed on the *training* set (an optimistic
    estimate; kept to preserve the original behaviour).

    Returns the predicted labels for features_pred.
    """
    model = KNeighborsRegressor(n_neighbors=n_neighbors)
    model.fit(features_train, labels_train)
    labels_pred = model.predict(features_pred)
    # R^2 on the data the model was fit on -- not a held-out score.
    score = model.score(features_train, labels_train)
    # BUGFIX: converted the Python 2 print statement to the print() function.
    print("KNeighbors - coefficient of determination R^2 of the prediction: ", score)
    return labels_pred
def fill_income(df):
    """Impute missing `monthly_income` values with a 2-NN regression on the
    loan-count columns.

    Rows with a known income train the imputer; rows with a missing income
    receive the predicted value. Returns a new DataFrame containing both
    groups (known-income rows first).
    """
    import pandas as pd  # local import keeps this fix self-contained

    income_imputer = KNeighborsRegressor(n_neighbors=2)
    # Split rows with a known income (training set) from rows to impute.
    df_w_monthly_income = df[df.monthly_income.notnull()].copy()
    df_w_null_monthly_income = df[df.monthly_income.isnull()].copy()
    cols = ["number_real_estate_loans_or_lines",
            "number_of_open_credit_lines_and_loans"]
    income_imputer.fit(df_w_monthly_income[cols],
                       df_w_monthly_income.monthly_income)
    new_values = income_imputer.predict(df_w_null_monthly_income[cols])
    df_w_null_monthly_income.loc[:, "monthly_income"] = new_values
    # BUGFIX: DataFrame.append was removed in pandas 2.0; concat replaces it.
    df2 = pd.concat([df_w_monthly_income, df_w_null_monthly_income])
    return df2
def calc_linear_regression(reg_training_path):
    """5-fold cross-validate a 10-NN regressor on blended recommender features.

    Each dataset record holds [cf_item, cf_user, svd, content_item,
    actual_rating]; the first four are features, the fifth is the target.
    Returns the mean per-fold RMSE.
    """
    dataset = read_reg_train_data(reg_training_path)
    rmse = 0
    n_folds = 5
    # BUGFIX: KFold(n=..., n_folds=...) was removed in scikit-learn 0.20;
    # the modern API takes n_splits and yields index pairs via split().
    folds = KFold(n_splits=n_folds, shuffle=False).split(dataset)
    fold = 0
    for train_indices, test_indices in folds:
        fold += 1
        training_set = [dataset[i] for i in train_indices]
        test_set = [dataset[i] for i in test_indices]
        training_dataframe = get_data_frame(training_set)
        test_dataframe = get_data_frame(test_set)
        column_names = ['cf_item', 'cf_user', 'svd', 'content_item', 'actual_rating']
        training_dataframe.columns = column_names
        test_dataframe.columns = column_names
        actual_rating_training_column = training_dataframe['actual_rating']
        # Drop the target column before fitting/predicting.
        training_dataframe = training_dataframe.drop('actual_rating', axis=1)
        test_dataframe = test_dataframe.drop('actual_rating', axis=1)
        neigh = KNeighborsRegressor(n_neighbors=10)
        neigh.fit(training_dataframe, actual_rating_training_column)
        predict_set = neigh.predict(test_dataframe)
        print(predict_set)
        # Accumulate this fold's RMSE against the recorded actual ratings.
        rmse += mean_squared_error([rec[4] for rec in test_set],
                                   [rec for rec in predict_set]) ** 0.5
        print("Fold (%d) finished with accumulated RMSE of (%f) (%s)"
              % (fold, rmse, time.strftime('%y_%m_%d_%H_%M_%S')))
    return rmse / float(n_folds)
def run_kNeighbors(distances, loadings, test_vars,
                   weightings=('uniform',), k_list=(3,)):
    """
    Run K-nearest neighbors using precomputed distances to create an
    ontological mapping.

    Args:
        distances: square distance matrix to pass to KNeighborsRegressors
        loadings: loading matrix for training
        test_vars: variable to reconstruct
        weightings: (optional) list of weightings to pass to KNeighbors
        k_list: list of k values to pass to KNeighbors as n_neighbors

    BUGFIX: the default was ``k_list=(3)``, which is the *int* 3 (not a
    one-element tuple) and raises TypeError when iterated; ``(3,)`` restores
    the documented list-of-k behaviour.
    """
    train_distances = distances.loc[loadings.index, loadings.index]
    test_distances = distances.loc[test_vars, loadings.index]
    to_return = pd.DataFrame()
    for weighting in weightings:
        for k in k_list:
            clf = KNeighborsRegressor(metric='precomputed',
                                      n_neighbors=k, weights=weighting)
            clf.fit(train_distances, loadings)
            out = clf.predict(test_distances)
            out = pd.DataFrame(out, columns=loadings.columns)
            out['var'] = test_vars
            out['k'] = k
            out['weighting'] = weighting
            # Record the neighbors used and their distances for inspection.
            neighbors = clf.kneighbors(test_distances)
            out['distances'] = tuple(neighbors[0])
            out['neighbors'] = tuple(test_distances.columns[neighbors[1]])
            to_return = pd.concat([to_return, out], sort=False)
    return to_return
def knn_model(train, y_train, test):
    """Distance-weighted 10-NN regression on `train`/`y_train`.

    Predicts for `test` and clips negative predictions to zero before
    returning them.
    """
    regressor = KNeighborsRegressor(n_neighbors=10, weights='distance', n_jobs=-1)
    regressor.fit(train, y_train)
    predictions = regressor.predict(test)
    # Negative outputs make no sense for this target -- clamp them to 0.
    predictions[predictions < 0] = 0
    return predictions
def main(featureFile, outputfolder):
    """Train a 10-NN regressor on student-step data and report RMSE.

    Reads a tab-separated feature file, holds out one third of the rows as a
    test set, predicts 'Correct First Attempt', writes the test rows'
    (student id, correct-first-attempt) pairs to `outputfolder`, and prints
    the root-mean-squared error of the predictions.
    """
    with open(featureFile, 'r') as csvfile:
        my_data = pd.read_csv(csvfile, delimiter="\t", low_memory=False)
    random_indices = permutation(my_data.index)
    # Hold out one third of the rows for testing.
    test_cutoff = math.floor(len(my_data) / 3)
    # BUGFIX: the original used the *entire* dataset as the test set
    # (`test = my_data`), overlapping the training rows.
    test = my_data.loc[random_indices[:test_cutoff]]
    # Generate the training set with the rest of the data.
    train = my_data.loc[random_indices[test_cutoff:]]
    # BUGFIX: x_columns was built from boolean expressions like
    # "Row"=="1" (always False -> int 0), yielding nonsense column indices.
    # Use the actual feature column names instead.
    x_columns = ["Row", "Student ID", "Problem Hierarchy", "Problem Name",
                 "Problem View", "Step Name", "KC(Default)",
                 "Opportunity (Default)"]
    # y column is the predicted feature: the correct first attempt.
    y_column = ["Correct First Attempt"]
    # Ten nearest neighbors, to offset potential noise in the data.
    knn = KNeighborsRegressor(n_neighbors=10)
    knn.fit(train[x_columns], train[y_column])
    # Point predictions on the test set using the fitted model.
    predictions = knn.predict(test[x_columns])
    actual = test[y_column]
    result = test[['Anon Student Id', 'Correct First Attempt']]
    result.to_csv(outputfolder, sep='\t')
    # Root mean squared error of the predictions.
    rmse = math.sqrt((((predictions - actual) ** 2).sum()) / len(predictions))
    print('RMSE=')
    print(rmse)
def apply_knn():
    """Fit a default KNeighborsRegressor on the module-level (Xtr, Ytr) data
    and evaluate it on (Xte, Yte).

    Returns a (predictions, test-set MSE) pair.
    """
    model = KNeighborsRegressor()
    model.fit(Xtr, Ytr)
    predictions = model.predict(Xte)
    mse = mean_squared_error(Yte, predictions)
    return predictions, mse
def transform(self, X, y=None):
    """Impute NaNs in X via per-column k-NN regression on the complete rows.

    :param X: multidimensional numpy array like.
    :param y: unused; kept for scikit-learn transformer compatibility.
    :returns: X with every NaN entry replaced in place by a k-NN prediction
        trained on the fully observed rows (k = self.k).
    """
    rows, features = X.shape
    # Boolean row mask: True where the row contains at least one NaN.
    # (Replaces the original py2-style map/reduce over np.isnan rows.)
    mask = np.isnan(X).any(axis=1)
    criteria_for_bad = np.where(mask)[0]
    criteria_for_good = np.where(~mask)[0]
    X_bad = X[criteria_for_bad]
    X_good = X[criteria_for_good]
    knn = KNeighborsRegressor(n_neighbors=self.k)
    for idx, x_bad in zip(criteria_for_bad.tolist(), X_bad):
        missing = np.isnan(x_bad)
        bad_dim = np.where(missing)[0]
        good_dim = np.where(~missing)[0]
        for d in bad_dim:
            # Train on the observed dimensions, predict the missing one.
            knn.fit(X_good[:, good_dim], X_good[:, d])
            # BUGFIX: predict() expects a 2-D (n_samples, n_features) array;
            # the original passed the 1-D vector x_bad[good_dim].
            X[idx, d] = knn.predict(x_bad[good_dim].reshape(1, -1))[0]
    return X
def kNN(X_train, y_train, X_test, y_test, uselog=False):
    """Standardise the features, fit a 550-NN regressor, and predict X_test.

    :param X_train: training features
    :param y_train: training targets
    :param X_test: features to predict
    :param y_test: unused; kept for signature compatibility
    :param uselog: when True, return log(1 + prediction) per element
    :return: list/array of predictions
    """
    scaler = StandardScaler()
    # BUGFIX: converted Python 2 print statements to the print() function.
    print(X_train.shape)
    print(X_test.shape)
    X = scaler.fit_transform(X_train)
    test = scaler.transform(X_test)
    clf = KNeighborsRegressor(n_neighbors=550)
    clf.fit(X, y_train)
    result = clf.predict(test)
    if uselog:
        # BUGFIX: map() is lazy in Python 3; materialise the values.
        result = [math.log(1 + x) for x in result]
    return result
def knnPredictor(df):
    """Sweep k = 1..199 KNN regressors over the sampled train/test split,
    pick the k with the highest Pearson correlation on the test set, and
    scatter-plot measured vs. predicted values for that best k.
    """
    dataTrainX, dataTrainY, dataTestX, dataTestY = sample(df)
    corelationCoefficiantDictionary = {}
    corelationCoefficiantArray = []
    for k in range(1, 200):
        model = KNeighborsRegressor(n_neighbors=k)
        model.fit(dataTrainX, dataTrainY)
        predicted = model.predict(dataTestX)
        coeff = pearsonr(dataTestY, predicted)[0]
        corelationCoefficiantDictionary[k] = coeff
        corelationCoefficiantArray.append(coeff)
    # Refit with the k that maximised the correlation coefficient.
    bestK = max(corelationCoefficiantDictionary,
                key=corelationCoefficiantDictionary.get)
    bestModel = KNeighborsRegressor(n_neighbors=bestK)
    bestModel.fit(dataTrainX, dataTrainY)
    print("K = ")
    print(bestK)
    print("Corelation Coeff:")
    print(corelationCoefficiantDictionary[bestK])
    knnpredictedBest = bestModel.predict(dataTestX)
    fig, ax = plotter.subplots()
    print(pearsonr(dataTestY, knnpredictedBest)[0])
    ax.set_ylabel('Predicted KNN Weekly')
    ax.scatter(dataTestY, knnpredictedBest)
    ax.set_xlabel('Measured')
    plotter.show()
def k_nearest_neighbours():
    """Load the bond-changes ARFF data, integer-encode the string targets,
    and print a KNN regressor's score on a 90/10 train/test split."""
    filepath = "bondchanges.arff"
    all_data = arff_read_to_array(filepath)
    X_data = all_data["data"]
    Y_data = all_data["target"]
    # Map each distinct target prefix to a small integer label.
    Y_data_map = {}
    new_Y_data = np.array([])
    i = 1  # BUGFIX: `01` literals are a SyntaxError in Python 3
    for index, data in enumerate(Y_data):
        data1 = data.split('_')[0]
        split_data = ".".join(data1.split('.')[:1])
        if split_data not in Y_data_map:
            Y_data_map[split_data] = i
            i += 1
            print(split_data)
        new_Y_data = np.append(new_Y_data, [Y_data_map[split_data]], 0)
    # Create the train/test split.
    # BUGFIX: sequence indices must be integers; 0.9 * len(...) is a float.
    cutoff = int(0.9 * len(X_data))
    X_training = X_data[:cutoff]
    Y_training = new_Y_data[:cutoff]
    print(X_training)
    print()
    print(Y_training)
    X_test = X_data[cutoff:]
    Y_test = new_Y_data[cutoff:]
    # svc = svm.SVC(C=1, kernel='')
    knn = KNeighborsClassifier()  # NOTE: constructed but unused (kept as-is)
    knnr = KNeighborsRegressor(n_neighbors=20000)
    print(knnr.fit(X_training, Y_training).score(X_test, Y_test))
def __init__(self, dataFrame):
    """Build per-day KNN features from a price DataFrame, normalise them,
    fit a distance-weighted 3-NN regressor on rows 100:400, and print
    de-normalised predictions vs. actuals for rows 400:450.

    Row 0 of each derived column is dropped because the derived 'Output'
    (High Price - Prev Close) is undefined for the first row.
    """
    self.dataFrameKNN = {}
    # Per-feature scaling weights used by normalizeKNNModel.
    self.KNNWeightage = {'Avg-High Ratio': 100, 'Avg-Low Ratio': 100,
                         'Deliverable Qty': 300, 'Turnover': 100,
                         'Growth': 150, 'Trend': 100, 'Output': 100}
    self.valid = True
    self.KNNModelHash = {}
    self.dataFrameKNN = pd.DataFrame()
    self.dataFrameKNN['Avg-High Ratio'] = dataFrame['High Price'][1:] - dataFrame['Average Price'][1:]
    self.dataFrameKNN['Avg-Low Ratio'] = dataFrame['Average Price'][1:] - dataFrame['Low Price'][1:]
    self.dataFrameKNN['Deliverable Qty'] = dataFrame['Deliverable Qty'][1:]
    self.dataFrameKNN['Turnover'] = dataFrame['Turnover in Lacs'][1:]
    self.dataFrameKNN['Growth'] = dataFrame['Close Price'][1:] - dataFrame['Prev Close'][1:]
    self.dataFrameKNN['Trend'] = dataFrame['Turnover in Lacs'][1:]
    self.dataFrameKNN['Output'] = dataFrame['High Price'][1:] - dataFrame['Prev Close'][1:]
    # Remember the raw output distribution so predictions can be de-normalised.
    self.KNNModelHash['mean'] = self.dataFrameKNN['Output'].mean()
    self.KNNModelHash['std'] = self.dataFrameKNN['Output'].std()
    for key in self.dataFrameKNN:
        self.normalizeKNNModel(key)
    # trainData has the data to be trained, but the last row is the testData.
    trainData = self.dataFrameKNN[['Avg-High Ratio', 'Avg-Low Ratio', 'Deliverable Qty', 'Growth']][:-1].values
    testData = self.dataFrameKNN[['Avg-High Ratio', 'Avg-Low Ratio', 'Deliverable Qty', 'Growth']][-1:].values
    # trainOutput contains the output corresponding to trainData, but the
    # first one is garbage.
    trainOutput = self.dataFrameKNN['Output'][1:].values
    KNNModel = KNeighborsRegressor(n_neighbors=3, weights='distance')
    KNNModel.fit(trainData[100:400], trainOutput[100:400])
    prediction = KNNModel.predict(trainData[400:450])
    weightage = self.KNNWeightage['Output']
    for i in range(50):
        # De-normalise both prediction and actual back to price units.
        prediction[i] = ((prediction[i] * self.KNNModelHash['std']) + self.KNNModelHash['mean']) / weightage
        trainOutput[400 + i] = ((trainOutput[400 + i] * self.KNNModelHash['std']) + self.KNNModelHash['mean']) / weightage
        # BUGFIX: converted the Python 2 print statement to print().
        print("%-40s %-40s " % (prediction[i], trainOutput[400 + i]))
def smooth(self, X, y):
    """Smooth y over the 1-D feature X with a 20-NN regressor.

    Returns the regressor's fitted values evaluated at the training points.
    """
    regressor = KNeighborsRegressor(n_neighbors=20)
    reshaped = X.reshape(-1, 1)  # sklearn needs a 2-D feature matrix
    regressor.fit(reshaped, y)
    return regressor.predict(reshaped)
def predictKnn(data, priceToPredict):
    """Sweep k = 1..99 KNN regressors mapping opening to closing prices,
    plot the per-k Pearson correlation, then predict `priceToPredict`'s
    closing price with the best k.
    """
    correlationByK = {}
    correlationSeries = []
    openingPriceTrain, openingPriceTest, closingPriceTrain, closingPriceTest = (
        data["openingPriceTrain"], data["openingPriceTest"],
        data["closingPriceTrain"], data["closingPriceTest"])
    for k in range(1, 100):
        model = KNeighborsRegressor(n_neighbors=k)
        model.fit(openingPriceTrain, closingPriceTrain)
        actual = np.reshape(closingPriceTest, -1)
        predicted = np.reshape(model.predict(openingPriceTest), -1)
        coeff = pearsonr(actual, predicted)[0]
        correlationByK[k] = coeff
        correlationSeries.append(coeff)
    plotter.plot(correlationSeries)
    # Refit with the k that maximised the correlation coefficient.
    bestK = max(correlationByK, key=correlationByK.get)
    bestModel = KNeighborsRegressor(n_neighbors=bestK)
    bestModel.fit(openingPriceTrain, closingPriceTrain)
    print("K = ")
    print(bestK)
    print(bestModel.predict(np.array([priceToPredict])))
def Round2(X, y):
    """Search over n_neighbors values with 5-fold CV and return the best.

    For each candidate neighbourhood size, computes the per-fold RMSE of a
    KNN regressor; returns a dict with keys 'neighbor' (best n_neighbors)
    and 'scores' (its per-fold RMSE list).
    """
    min_score = {}
    for neigh in [5, 10, 20, 50, 100, 200, 500, 1000, 2000, 5000]:
        model = KNeighborsRegressor(n_neighbors=neigh)
        scores = []
        # BUGFIX: KFold(n, n_folds=...) was removed in scikit-learn 0.20;
        # the modern API takes n_splits and yields indices via split().
        kf = KFold(n_splits=5, shuffle=True)
        # RMSE on the held-out fold, for each fold.
        for train_idx, test_idx in kf.split(X):
            X_train, X_test = X[train_idx], X[test_idx]
            y_train, y_test = y[train_idx], y[test_idx]
            model.fit(X_train, y_train)
            prediction = model.predict(X_test)
            rmse = np.sqrt(mean_squared_error(y_test, prediction))
            scores.append(rmse)
        # Keep the candidate with the lowest mean RMSE so far.
        if len(min_score) == 0 or np.mean(scores) < np.mean(min_score['scores']):
            min_score['neighbor'] = neigh
            min_score['scores'] = scores
        # BUGFIX: converted Python 2 print statements to print().
        print("Neighbors:", neigh)
        print(scores)
        print(np.mean(scores))
    return min_score
def run_network(mdl=None, data=None):
    """Fit (or reuse) a distance-weighted 5-NN regressor on traffic data.

    When `data` is None the train/test split is loaded via
    train_test_traffic_data; when `mdl` is None a fresh regressor is built.
    Returns (model, y_test, predictions) -- predictions is 0 if training was
    interrupted from the keyboard.
    """
    start = time.time()
    seq_len = 10
    if data is None:
        print('Loading data... ')
        X_train, y_train, X_test, y_test = train_test_traffic_data(15773, seq_len)
    else:
        X_train, y_train, X_test, y_test = data
    print('\nData Loaded...\n')
    model = KNeighborsRegressor(5, weights='distance') if mdl is None else mdl
    try:
        model.fit(X_train, y_train)
        predictions = model.predict(X_test)
    except KeyboardInterrupt:
        print('Training duration (s) : ', time.time() - start)
        return model, y_test, 0
    print('Training duration (s) : ', time.time() - start)
    return model, y_test, predictions
def fit(self, start_date, end_date):
    """Grid-search n_neighbors per ticker and keep the lowest-MSE model.

    Splits [start_date, end_date] into a training and a cross-validation
    window, fits one KNN regressor per grid point on each stock's training
    data, and stores the best model in self.models[ticker]. Returns self.
    """
    for ticker in self.tickers:
        self.stocks[ticker] = Stock(ticker)
    params_svr = [{'n_neighbors': [2, 5, 10, 15]}]
    params = ParameterGrid(params_svr)
    # Find the split between training and CV windows.
    mid_date = train_test_split(start_date, end_date)
    for ticker, stock in self.stocks.items():
        X_train, y_train = stock.get_data(start_date, mid_date, fit=True)
        X_cv, y_cv = stock.get_data(mid_date, end_date)
        lowest_mse = np.inf
        for i, param in enumerate(params):
            knn = KNeighborsRegressor(**param)
            knn.fit(X_train.values, y_train.values)
            mse = mean_squared_error(y_cv, knn.predict(X_cv.values))
            if mse <= lowest_mse:
                # BUGFIX: lowest_mse was never updated, so every candidate
                # passed the comparison and the *last* grid entry always won.
                lowest_mse = mse
                self.models[ticker] = knn
    return self
def train(self, x, y, param_names, random_search=100, **kwargs):
    """Preprocess x, random-search n_neighbors, and fit the KNN model.

    :param x: raw training inputs
    :param y: training targets
    :param param_names: names of the parameters represented by x's columns
    :param random_search: number of random-search iterations
    :return: training duration in seconds
    """
    start = time.time()
    scaled_x = self._set_and_preprocess(x=x, param_names=param_names)
    # Check that each input is between 0 and 1.
    self._check_scaling(scaled_x=scaled_x)
    if self._debug:
        # BUGFIX: converted Python 2 print statements to print().
        print("Shape of training data: ", scaled_x.shape)
        print("Param names: ", self._used_param_names)
        print("First training sample\n", scaled_x[0])
        print("Encode: ", self._encode)
    # BUGFIX: honour the random_search argument (it was ignored and 100 was
    # hard-coded).
    n_neighbors = self._random_search(random_iter=random_search,
                                      x=scaled_x, y=y)
    # Train the final model with the chosen neighbourhood size.
    knn = KNeighborsRegressor(n_neighbors=n_neighbors, weights='uniform',
                              algorithm='auto', leaf_size=30, p=2,
                              metric='minkowski')
    knn.fit(scaled_x, y)
    self._model = knn
    duration = time.time() - start
    self._training_finished = True
    return duration
class ModelNNReg(ScikitPredictor):
    """Nearest-neighbour regression predictor built on scikit-learn."""

    def generate_model(self):
        """Instantiate the regressor, forwarding any configured kwargs."""
        self.model = KNeighborsRegressor(**self.model_kwargs)

    def fit_model(self, x, y):
        """Fit the underlying regressor on (x, y)."""
        self.model.fit(x, y)
def predictDayType(self, week, day):
    """Predict the day type for (week, day) with a 5-NN regressor fit on
    self.rawData / self.dayType.

    Returns the regressor's prediction array (one element).
    """
    knn = KNeighborsRegressor(n_neighbors=5)
    knn.fit(self.rawData, self.dayType)
    # BUGFIX: predict() requires a 2-D (n_samples, n_features) array;
    # the original passed the 1-D vector [week, day].
    X = np.array([[week, day]])
    predictions = knn.predict(X)
    return predictions
def nnVerify_2(city_data, x, y):
    """Fit SKLearn's 2-NN regressor on the city data and print its
    prediction for x.

    `y` is unused; kept for signature compatibility with the caller.
    """
    X, Y = city_data.data, city_data.target
    model = KNeighborsRegressor(n_neighbors=2)
    model.fit(X, Y)
    prediction = model.predict(x)
    print("KNeighborsRegressor")
    print("Y pred(KNN) : ", prediction)
def calculateKNearestNeighborsModel(data, numberOfNeighbors):
    """Fit a k-NN regressor predicting `quality` from the first 11 columns.

    Returns the fitted regressor.
    """
    # Inputs: the first 11 columns; output: the quality column.
    feature_matrix = np.array(data.iloc[0:, 0:11])
    target = np.array(data.quality)
    model = KNeighborsRegressor(n_neighbors=numberOfNeighbors)
    model.fit(feature_matrix, target)
    return model
def main():
    """Colour transfer between two images via optimal transport plus KNN.

    Reads the source/target images, optionally works in HSV space, computes
    an optimal-transport plan between subsampled colour clouds, models the
    transfer map with a KNN regressor, and writes the recoloured image.
    """
    # Read the images, scaled to [0, 1).
    image_from = io.imread(name_from) / 256
    image_to = io.imread(name_to) / 256
    # Change to hsv domain (if requested).
    if args.use_hsv:
        image_from[:] = rgb2hsv(image_from)
        image_to[:] = rgb2hsv(image_to)
    # Get shapes.
    shape_from = image_from.shape
    shape_to = image_to.shape
    # Flatten to (n_pixels, channels) matrices.
    X_from = im2mat(image_from)
    X_to = im2mat(image_to)
    # Number of pixels.
    n_pixels_from = X_from.shape[0]
    n_pixels_to = X_to.shape[0]
    # Subsample to keep optimal transport tractable.
    X_from_ss = X_from[np.random.randint(0, n_pixels_from - 1, n_pixels), :]
    X_to_ss = X_to[np.random.randint(0, n_pixels_to - 1, n_pixels), :]
    if save_col_distribution:
        import matplotlib.pyplot as plt
        import seaborn as sns
        sns.set_style('white')
        fig, axes = plt.subplots(nrows=2, figsize=(5, 10))
        for ax, X in zip(axes, [X_from_ss, X_to_ss]):
            ax.scatter(X[:, 0], X[:, 1], color=X)
            # BUGFIX: matplotlib Axes has no set_xhsvel/set_yhsvel; the
            # intended calls are set_xlabel/set_ylabel.
            if args.use_hsv:
                ax.set_xlabel('hue')
                ax.set_ylabel('value')
            else:
                ax.set_xlabel('red')
                ax.set_ylabel('green')
        axes[0].set_title('distr. from')
        axes[1].set_title('distr. to')
        fig.tight_layout()
        fig.savefig('color_distributions.png')
    # Optimal transportation between the subsampled colour clouds.
    ot_color = OptimalTransport(X_to_ss, X_from_ss, lam=lam,
                                distance_metric=distance_metric)
    # Model the transfer map with KNN regression, then apply to every pixel.
    transfer_model = KNeighborsRegressor(n_neighbors=n_neighbors)
    transfer_model.fit(X_to_ss, n_pixels * ot_color.P @ X_from_ss)
    X_transfered = transfer_model.predict(X_to)
    image_transferd = minmax(mat2im(X_transfered, shape_to))
    if args.use_hsv:
        image_transferd[:] = hsv2rgb(image_transferd)
    io.imsave(name_out, image_transferd)
class Knn(ContextEngineBase):
    """k-NN regression context engine wrapping scikit-learn's
    KNeighborsRegressor, configured from an application fields dict."""

    # Predictions produced by the most recent execute() call.
    y_Test = np.empty([0])
    # KNeighborsRegressor instance; built in __init__.
    knnRegressor = None

    def __init__(self, numInputs, outputClassifier, inputClassifiers, appFieldsDict):
        """Configure the regressor from appFieldsDict
        (keys: n_neighbors, weights, algorithm, n_jobs)."""
        ContextEngineBase.__init__(self, numInputs, outputClassifier,
                                   inputClassifiers, appFieldsDict)
        # Passed parameters
        self.n_neighbors = appFieldsDict['n_neighbors']
        self.weights = appFieldsDict['weights']
        self.algorithm = appFieldsDict['algorithm']
        self.n_jobs = appFieldsDict['n_jobs']
        # Defining a Knn object with given parameters
        self.knnRegressor = KNeighborsRegressor(n_neighbors=self.n_neighbors,
                                                weights=self.weights,
                                                algorithm=self.algorithm,
                                                n_jobs=self.n_jobs)

    # Add a set of training observations, with the newInputObsMatrix being a
    # matrix of doubles, where the row magnitude must match the number of inputs,
    # and the column magnitude must match the number of observations,
    # and newOutputVector being a column vector of doubles.
    def addBatchObservations(self, newInputObsMatrix, newOutputVector):
        """Feed each row of the batch to addSingleObservation after checking
        the matrix is 2-D with numInputs columns and one output per row."""
        if (len(newInputObsMatrix.shape) == 2
                and newInputObsMatrix.shape[1] == self.numInputs
                and newOutputVector.shape[0] == newInputObsMatrix.shape[0]):
            # print("All good!")
            newOutputVector = newOutputVector.ravel()
            i = 0
            for newInputVector in newInputObsMatrix:
                newOutputValue = newOutputVector[i]
                self.addSingleObservation(newInputVector, newOutputValue)
                i += 1
        else:
            print("Wrong dimensions!")

    # Train the coefficients on the existing observation matrix if there are
    # enough observations.
    def train(self):
        """Fit the regressor on the accumulated observations.

        Returns True when at least one observation exists, else False.
        """
        if (self.numObservations > 0):
            # print("Training started")
            self.knnRegressor.fit(self.observationMatrix, self.outputVector)
            return True
        else:
            print("Not enough observations to train!")
            return False

    # Execute the trained matrix against the given input observation.
    # inputObsVector is a row vector of doubles.
    def execute(self, inputObsVector):
        """Predict the output for one observation.

        Returns the scalar prediction, or None when the input length does not
        match numInputs.
        """
        if (len(inputObsVector) == self.numInputs):
            # print("Begin execute")
            # x_Test = np.vstack((self.x_Test,inputObsVector))
            x_Test = np.reshape(inputObsVector, (1, self.numInputs))
            self.y_Test = self.knnRegressor.predict(x_Test)
            return self.y_Test[0]
        else:
            print("Wrong dimensions, fail to execute")
            return None
def knn_regressor(features, solutions, verbose=0):
    """Train a distance-weighted 5-NN regressor on the full feature matrix.

    `verbose` is accepted for interface compatibility but unused.
    Returns a (fitted model, solution column index) pair.
    """
    columns = solutions.columns
    model = KNeighborsRegressor(n_neighbors=5, weights='distance')
    print('Training Model... ')
    model.fit(features, solutions)
    print('Done Training')
    return (model, columns)
def impute_KNN(df, var, features, k):
    """Impute missing values of column `var` via k-NN regression on `features`.

    Rows where `var` is present train the imputer; rows where it is missing
    receive the predicted value. Returns a new DataFrame with both groups.
    """
    import pandas as pd  # local import keeps this fix self-contained

    var_imputer = KNeighborsRegressor(n_neighbors=k)
    # BUGFIX: .copy() avoids pandas SettingWithCopy issues when assigning to
    # df_null below (the original assigned into a slice view).
    df_full = df[df[var].notnull()].copy()
    df_null = df[df[var].isnull()].copy()
    var_imputer.fit(df_full[features], df_full[var])
    df_null[var] = var_imputer.predict(df_null[features])
    # BUGFIX: DataFrame.append was removed in pandas 2.0; concat replaces it.
    return pd.concat([df_full, df_null])
class kNN():
    '''
    kNN regressor over a fixed-size ring buffer of (x, y) examples
    -------------
    Stores at most n examples; update() overwrites the oldest slot and
    refits a distance-weighted KNeighborsRegressor after every insert.
    '''
    def __init__(self, N_i, N_o, k=5, n=20):
        # note: N_o=1 assumed for now
        self.N_i = N_i
        self.n = n
        self.i = 0          # next write position in the ring buffer
        self.k = k
        self.X = zeros((self.n, N_i))
        self.y = zeros((self.n))
        self.h = KNeighborsRegressor(n_neighbors=k, weights='distance')
        self.c = 0          # number of stored examples (caps at n)

    def predict(self, x):
        '''
        Predict
        --------------
        Falls back to the nearest stored label while fewer than k examples
        exist; returns 0.0 (with a warning) when the buffer is empty.
        '''
        if self.c < 1.:
            # BUGFIX: converted the Python 2 print statement to print().
            print("[Warning!] No training examples!")
            return 0.0
        elif self.c <= self.k:
            dist, ind = self.h.kneighbors(self.X[0:self.c], n_neighbors=1)
            i_max = argmax(ind)
            return self.y[i_max]
        return self.h.predict(x)

    def update(self, x, y):
        '''
        Update
        --------------
        Insert (x, y) into the ring buffer and refit on the stored examples.
        '''
        self.X[self.i, :] = x
        self.y[self.i] = y
        self.i = (self.i + 1) % self.n
        if self.c < self.n:
            self.c = self.c + 1
        self.h.fit(self.X[0:self.c, :], self.y[0:self.c])
def nearest_neighbors_impute(df, coordinate_columns, data_columns, knr_params={}):
    """Fill missing values of each data column in place by k-NN regression
    on the coordinate columns.

    NOTE: the mutable default ``knr_params={}`` is kept for interface
    compatibility; it is only read, never mutated here.
    """
    from sklearn.neighbors import KNeighborsRegressor
    for column in data_columns:
        known = df[column].notnull()
        missing = ~known
        # Nothing to impute for this column.
        if missing.sum() == 0:
            continue
        model = KNeighborsRegressor(**knr_params)
        model.fit(df.loc[known, coordinate_columns], df.loc[known, [column]])
        df.loc[missing, [column]] = model.predict(df.loc[missing, coordinate_columns])
def addJKRegionLabels(self):
    """Assign jackknife region labels: KMeans clusters the data coordinates,
    then a 1-NN regressor propagates those labels to the random points."""
    # BUGFIX: zip() is a lazy iterator in Python 3; scikit-learn needs an
    # indexable sequence/array, so materialise the coordinate pairs as lists.
    data = list(zip(self.data['RA'], self.data['DEC']))
    randoms = list(zip(self.randoms['RA'], self.randoms['DEC']))
    finder = KMeans(n_clusters=self.config['n_jackknife'])
    self.data_jk_indices = finder.fit_predict(data)
    nbrs = KNeighborsRegressor(n_neighbors=1)
    nbrs.fit(data, self.data_jk_indices)
    self.random_jk_indices = nbrs.predict(randoms)
def knn(X, Y):
    """Fit a default KNN regressor and search [0,1]^5 for the input that
    maximises its prediction.

    Returns {'X_min': the optimising point, 'score': training-set R^2}.
    """
    model = KNeighborsRegressor()
    model.fit(X, Y)

    def objective(point):
        # Negate so differential_evolution's minimisation maximises the prediction.
        return -1 * model.predict([point])

    bounds = ((0, 1),) * 5
    best = differential_evolution(objective, bounds)
    return {'X_min': list(best.x), 'score': model.score(X, Y)}
'''------------------------------------------------------------------------------------------------------
------------------------------------------------------------------------------------------------------'''
'''fit knn to the model'''
from sklearn.neighbors import KNeighborsRegressor
# KNeighborsRegressor here parallels the earlier linear-regression section.
knn = KNeighborsRegressor()

'''find the optimal parameters in KNN'''
# Search grid: neighbourhood size, vote weighting, and Minkowski power
# (p=1 Manhattan, p=2 Euclidean).
param_dict = {
    'n_neighbors': [5, 10, 15],
    'weights': ['uniform', 'distance'],
    'p': [1, 2]
}
from sklearn.model_selection import GridSearchCV
knn = GridSearchCV(knn, param_dict)
knn.fit(X_train, y_train)
# Bare expressions: these only display output in a notebook/REPL session.
knn.best_params_
knn.best_score_

'''refit knn to the model with optimal parameters'''
# NOTE(review): these values look hard-coded from a previous grid-search run;
# confirm they still match knn.best_params_ for the current data.
knn = KNeighborsRegressor(n_neighbors=15, p=1, weights='uniform')
knn.fit(X_train, y_train)
# predictions for test
y_pred2 = knn.predict(X_test)

'''R2 and adjusted R2, and rmse'''
r2 = knn.score(X_test, y_test)
print(r2)
# NOTE(review): `i` (presumably the number of predictors) is not defined in
# this chunk -- confirm it is assigned earlier in the file.
adj_r2 = 1 - ((len(X_test) - 1) / (len(X_test) - i - 1)) * (1 - knn.score(X_test, y_test))
# Reorient so samples lie along axis 0.
data = data.swapaxes(0, 1)
y.shape = (1, -1)
y = y.swapaxes(0, 1)
print('sample dataset is generated')
X = data
# Replace NaNs with zeros so the regressor accepts the arrays.
X[isnan(X)] = 0
y[isnan(y)] = 0
# Flatten the prediction cube to (n_features, n_voxels) then transpose.
Xp = cube2[:].reshape(cube2.shape[0], -1).swapaxes(0, 1)
from sklearn.neighbors import KNeighborsRegressor
# Distance-weighted KNN regression from X to y, predicted on every voxel.
estimator = KNeighborsRegressor(weights='distance')
estimator.fit(X, y)
y_pred = estimator.predict(Xp)
# Masked 128x128 map of the prediction, thresholded at 5e3.
dvf = ma.array(y_pred).reshape(128, 128, 1)
dvf.mask = dvf < 5e3
figure()
subplot(121)
imshow(dvf[:, :, 0], cmap='seismic', vmin=dvf.min(), vmax=dvf.max())
contour(cube.max(0), levels=linspace(0.2, 40, 12))
subplot(122)
# NOTE(review): dvf was reshaped to (128, 128, 1), so dvf[:, :, 1] indexes a
# second channel that does not exist -- confirm the intended reshape/depth.
imshow(dvf[:, :, 1], cmap='seismic', vmin=dvf.min(), vmax=dvf.max())
contour(cube.max(0), levels=linspace(0.2, 40, 12))
y = y.reshape(y.shape)
B2 = linspace(0, 37, n_grid)
import numpy as np
from sklearn.svm import SVR
from sklearn.decomposition import PCA
from sklearn.preprocessing import RobustScaler
from sklearn.pipeline import make_pipeline

# Load the Mercedes train/test splits; keep the target and test IDs aside.
train = pd.read_csv('mercedes_train.csv')
test = pd.read_csv('mercedes_test.csv')
y_train = train['y'].values
id_test = test['ID']

# Concatenate both splits so categorical dummies stay aligned.
num_train = len(train)
df_all = pd.concat([train, test])
df_all.drop(['ID', 'y'], axis=1, inplace=True)
# One-hot encoding of categorical/string columns.
df_all = pd.get_dummies(df_all, drop_first=True)
train = df_all[:num_train]
test = df_all[num_train:]

from sklearn.neighbors import KNeighborsRegressor

# Fit a 5-NN regressor and write the submission file.
knn = KNeighborsRegressor(n_neighbors=5)
knn.fit(train, y_train)
df_sub = pd.DataFrame({'ID': id_test, 'y': knn.predict(test)})
df_sub.to_csv('mercedes-submission.csv', index=False)
#y_train = ss_y.fit_transform(y_train) #y_test = ss_y.fit_transform(y_test) # 从sklearn.neighbors导入KNeighborRegressor(K近邻回归器)。 from sklearn.neighbors import KNeighborsRegressor mse1 = [] mse2 = [] mae1 = [] mae2 = [] r21 = [] r22 = [] for i in range(4, 40): n_neighbors = i # 初始化K近邻回归器,并且调整配置,使得预测的方式为平均回归:weights='uniform'。 uni_knr = KNeighborsRegressor(weights='uniform', n_neighbors=n_neighbors) uni_knr.fit(X_train, y_train) uni_knr_y_predict = uni_knr.predict(X_test) # 初始化K近邻回归器,并且调整配置,使得预测的方式为根据距离加权回归:weights='distance'。 dis_knr = KNeighborsRegressor(weights='distance', n_neighbors=n_neighbors) dis_knr.fit(X_train, y_train) dis_knr_y_predict = dis_knr.predict(X_test) from sklearn.metrics import mean_absolute_error, mean_squared_error # 使用R-squared、MSE以及MAE三种指标对平均回归配置的K近邻模型在测试集上进行性能评估。 mse1.append(mean_squared_error(y_test, uni_knr_y_predict)) mse2.append(mean_squared_error(y_test, dis_knr_y_predict)) print('R-squared value of uniform-weighted KNeighorRegression:', uni_knr.score(X_test, y_test)) #此处是R^2 print('The mean squared error of uniform-weighted KNeighorRegression:', mean_squared_error(y_test, uni_knr_y_predict))
sdpval = str(float(sd[1]))
# Append this gene's SVR-RBF correlation stats to the results file.
open("/home/paul/mesa_models/python_ml_models/results/" + train_pop + "_2_" + pop + "_svr_rbf_cor_test_chr" + str(chrom) + ".txt", "a").write(gene + "\t" + gene_name + "\t" + pacoef + "\t" + papval + "\t" + pbcoef + "\t" + pbpval + "\t" + sccoef + "\t" + scpval + "\t" + sdcoef + "\t" + sdpval + "\n")
# KNN
# knn_t0 = time.time()  # time it
# knn_cv = str(float(mean(cross_val_score(knn, cis_gt, adj_exp.ravel(), cv=5))))
# knn_t1 = time.time()
# knn_tt = str(float(knn_t1 - knn_t0))
knn.fit(cis_gt, adj_exp.ravel())
ypred = knn.predict(test_cis_gt)
# Prepare ypred for writing out to a file.
# BUGFIX: this was `yprep_pd = pd.DataFrame(ypred)` while every following
# statement referenced `ypred_pd`, raising NameError at runtime.
ypred_pd = pd.DataFrame(ypred)
ypred_pd.columns = gg
ypred_pd.index = test_ids
ypred_frame_knn = pd.concat([ypred_frame_knn, ypred_pd], axis=1, sort=True)
# Pearson correlations of the predictions against both expression measures.
pa = stats.pearsonr(test_adj_exp, ypred)
pacoef = str(float(pa[0]))
papval = str(float(pa[1]))
pb = stats.pearsonr(test_yobs, ypred)
from open3d import read_point_cloud, write_point_cloud, Vector3dVector
from sklearn.neighbors import KNeighborsRegressor
from numpy import array, concatenate

# Colour correction: transfer calibrated colours onto the 'tree.ply' cloud.
cloud = read_point_cloud('tree.ply')
calibrate = read_point_cloud('photo_test.ply')
# BUGFIX: pass `weights` by keyword -- scikit-learn >= 1.0 rejects extra
# positional arguments after n_neighbors.
neigh0 = KNeighborsRegressor(n_neighbors=4, weights='distance', n_jobs=-1)
neigh0.fit(calibrate.points, calibrate.colors)
arr_points = array(cloud.points)
arr_colors = array(cloud.colors)
# Weight flagging "abnormal" points: normal-x gated by a blue-channel threshold.
arr_filter = array(cloud.normals)[:, 0] * (arr_colors[:, 2] > 0.5)
# NOTE(review): `True - arr_filter` relies on bool->number coercion
# (computing 1 - arr_filter for a float array); confirm arr_filter's dtype.
points_other = arr_points.compress(True - arr_filter, 0)
colors_other = arr_colors.compress(True - arr_filter, 0)
neigh1 = KNeighborsRegressor(n_neighbors=1, n_jobs=-1)
neigh1.fit(points_other, colors_other)
points_abn = arr_points.compress(arr_filter, 0)
# Blend the calibration-based and neighbour-based colour estimates.
colors_abn = (neigh0.predict(points_abn) + neigh1.predict(points_abn)) / 2
cloud.points = Vector3dVector(concatenate((points_other, points_abn)))
cloud.colors = Vector3dVector(concatenate((colors_other, colors_abn)))
cloud.normals = Vector3dVector()
write_point_cloud('corr.ply', cloud)
def task2(data):
    """Fit ten regression models on engineered OHLCV features and forecast
    'Adj Close' one horizon (1% of the series length) ahead.

    Returns a tuple of: the date index formatted as 'YYYY-MM-DD' strings,
    the actual 'Adj Close' series, and one forecast series per model
    (reg, pol2, pol3, knn, las, byr, lar, omp, ard, sgd).
    """
    df = data
    dfreg = df.loc[:, ['Adj Close', 'Volume']]
    # Engineered features: intraday spread and daily percentage change.
    dfreg['HL_PCT'] = (df['High'] - df['Low']) / df['Close'] * 100.0
    dfreg['PCT_change'] = (df['Close'] - df['Open']) / df['Open'] * 100.0
    # Drop missing values.
    dfreg.fillna(value=-99999, inplace=True)
    # We want to separate 1 percent of the data to forecast.
    forecast_out = int(math.ceil(0.01 * len(dfreg)))
    # Separating the label here; we want to predict the AdjClose.
    forecast_col = 'Adj Close'
    dfreg['label'] = dfreg[forecast_col].shift(-forecast_out)
    # Explicit axis keyword (positional `axis` was deprecated in pandas).
    X = np.array(dfreg.drop(['label'], axis=1))
    # Scale X so every feature shares the same distribution for the linear models.
    X = preprocessing.scale(X)
    # X_lately: the trailing rows that have no label yet -- these get forecast.
    X_lately = X[-forecast_out:]
    X = X[:-forecast_out]
    y = np.array(dfreg['label'])
    y = y[:-forecast_out]
    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=0)

    # --- Fit every model on the training split ------------------------------
    clfreg = LinearRegression(n_jobs=-1)
    clfreg.fit(X_train, y_train)
    clfpoly2 = make_pipeline(PolynomialFeatures(2), Ridge())
    clfpoly2.fit(X_train, y_train)
    clfpoly3 = make_pipeline(PolynomialFeatures(3), Ridge())
    clfpoly3.fit(X_train, y_train)
    clfknn = KNeighborsRegressor(n_neighbors=2)
    clfknn.fit(X_train, y_train)
    clflas = Lasso()
    clflas.fit(X_train, y_train)
    clfbyr = BayesianRidge()
    clfbyr.fit(X_train, y_train)
    clflar = LassoLars(alpha=.1)
    clflar.fit(X_train, y_train)
    clfomp = OrthogonalMatchingPursuit(n_nonzero_coefs=2)
    clfomp.fit(X_train, y_train)
    clfard = ARDRegression(compute_score=True)
    clfard.fit(X_train, y_train)
    clfsgd = SGDRegressor(random_state=0, max_iter=1000, tol=1e-3)
    clfsgd.fit(X_train, y_train)

    # --- Confidence (R^2) scores on the held-out split ----------------------
    confidencereg = clfreg.score(X_test, y_test)
    confidencepoly2 = clfpoly2.score(X_test, y_test)
    confidencepoly3 = clfpoly3.score(X_test, y_test)
    confidenceknn = clfknn.score(X_test, y_test)
    confidencelas = clflas.score(X_test, y_test)
    confidencebyr = clfbyr.score(X_test, y_test)
    confidencelar = clflar.score(X_test, y_test)
    confidenceomp = clfomp.score(X_test, y_test)
    confidenceard = clfard.score(X_test, y_test)
    confidencesgd = clfsgd.score(X_test, y_test)
    print('The linear regression confidence is:', confidencereg * 100)
    print('The quadratic regression 2 confidence is:', confidencepoly2 * 100)
    print('The quadratic regression 3 confidence is:', confidencepoly3 * 100)
    print('The knn regression confidence is:', confidenceknn * 100)
    print('The lasso regression confidence is:', confidencelas * 100)
    print('The Bayesian Ridge regression confidence is:', confidencebyr * 100)
    print('The Lasso LARS regression confidence is:', confidencelar * 100)
    print('The OMP regression confidence is:', confidenceomp * 100)
    print('The ARD regression confidence is:', confidenceard * 100)
    print('The SGD regression confidence is:', confidencesgd * 100)

    # --- Forecast the unlabeled tail with every model -----------------------
    forecast_reg = clfreg.predict(X_lately)
    forecast_pol2 = clfpoly2.predict(X_lately)
    forecast_pol3 = clfpoly3.predict(X_lately)
    forecast_knn = clfknn.predict(X_lately)
    forecast_las = clflas.predict(X_lately)
    forecast_byr = clfbyr.predict(X_lately)
    forecast_lar = clflar.predict(X_lately)
    forecast_omp = clfomp.predict(X_lately)
    forecast_ard = clfard.predict(X_lately)
    forecast_sgd = clfsgd.predict(X_lately)

    # First model: append one new (otherwise-NaN) row per forecast day.
    dfreg['Forecast_reg'] = np.nan
    last_date = dfreg.iloc[-1].name
    next_unix = last_date + datetime.timedelta(days=1)
    for value in forecast_reg:
        next_date = next_unix
        next_unix += datetime.timedelta(days=1)
        dfreg.loc[next_date] = [np.nan for _ in range(len(dfreg.columns))]
        dfreg['Forecast_reg'].loc[next_date] = value

    def _write_forecast(column, values):
        """Write `values` into a fresh `column`, one per consecutive day,
        starting the day after the row 26 positions from the end (rows that
        were appended for Forecast_reg above).

        NOTE: the -26 anchor is hard-coded from the original script; it only
        lines up with the appended rows when forecast_out is 26.
        """
        dfreg[column] = np.nan
        day = dfreg.iloc[-26].name + datetime.timedelta(days=1)
        for v in values:
            # Chained assignment kept deliberately to match the original's
            # write semantics for these columns.
            dfreg[column].loc[day] = v
            day += datetime.timedelta(days=1)

    # The remaining nine models used identical copy-pasted loops; the helper
    # removes the duplication without changing any write.
    _write_forecast('Forecast_pol2', forecast_pol2)
    _write_forecast('Forecast_pol3', forecast_pol3)
    _write_forecast('Forecast_knn', forecast_knn)
    _write_forecast('Forecast_las', forecast_las)
    _write_forecast('Forecast_byr', forecast_byr)
    _write_forecast('Forecast_lar', forecast_lar)
    _write_forecast('Forecast_omp', forecast_omp)
    _write_forecast('Forecast_ard', forecast_ard)
    _write_forecast('Forecast_sgd', forecast_sgd)

    return (dfreg.index.format(formatter=lambda x: x.strftime('%Y-%m-%d')),
            dfreg['Adj Close'].to_list(),
            dfreg['Forecast_reg'].to_list(),
            dfreg['Forecast_pol2'].to_list(),
            dfreg['Forecast_pol3'].to_list(),
            dfreg['Forecast_knn'].to_list(),
            dfreg['Forecast_las'].to_list(),
            dfreg['Forecast_byr'].to_list(),
            dfreg['Forecast_lar'].to_list(),
            dfreg['Forecast_omp'].to_list(),
            dfreg['Forecast_ard'].to_list(),
            dfreg['Forecast_sgd'].to_list())
def regression_subset(predictions, train, test, method):
    """Evaluate a coordinate (longitude/latitude) regressor per building.

    For each of the three buildings, fits `method`'s regressor on the
    training rows labelled with that building, predicts coordinates for the
    test rows whose *predicted* building matches, and accumulates the mean
    geodesic error (metres) between predicted and true positions.

    Parameters
    ----------
    predictions : sequence of int
        Predicted building id (0, 1 or 2) for each test row.
    train, test : pandas.DataFrame
        Columns 0..518 are signal features, plus 'BUILDINGID',
        'LONGITUDE' and 'LATITUDE'.
    method : int
        1 -> distance-weighted 5-NN regressor, 2 -> MLP regressor.

    Returns
    -------
    float
        Mean of the per-building mean position errors, in metres.
    """
    mean_error = []
    if method == 1:
        machine_learn = KNeighborsRegressor(n_neighbors=5, weights='distance')
    elif method == 2:
        machine_learn = MLPRegressor(random_state=0)
    # The projection is loop-invariant, so build it once instead of once per
    # test sample as the original did.
    # NOTE(review): assumes all samples fall in UTM zone 23 south - confirm.
    myProj = Proj(
        "+proj=utm +zone=23K, +south +ellps=WGS84 +datum=WGS84 +units=m +no_defs"
    )
    # for each building
    for i in range(3):
        # select for training only buildings with that label (0, 1, or 2)
        new_train = train.loc[train['BUILDINGID'] == i]
        # positions of the test samples predicted to be in building i
        indexes = [x for x in range(len(predictions)) if predictions[x] == i]
        if indexes:  # if list is not empty
            # training: samples with building == i
            # BUG FIX: DataFrame.ix was deprecated in pandas 0.20 and removed
            # in 1.0; positional column slicing is .iloc.
            X_train = new_train.iloc[:, 0:519]
            Y_train = new_train[['LONGITUDE', 'LATITUDE']]
            machine_learn.fit(X_train, Y_train)
            # testing: samples with predicted building == i
            new_test = test.iloc[indexes, :]
            X_test = new_test.iloc[:, 0:519]
            Y_test = new_test[['LONGITUDE', 'LATITUDE']]
            # Turn into plain lists
            predicts_lon_lat = machine_learn.predict(X_test).tolist()
            Y_test = Y_test.values.tolist()
            distance = []
            for j in range(len(predicts_lon_lat)):
                # change the latitude and longitude unit (UTM -> lon/lat)
                lon_pred, lat_pred = myProj(predicts_lon_lat[j][0],
                                            predicts_lon_lat[j][1],
                                            inverse=True)
                lon_Y, lat_Y = myProj(Y_test[j][0], Y_test[j][1], inverse=True)
                # join in a unique list
                Y = [lon_Y, lat_Y]
                predict = [lon_pred, lat_pred]
                # The distance between the two positions is the error
                distance.append(vincenty(Y, predict).meters)
                # If you want to use haversine distance, uncomment the line below
                # print haversine(lon_Y, lat_Y, lon_pred, lat_pred)
            mean_error.append(np.mean(distance))
            # print(np.mean(distance))
    return np.mean(mean_error)
def train_knn(self, data, target, n_neighbors):
    """Fit and return a k-nearest-neighbours regressor on (data, target)."""
    regressor = KNeighborsRegressor(n_neighbors)
    # KNeighborsRegressor.fit returns the fitted estimator itself.
    return regressor.fit(data, target)
def test_knn_imputer_weight_distance(na):
    """KNNImputer with weights="distance" must match a hand-computed
    inverse-distance weighted average of the donor neighbours, across
    several missingness patterns.

    Parameters
    ----------
    na : the missing-value marker under test (e.g. np.nan).
    """
    # Single missing entry at row 1, column 0.
    X = np.array([
        [0, 0],
        [na, 2],
        [4, 3],
        [5, 6],
        [7, 7],
        [9, 8],
        [11, 10]
    ])

    # Test with "distance" weight: a KNeighborsRegressor trained on the
    # complete rows serves as an independent reference implementation.
    nn = KNeighborsRegressor(metric="euclidean", weights="distance")
    X_rows_idx = [0, 2, 3, 4, 5, 6]  # complete rows (row 1 holds the NaN)
    nn.fit(X[X_rows_idx, 1:], X[X_rows_idx, 0])
    knn_imputed_value = nn.predict(X[1:2, 1:])[0]

    # Manual calculation of the same imputation from raw distances.
    X_neighbors_idx = [0, 2, 3, 4, 5]
    dist = nan_euclidean_distances(X[1:2, :], X, missing_values=na)
    weights = 1 / dist[:, X_neighbors_idx].ravel()
    manual_imputed_value = np.average(X[X_neighbors_idx, 0], weights=weights)

    X_imputed_distance1 = np.array([
        [0, 0],
        [manual_imputed_value, 2],
        [4, 3],
        [5, 6],
        [7, 7],
        [9, 8],
        [11, 10]
    ])

    # NearestNeighbor calculation
    X_imputed_distance2 = np.array([
        [0, 0],
        [knn_imputed_value, 2],
        [4, 3],
        [5, 6],
        [7, 7],
        [9, 8],
        [11, 10]
    ])

    # The imputer must agree with both references.
    imputer = KNNImputer(weights="distance", missing_values=na)
    assert_allclose(imputer.fit_transform(X), X_imputed_distance1)
    assert_allclose(imputer.fit_transform(X), X_imputed_distance2)

    # Test with weights = "distance" and n_neighbors=2
    X = np.array([
        [na, 0, 0],
        [2, 1, 2],
        [3, 2, 3],
        [4, 5, 5],
    ])

    # neighbors are rows 1, 2, the nan_euclidean_distances are:
    # (the 3/2 factor rescales for the one missing coordinate)
    dist_0_1 = np.sqrt((3/2)*((1 - 0)**2 + (2 - 0)**2))
    dist_0_2 = np.sqrt((3/2)*((2 - 0)**2 + (3 - 0)**2))
    imputed_value = np.average([2, 3], weights=[1 / dist_0_1, 1 / dist_0_2])

    X_imputed = np.array([
        [imputed_value, 0, 0],
        [2, 1, 2],
        [3, 2, 3],
        [4, 5, 5],
    ])

    imputer = KNNImputer(n_neighbors=2, weights="distance", missing_values=na)
    assert_allclose(imputer.fit_transform(X), X_imputed)

    # Test with varying missingness patterns
    X = np.array([
        [1, 0, 0, 1],
        [0, na, 1, na],
        [1, 1, 1, na],
        [0, 1, 0, 0],
        [0, 0, 0, 0],
        [1, 0, 1, 1],
        [10, 10, 10, 10],
    ])

    # Get weights of donor neighbors (rNcM = row N, column M of X).
    dist = nan_euclidean_distances(X, missing_values=na)
    r1c1_nbor_dists = dist[1, [0, 2, 3, 4, 5]]
    r1c3_nbor_dists = dist[1, [0, 3, 4, 5, 6]]
    r1c1_nbor_wt = 1 / r1c1_nbor_dists
    r1c3_nbor_wt = 1 / r1c3_nbor_dists
    r2c3_nbor_dists = dist[2, [0, 3, 4, 5, 6]]
    r2c3_nbor_wt = 1 / r2c3_nbor_dists

    # Collect donor values (masked so invalid donors drop out of averages).
    col1_donor_values = np.ma.masked_invalid(X[[0, 2, 3, 4, 5], 1]).copy()
    col3_donor_values = np.ma.masked_invalid(X[[0, 3, 4, 5, 6], 3]).copy()

    # Final imputed values
    r1c1_imp = np.ma.average(col1_donor_values, weights=r1c1_nbor_wt)
    r1c3_imp = np.ma.average(col3_donor_values, weights=r1c3_nbor_wt)
    r2c3_imp = np.ma.average(col3_donor_values, weights=r2c3_nbor_wt)

    X_imputed = np.array([
        [1, 0, 0, 1],
        [0, r1c1_imp, 1, r1c3_imp],
        [1, 1, 1, r2c3_imp],
        [0, 1, 0, 0],
        [0, 0, 0, 0],
        [1, 0, 1, 1],
        [10, 10, 10, 10],
    ])

    imputer = KNNImputer(weights="distance", missing_values=na)
    assert_allclose(imputer.fit_transform(X), X_imputed)

    # Same check via pairwise_distances with the nan_euclidean metric.
    X = np.array([
        [0, 0, 0, na],
        [1, 1, 1, na],
        [2, 2, na, 2],
        [3, 3, 3, 3],
        [4, 4, 4, 4],
        [5, 5, 5, 5],
        [6, 6, 6, 6],
        [na, 7, 7, 7]
    ])

    dist = pairwise_distances(X, metric="nan_euclidean", squared=False,
                              missing_values=na)

    # Calculate weights
    r0c3_w = 1.0 / dist[0, 2:-1]
    r1c3_w = 1.0 / dist[1, 2:-1]
    r2c2_w = 1.0 / dist[2, (0, 1, 3, 4, 5)]
    r7c0_w = 1.0 / dist[7, 2:7]

    # Calculate weighted averages
    r0c3 = np.average(X[2:-1, -1], weights=r0c3_w)
    r1c3 = np.average(X[2:-1, -1], weights=r1c3_w)
    r2c2 = np.average(X[(0, 1, 3, 4, 5), 2], weights=r2c2_w)
    r7c0 = np.average(X[2:7, 0], weights=r7c0_w)

    X_imputed = np.array([
        [0, 0, 0, r0c3],
        [1, 1, 1, r1c3],
        [2, 2, r2c2, 2],
        [3, 3, 3, 3],
        [4, 4, 4, 4],
        [5, 5, 5, 5],
        [6, 6, 6, 6],
        [r7c0, 7, 7, 7]
    ])

    imputer_comp_wt = KNNImputer(missing_values=na, weights="distance")
    assert_allclose(imputer_comp_wt.fit_transform(X), X_imputed)
def knnRegressor(train_data, train_targets, test_data, k):
    """Predict targets for test_data using a k-NN regressor fitted on the
    training set."""
    model = KNeighborsRegressor(k)
    model.fit(train_data, train_targets)
    return model.predict(test_data)
y_train = y # Linear regression clfreg = LinearRegression(n_jobs=-1) clfreg.fit(X_train, y_train) # Quadratic Regression 2 clfpoly2 = make_pipeline(PolynomialFeatures(2), Ridge()) clfpoly2.fit(X_train, y_train) # Quadratic Regression 3 clfpoly3 = make_pipeline(PolynomialFeatures(3), Ridge()) clfpoly3.fit(X_train, y_train) # KNN Regression clfknn = KNeighborsRegressor(n_neighbors=2) clfknn.fit(X_train, y_train) X_train.shape y_train.shape X_test.shape y_test.shape X_test = X[-forecast_out:] y_test = y[-forecast_out:] confidencereg = clfreg.score(X_test, y_test) confidencepoly2 = clfpoly2.score(X_test, y_test) confidencepoly3 = clfpoly3.score(X_test, y_test) confidenceknn = clfknn.score(X_test, y_test)
def coord_regression(predictions_b, predictions, train, test, method):
    """Per-building, per-floor coordinate regression error.

    For every (building, floor) pair, fits `method`'s model on the matching
    training rows and evaluates it on the test rows whose *predicted*
    building and floor match, accumulating the mean geodesic error (metres).

    Parameters
    ----------
    predictions_b : sequence of int
        Predicted building (0-2) for each test row.
    predictions : sequence of int
        Predicted floor (0-4) for each test row.
    train, test : pandas.DataFrame
        Columns 0..518 are features, plus 'BUILDINGID', 'FLOOR',
        'LONGITUDE' and 'LATITUDE'.
    method : int
        1 -> distance-weighted 5-NN regressor, 2 -> MLP classifier.

    Returns
    -------
    float
        Mean of the per-(building, floor) mean position errors, metres.
    """
    mean_error = []
    if method == 1:
        machine_learn = KNeighborsRegressor(n_neighbors=5, weights='distance')
    elif method == 2:
        # BUG FIX: the original passed verbose='false'; a non-empty string is
        # truthy, so sklearn actually ran verbosely.
        machine_learn = MLPClassifier(solver='sgd', learning_rate='adaptive',
                                      verbose=False, activation='tanh',
                                      alpha=1e-5, max_iter=400)  # THE BEST
    # Projection is invariant; build it once (original rebuilt it per sample).
    # NOTE(review): assumes UTM zone 23 south - confirm for this dataset.
    myProj = Proj("+proj=utm +zone=23K, +south +ellps=WGS84 +datum=WGS84 +units=m +no_defs")
    # for each building
    for building in range(3):
        # training rows labelled with this building
        new_train1 = train.loc[train['BUILDINGID'] == building]
        # test rows predicted to be in this building
        ind = [x for x in range(len(predictions_b)) if predictions_b[x] == building]
        if ind:  # if list is not empty
            # for each floor
            for floor in range(5):
                new_train2 = new_train1.loc[new_train1['FLOOR'] == floor]
                if not new_train2.empty:
                    indexes = [x for x in range(len(predictions))
                               if predictions[x] == floor and predictions_b[x] == building]
                else:
                    # BUG FIX: the original assigned `index = []` here, so the
                    # tested `indexes` kept its value from the previous floor
                    # (or was undefined on the first one).
                    indexes = []
                if indexes:  # if list is not empty
                    # BUG FIX: DataFrame.ix was removed from pandas; use .iloc.
                    X_train = new_train2.iloc[:, 0:519]
                    Y_train = new_train2[['LONGITUDE', 'LATITUDE']]
                    machine_learn.fit(X_train, Y_train)
                    # test samples predicted at this building/floor
                    new_test2 = test.iloc[indexes, :]
                    X_test = new_test2.iloc[:, 0:519]
                    Y_test = new_test2[['LONGITUDE', 'LATITUDE']]
                    # Turn into plain lists
                    predicts_lon_lat = machine_learn.predict(X_test).tolist()
                    Y_test = Y_test.values.tolist()
                    distance = []
                    # BUG FIX: the inner loop reused `j`, shadowing the outer
                    # building index; loop variables renamed.
                    for k in range(len(predicts_lon_lat)):
                        # convert UTM coordinates back to lon/lat
                        lon_pred, lat_pred = myProj(predicts_lon_lat[k][0],
                                                    predicts_lon_lat[k][1],
                                                    inverse=True)
                        lon_Y, lat_Y = myProj(Y_test[k][0], Y_test[k][1], inverse=True)
                        # join in a unique list
                        Y = [lon_Y, lat_Y]
                        predict = [lon_pred, lat_pred]
                        # geodesic distance between truth and prediction
                        distance.append(vincenty(Y, predict).meters)
                        # If you want to use haversine distance instead:
                        # print haversine(lon_Y, lat_Y, lon_pred, lat_pred)
                    print("distance")
                    print(distance)
                    mean_error.append(np.mean(distance))
                    # print(np.mean(distance))
    return np.mean(mean_error)
# interpolate to form time-interval representation X = [] y = [] for ii in range(len(ts) - 1): X.append(ts[ii]) y.append(ps[ii]) X.append(ts[ii + 1] - delta) y.append(ps[ii]) X = np.asarray(X) y = np.asarray(y) from sklearn.neighbors import KNeighborsRegressor interpolator = KNeighborsRegressor(1) interpolator.fit(X.reshape(-1, 1), y) X = np.arange(0, np.max(ts) + 2 * Fs, 1.0 / Fs) # add a little buffer TIntRep = interpolator.predict(X.reshape(-1, 1)) # now break up time-interval representation into parts numparts = np.floor(len(TIntRep) / (Fs * 6 * 8)) TIntRep = TIntRep[0:int(numparts * Fs * 6 * 8)] TIntRep_re = TIntRep.reshape((int(numparts), int(Fs * 6 * 8))) cumsumfs = np.cumsum(TIntRep_re, axis=1) / Fs cumsumfs_meancentered = cumsumfs.T - cumsumfs.mean(axis=1) FX = np.fft.fft(TIntRep_re / Fs) cauto = np.fft.ifft(FX * FX.conj()).real TIHist = np.zeros((int(numparts), len(binsforhistogram) - 1)) for ii in range(int(numparts)):
##################################################################### ##################################################################### ## H2O ## ##################################################################### ##################################################################### # Nearest Neighbors # ##################################################################### ##################################################################### ## Scikit Learn ## ##################################################################### knn_model = KNeighborsRegressor() knn_model.fit(x_train_values, y_train_values) knn_model_predictions = knn_model.predict(x_test_values) generate_submission_file(knn_model_predictions, test_data["Id"], "../results/" + user + "_KNN.csv") param_list = {"n_neighbors": [2, 4, 6]} knn_gridsearch = GridSearchCV(KNeighborsRegressor(), param_list) knn_gridsearch.fit(x_train_values, y_train_values) knn_best_model_predictions = knn_gridsearch.best_estimator_.predict( x_test_values) generate_submission_file(knn_best_model_predictions, test_data["Id"], "../results/" + user + "_KNN_GridSearchCV.csv") ##################################################################### # Decision Trees # #####################################################################
# Build a two-feature design matrix and fit linear and KNN models.
# NOTE(review): x1, bos, y and lm come from earlier in the script; the
# hard-coded 506 is the Boston housing row count - confirm.
x1 = np.reshape(x1, (506, 1))
X = np.asarray(bos['AGE'])
x2 = np.reshape(X, (506, 1))
#X_train= np.concatenate(x1,x2)
# np.asarray([x1, x2]).reshape(506, 2) interleaves the two columns
# incorrectly; xnew below (column-wise concatenate) is the matrix that is
# actually fitted.
X_train = np.asarray([x1, x2])
X_train = np.reshape(X_train, (506, 2))
xnew = np.concatenate((x1, x2), axis=1)
lm.fit(xnew, y)
# BUG FIX: predict expects a 2-D (n_samples, n_features) array; the flat
# [6.5, 80] the original passed raises in modern scikit-learn.
lm.predict([[6.5, 80]])
lm.coef_        # REPL-style inspection; no effect when run as a script
lm.intercept_
# BUG FIX: the original called lm.score(new, y) - `new` is undefined
# (NameError); the fitted design matrix is xnew.
lm.score(xnew, y)

#%% K Nearest Neighbors
from sklearn.neighbors import KNeighborsRegressor
kreg = KNeighborsRegressor()
kreg.fit(xnew, y)
kreg.score(xnew, y)  # in-sample R^2
# Noisy sine toy problem: N points on [0, 2*pi] with Gaussian noise added
# to both coordinates.
N = 300
X = np.expand_dims(np.linspace(0, np.pi * 2, N), axis=1).astype(np.float32)
y = np.sin(X).astype(np.float32)
X += np.random.normal(0, 0.1, X.shape)
y += np.random.normal(0, 0.1, y.shape)

# Noise-free query grid on which both regressors are evaluated.
sampleN = 100
samples = np.expand_dims(np.linspace(0, np.pi * 2, sampleN),
                         axis=1).astype(np.float32)

n_neighbors = 20

# numpy version (scikit-learn reference implementation)
neigh = KNeighborsRegressor(n_neighbors=n_neighbors)  #, weights='distance')
neigh.fit(X, y)
y_pred1 = neigh.predict(samples)
save('result1.json', X.reshape(N), y.reshape(N), sampleN,
     samples.reshape(sampleN), y_pred1.reshape(sampleN))

# PANENE version - same data and k, so the two result files should agree
# up to the approximation error of PANENE's index.
neigh = KNNRegressor(X, y, n_neighbors=n_neighbors)  #, weights='distance')
y_pred2 = neigh.predict(samples)
save('result2.json', X.reshape(N), y.reshape(N), sampleN,
     samples.reshape(sampleN), y_pred2.reshape(sampleN))
def build_k_nearest_neighbours_model():
    """Build, evaluate and tune a KNeighborsRegressor.

    Fits a default model on the module-level train_X/train_y split, reports
    MAE and R^2 on test_X/test_y, then sweeps n_neighbors, leaf_size and
    algorithm independently and refits a final model with the best value of
    each. All results are printed; nothing is returned.
    """
    print("\n--- CREATING K NEAREST NEIGHBOURS REGRESSOR MODEL ---")

    # Creating a K Nearest Neighbours Regressor Model
    knn_model = KNeighborsRegressor()
    knn_model.fit(train_X, train_y)

    # Model Validation.
    # BUG FIX: sklearn metric signatures are (y_true, y_pred); the original
    # passed (predictions, truth), which is harmless for MAE (symmetric) but
    # gives a wrong value for R^2, which is asymmetric.
    test_pred = knn_model.predict(test_X)
    mae = mean_absolute_error(test_y, test_pred)
    r2 = r2_score(test_y, test_pred)
    print("\n" + "Mean Absolute Error:", mae)
    print("R2 Score:", r2)

    # Outputting first couple of rows
    print(test_pred[:5])
    print(test_y[:5])

    def _best_setting(param_name, values, label):
        # Fit one model per candidate value and return the value with the
        # highest R^2 on the test split. The tracker starts at -inf so a
        # best value is chosen even when every score is negative (the
        # original started at 0 and could leave the tracker unset).
        best_value, best_score = None, float("-inf")
        for value in values:
            model = KNeighborsRegressor(**{param_name: value})
            model.fit(train_X, train_y)
            score = r2_score(test_y, model.predict(test_X))
            print("\n" + label + ":", value)
            print("R2 Score:", score)
            if score > best_score:
                best_value, best_score = value, score
        return best_value

    # Model Improvement: sweep each hyper-parameter independently.
    print("\n--- BEGINNING MODEL IMPROVEMENTS ---")

    print("\n--- ADJUSTING N NEIGHBOURS ---")
    best_n_neighbors = _best_setting(
        "n_neighbors", [1, 2, 3, 5, 10, 15, 50, 100, 1000], "N Neighbors")
    print("\nOptimal amount of n neighbours:", best_n_neighbors)

    print("\n--- ADJUSTING LEAF SIZE ---")
    best_leaf_size = _best_setting(
        "leaf_size", [2, 5, 10, 30, 50, 100], "Leaf size")
    print("\nOptimal leaf size:", best_leaf_size)

    print("\n--- FINDING OPTIMAL ALGORITHM ---")
    best_algorithm = _best_setting(
        "algorithm", ["auto", "ball_tree", "kd_tree", "brute"], "Algorithm")
    print("\nOptimal algorithm:", best_algorithm)

    # Final model combining the best value from each independent sweep.
    print("---CREATING FINAL MODEL ---")
    model = KNeighborsRegressor(n_neighbors=best_n_neighbors,
                                leaf_size=best_leaf_size,
                                algorithm=best_algorithm)
    model.fit(train_X, train_y)
    preds = model.predict(test_X)
    score = r2_score(test_y, preds)
    print("R2 Score:", score)
import numpy as np
import matplotlib.pyplot as plt
from sklearn.neighbors import KNeighborsRegressor
from sklearn.tree import DecisionTreeRegressor

# Noise-free sine curve sampled on a dense grid.
N = 200
X = np.linspace(0, 10, N).reshape(N, 1)
Y = np.sin(X)

# Small random training subset.
# NOTE(review): np.random.choice samples WITH replacement here, so the
# training set may contain duplicate points - confirm that is intended.
Ntrain = 20
idx = np.random.choice(N, Ntrain)
Xtrain = X[idx]
Ytrain = Y[idx]

# Distance-weighted 2-NN regression evaluated over the full grid.
kneigh = KNeighborsRegressor(n_neighbors=2, weights='distance')
kneigh.fit(Xtrain, Ytrain)
Yhat = kneigh.predict(X)

# Decision-tree regression on the same training subset.
deci = DecisionTreeRegressor()
deci.fit(Xtrain, Ytrain)
YhatDeci = deci.predict(X)

plt.scatter(Xtrain, Ytrain)
plt.plot(X, Y)
plt.plot(X, Yhat, label="Knn Regressor")
plt.plot(X, YhatDeci, label="DecisionTree Regressor")
plt.legend()
# BUG FIX: savefig must come before show - plt.show() releases the current
# figure on most backends, so the original saved a blank image.
plt.savefig("images/KnnDistAndDecisionTreeSklearn")
plt.show()
# Score a linear-regression model on the hold-out part of the training data.
gildong = LinearRegression()
gildong.fit(train_df_part1[features], train_df_part1['price'])
score = gildong.score(train_df_part2[features], train_df_part2['price'])
print(format(score, '.3f'))

# ### 3.4 Predicting with the K-NN algorithm
# > **So far we predicted house prices with a linear-regression model.
# > From here on we try a few other models, starting with K-nearest
# > neighbours.**

# In[ ]:

from sklearn.neighbors import KNeighborsRegressor

babo = KNeighborsRegressor(n_neighbors=10)
babo.fit(train_df_part1[features], train_df_part1['price'])
score = babo.score(train_df_part2[features], train_df_part2['price'])
print(format(score, '.3f'))

# ### 3.5 Predicting with the decision-tree algorithm
# > **A decision-tree model. We print the score and the predicted house
# > prices.**

# In[ ]:

youngja = DecisionTreeRegressor(random_state=0)
youngja.fit(train_df_part1[features], train_df_part1['price'])
score = youngja.score(train_df_part2[features], train_df_part2['price'])
print(format(score, '.3f'))
predicted = youngja.predict(train_df_part2[features])
""" KNN regression model author: Alsu Vakhitova """ import training.datahelper as datahelper import numpy as np from sklearn.metrics import mean_squared_error, mean_absolute_error from sklearn.neighbors import KNeighborsRegressor from sklearn.model_selection import train_test_split x, y = datahelper.get_xy('data/', num_hours=3, error_minutes=15) x_train, x_test, y_train, y_test = train_test_split(x, y, test_size=0.2) for n in range(1, 10): print("\nNumber of neighbors: ", n) neigh = KNeighborsRegressor(n_neighbors=n) neigh.fit(x_train, y_train) print("R^2 for all measurements: ", neigh.score(x_test, y_test), '\n') m = mean_absolute_error(y_test, neigh.predict(x_test), multioutput='raw_values') print('Average mean absolute error: ', np.average(m)) print("Mean absolute error for measurements:") for col, err in zip(list(x.columns.values), m): print(col, ": ", err)
case_training_data[:, 1][i]) + "\n" fd.write(writeRow) fd.close() #train model import pandas as pd from matplotlib import pyplot as plt from sklearn.neighbors import KNeighborsRegressor df = pd.read_csv('Case_01_training.csv') X = df.loc[:, 'YbyH':'dummy3'] Y = df.loc[:, 'UbyVw'] knn = KNeighborsRegressor() knn.fit(X, Y) #test model case_test_data = case_test_data = np.loadtxt('Case_01_sim_test.dat') X_test = np.zeros((case_test_data.shape[0], 7)) #X_test[0,:]=0.0,case_VW,case_H,case_DPDX,case_d1,case_d2,case_d3 for i in range(0, case_test_data.shape[0]): X_test[i, :] = case_test_data[:, 0][ i], case_VW, case_H, case_DPDX, case_d1, case_d2, case_d3 Y_predicted = knn.predict(X_test) #print Y_predicted plt.plot(case_test_data[:, 0], case_test_data[:, 1], '-b', label='sim')
X_train.append(streamline_moving) Y_train.append(streamline_fixed - streamline_moving) # X_train[i*N_points:(i+1)*N_points, :] = streamline_moving # Y_train[i*N_points:(i+1)*N_points, :] = streamline_fixed - streamline_moving X_train = np.vstack(X_train) Y_train = np.vstack(Y_train) #%% KNeighborsRegressor print("Performs KNeighborsRegressor...") neigh = KNeighborsRegressor(n_neighbors=n_neighbors, n_jobs=-1, weights='distance') #neigh = RadiusNeighborsRegressor(n_jobs=1, radius=radius, weights='uniform') neigh.fit(X_train, Y_train) #%% Meshgrid suffix = suffix + "_nn" + str(n_neighbors) #X_offset=aff_moving[:,-1][0]*aff_moving[0,0] #Y_offset=aff_moving[:,-1][1]*aff_moving[1,1] #Z_offset=aff_moving[:,-1][2]*aff_moving[2,2] step = 1 X_range = np.arange(0, X_grid_size, step).astype(int) #+X_offset Y_range = np.arange(0, Y_grid_size, step).astype(int) #+Y_offset Z_range = np.arange(0, Z_grid_size, step).astype(int) #+Z_offset #Y_range=np.arange(0,X_grid_size,step)#+X_offset XX, YY, ZZ = np.meshgrid(X_range, Y_range, Z_range)
### LONGITUDE, LATITUDE and FLOOR ## Define models modelkNN_long = KNeighborsRegressor(n_neighbors=7, weights='distance') modelRF_long = RandomForestRegressor() modelkNN_lat = KNeighborsRegressor(n_neighbors=2, weights='distance') modelRF_lat = RandomForestRegressor() modelkNN_floor = KNeighborsClassifier(n_neighbors=2, weights='distance') modelRF_floor = RandomForestClassifier() ## Fit models modelkNN_long.fit(X_train_normObs_casc, y_train_long) modelRF_long.fit(X_train_normObs_casc, y_train_long) modelkNN_lat.fit(X_train_normObs_casc, y_train_lat) modelRF_lat.fit(X_train_normObs_casc, y_train_lat) modelkNN_floor.fit(X_train_normObs_casc, y_train_floor) modelRF_floor.fit(X_train_normObs_casc, y_train_floor) ## Make predictions pred_long_kNN = modelkNN_long.predict(X_val_normObs_casc) pred_long_RF = modelRF_long.predict(X_val_normObs_casc) pred_lat_kNN = modelkNN_lat.predict(X_val_normObs_casc) pred_lat_RF = modelRF_lat.predict(X_val_normObs_casc)
for idx, i in enumerate(pca_X_test): #加入情感特征 pca_X_test[idx].append(float(emotion_test[idx][0])) pca_X_test[idx].append(float(emotion_test[idx][1])) pca_X_test[idx].append(float(emotion_train[idx][2])) #加入公司特征 pca_X_test[idx].append(float(firm_test[idx][0])) pca_X_test[idx].append(float(firm_test[idx][1])) pca_X_test[idx].append(float(firm_test[idx][2])) pca_X_test[idx].append(float(firm_test[idx][3])) pca_X_test[idx].append(float(firm_test[idx][4])) pca_X_test[idx].append(float(firm_test[idx][5])) '''KNN回归''' uni_knr = KNeighborsRegressor( weights='distance') #uniform平均回归,distance是根据距离加权回归 uni_knr.fit(pca_X_train, y_train) uni_knr_y_pred = uni_knr.predict(pca_X_test) print 'KNR的R方', uni_knr.score(pca_X_test, y_test) final_pred_knr.append(uni_knr_y_pred[0]) for i in final_pred_knr: if i >= 0: di_pred_knr.append(1) else: di_pred_knr.append(0) print 'KNR的报告:' print classification_report(di_pred_knr, di_real_test) '''输出预测文本,哪里需要,把这段代码粘哪''' f = open( r'E:\study\master of TJU\0Subject research\code\Important\5_1_mock_trading\pred_result.txt', 'w') testlen = len(date_list) - ori_split_num for i in range(0, testlen):
clock = pygame.time.Clock()

# file to take data
#sample = open("game.csv", "w")
#print("x,y,vx,vy,paddle.y", file=sample)

# Train a 3-NN regressor mapping ball state -> paddle position from the
# previously recorded game log.
pong = pd.read_csv('game.csv')
pong = pong.drop_duplicates()
X = pong.drop(columns="paddle.y")
y = pong['paddle.y']

from sklearn.neighbors import KNeighborsRegressor
clf = KNeighborsRegressor(n_neighbors=3)
clf.fit(X, y)

# Empty frame fixing the feature column order expected by the model.
df = pd.DataFrame(columns=['x', 'y', 'vx', 'vy'])

while True:
    e = pygame.event.poll()
    if e.type == pygame.QUIT:
        break
    clock.tick(FRAMERATE)
    pygame.display.flip()
    # BUG FIX: DataFrame.append was deprecated in pandas 1.4 and removed in
    # 2.0; pd.concat with a one-row frame is the supported equivalent.
    toPredict = pd.concat(
        [df, pd.DataFrame([{'x': ball.x,
                            'y': ball.y,
                            'vx': ball.vx,
                            'vy': ball.vy}])],
        ignore_index=True)
]] housing_not_missing = housing[housing.total_bedrooms.notna()] X_train = housing_not_missing[[ 'longitude', 'latitude', 'housing_median_age', 'total_rooms', 'population', 'households', 'median_income', 'median_house_value', '1h_ocean', 'island', 'inland', 'near_ocean', 'near_bay' ]] # data (X) -> longitude, latitude, total_rooms, population, households, median_house_value y_train = housing_not_missing[[ 'total_bedrooms' ]] # label (y) -> total_bedrooms (column we want to predict) regressor = KNeighborsRegressor(n_neighbors=5) regressor.fit(X_train, y_train) predicted_values = [] for index in range(0, len(housing_missing)): values = housing_missing.iloc[index].tolist() y_pred = regressor.predict([values]) predicted_values.append(y_pred[0][0]) housing.loc[housing.total_bedrooms.isna(), 'total_bedrooms'] = predicted_values print("\n3. replace with values from nearest neighbour") print("--------------------------------") knn_model = regression(housing) knn_model['name'] = 'replaced missing values with KNN' # 4. use regression with the values in the total_rooms column as prior knowledge
# Method 1 ------------------ Linear Regression (fit and scored in-sample)
X = df_dummies.loc[:, df_dummies.columns != 'Item_Outlet_Sales'].values
y = df_dummies.loc[:, 'Item_Outlet_Sales'].values
reg = LinearRegression(fit_intercept=True)
reg.fit(X, y)
y_result = reg.predict(X)
score = reg.score(X, y)  # in-sample R^2
print('Linear Score: ', score)

# # Method 2 ------------------KNN Regression:
X = df_dummies.loc[:, df_dummies.columns != 'Item_Outlet_Sales']
# print(X.shape)
y = df_dummies['Item_Outlet_Sales']
# print(y.shape)
knn = KNeighborsRegressor(n_neighbors=3)
knn.fit(X, y)
predict = knn.predict(X)
# NOTE(review): this R^2 is computed on the training data itself, so the
# comparison with the linear model is optimistic for KNN.
score = knn.score(X, y)
print('KNN Score: ', score)
# print('KNN Model fits better than the Linear Regression ')

# Method 3 -------------------- Random Forest, with a proper train/test split
# #instantiate the Random Forest Regressor Model
X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=42)
clf = RandomForestRegressor(n_estimators=500, max_depth=5,
                            min_samples_split=3, bootstrap=True,
                            oob_score=True)
clf.fit(X_train, y_train)
#Array of non-linear regressors nonLRegression = [] for i in range(numberOfLS): # Generate a dataset dataset = generate_dataset(trainingSize) X.append(dataset[0]) Y.append(dataset[1]) # Create and train a linear regressor on the dataset lr = LinearRegression() lr.fit(dataset[0], dataset[1]) lRegressions.append(lr) # Create and train a non-linear regressor on the dataset knn = KNeighborsRegressor(nNeighbors) knn.fit(dataset[0], dataset[1]) nonLRegression.append(knn) # Generate datasets for each x x = np.linspace(-4, 4, 100) datasetsX0 = [] for i in x: datasetsX0.append(generate_y_dataset(i, trainingSize)) # Residual error y = [] # Compute residual error for each x for data in datasetsX0: y.append(np.var(data))
# Split the listings into two halves for holdout validation; .copy() avoids
# SettingWithCopy issues when a predicted_price column is added below.
split_one = dc_listings.iloc[0:1862].copy()
split_two = dc_listings.iloc[1862:].copy()

## 2. Holdout Validation ##
from sklearn.neighbors import KNeighborsRegressor
from sklearn.metrics import mean_squared_error

# Each half takes a turn as train set and as test set.
train_one = split_one
test_one = split_two
train_two = split_two
test_two = split_one

# First half: fit on split one, RMSE on split two.
model = KNeighborsRegressor()
model.fit(train_one[["accommodates"]], train_one["price"])
test_one["predicted_price"] = model.predict(test_one[["accommodates"]])
iteration_one_rmse = mean_squared_error(test_one["price"],
                                        test_one["predicted_price"])**(1 / 2)

# Second half: refit on split two, RMSE on split one.
model.fit(train_two[["accommodates"]], train_two["price"])
test_two["predicted_price"] = model.predict(test_two[["accommodates"]])
iteration_two_rmse = mean_squared_error(test_two["price"],
                                        test_two["predicted_price"])**(1 / 2)

# Average the two holdout RMSEs.
avg_rmse = np.mean([iteration_two_rmse, iteration_one_rmse])
print(iteration_one_rmse, iteration_two_rmse, avg_rmse)

## 3. K-Fold Cross Validation ##