def fit_KNeighbors(features_train, labels_train, features_pred, n_neighbors=5):
    model = KNeighborsRegressor(n_neighbors=n_neighbors)
    model.fit(features_train, labels_train)
    labels_pred = model.predict(features_pred)
    # Note: this scores on the training data, so it measures fit, not generalization.
    score = model.score(features_train, labels_train)
    print("KNeighbors - coefficient of determination R^2 of the prediction:", score)
    return labels_pred
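A minimal sketch of calling fit_KNeighbors on synthetic data (the arrays below are illustrative, not from the original project):

import numpy as np
from sklearn.neighbors import KNeighborsRegressor

rng = np.random.RandomState(0)
X_train = rng.rand(100, 3)
y_train = X_train.sum(axis=1)
X_new = rng.rand(10, 3)
preds = fit_KNeighbors(X_train, y_train, X_new, n_neighbors=3)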
def fill_income(df):
    income_imputer = KNeighborsRegressor(n_neighbors=2)
    df_w_monthly_income = df[df.monthly_income.isnull() == False].copy()
    df_w_null_monthly_income = df[df.monthly_income.isnull() == True].copy()
    cols = ["number_real_estate_loans_or_lines", "number_of_open_credit_lines_and_loans"]
    income_imputer.fit(df_w_monthly_income[cols], df_w_monthly_income.monthly_income)
    new_values = income_imputer.predict(df_w_null_monthly_income[cols])
    df_w_null_monthly_income.loc[:, "monthly_income"] = new_values
    # DataFrame.append was removed in pandas 2.0; pd.concat is the replacement.
    df2 = pd.concat([df_w_monthly_income, df_w_null_monthly_income])
    return df2
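A quick, hypothetical demonstration of fill_income on a toy frame whose column names match what the function assumes:

import numpy as np
import pandas as pd

df = pd.DataFrame({
    "monthly_income": [3000.0, 4500.0, np.nan, 5200.0, np.nan],
    "number_real_estate_loans_or_lines": [1, 2, 1, 3, 2],
    "number_of_open_credit_lines_and_loans": [4, 6, 5, 8, 7],
})
df_imputed = fill_income(df)
print(df_imputed.monthly_income)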
def main(featureFile, outputfolder):
    with open(featureFile, 'r') as csvfile:
        my_data = pd.read_csv(csvfile, delimiter="\t", low_memory=False)
    random_indices = permutation(my_data.index)
    # Hold out one third of the rows as the test set.
    test_cutoff = int(math.floor(len(my_data) / 3))
    test = my_data.loc[random_indices[:test_cutoff]]
    # Generate the training set with the rest of the data.
    train = my_data.loc[random_indices[test_cutoff:]]
    # Feature columns. (The original built booleans such as "Row" == "1" here,
    # which always evaluate to False; plain column names are the evident intent.)
    x_columns = ["Row", "Student ID", "Problem Hierarchy", "Problem Name",
                 "Problem View", "Step Name", "KC(Default)", "Opportunity (Default)"]
    # y column holds the predicted feature, in this case the correct first attempt.
    y_column = ["Correct First Attempt"]
    # Look at the ten closest neighbors to offset potential noise in the data.
    knn = KNeighborsRegressor(n_neighbors=10)
    knn.fit(train[x_columns], train[y_column])
    # Make point predictions on the test set using the fit model.
    predictions = knn.predict(test[x_columns])
    actual = test[y_column]
    result = test[['Anon Student Id', 'Correct First Attempt']]
    result.to_csv(outputfolder, sep='\t')
    # Compute the root mean squared error of our predictions.
    rmse = math.sqrt((((predictions - actual) ** 2).sum()) / len(predictions))
    print('RMSE=')
    print(rmse)
def knn_model(train, y_train, test):
    model = KNeighborsRegressor(n_neighbors=10, weights='distance', n_jobs=-1)
    model.fit(train, y_train)
    test_probs = model.predict(test)
    indices = test_probs < 0
    test_probs[indices] = 0
    return test_probs
def run_network(mdl=None, data=None):
    global_start_time = time.time()
    sequence_length = 10

    if data is None:
        print('Loading data... ')
        X_train, y_train, X_test, y_test = train_test_traffic_data(15773, sequence_length)
    else:
        X_train, y_train, X_test, y_test = data

    print('\nData Loaded...\n')

    if mdl is None:
        mdl = KNeighborsRegressor(5, weights='distance')

    try:
        mdl.fit(X_train, y_train)
        predicted_traffic = mdl.predict(X_test)
    except KeyboardInterrupt:
        print('Training duration (s) : ', time.time() - global_start_time)
        return mdl, y_test, 0

    print('Training duration (s) : ', time.time() - global_start_time)
    return mdl, y_test, predicted_traffic
def fit(self, start_date, end_date):
    for ticker in self.tickers:
        self.stocks[ticker] = Stock(ticker)
    params_knn = [{'n_neighbors': [2, 5, 10, 15]}]
    params = ParameterGrid(params_knn)
    # Find the split for training and CV
    mid_date = train_test_split(start_date, end_date)
    for ticker, stock in self.stocks.items():
        X_train, y_train = stock.get_data(start_date, mid_date, fit=True)
        X_cv, y_cv = stock.get_data(mid_date, end_date)
        lowest_mse = np.inf
        for i, param in enumerate(params):
            knn = KNeighborsRegressor(**param)
            knn.fit(X_train.values, y_train.values)
            mse = mean_squared_error(y_cv, knn.predict(X_cv.values))
            if mse <= lowest_mse:
                # The original never updated lowest_mse, so every candidate
                # overwrote the stored model; track the best MSE so far.
                lowest_mse = mse
                self.models[ticker] = knn
    return self
def train(self, x, y, param_names, random_search=100, **kwargs):
    start = time.time()
    scaled_x = self._set_and_preprocess(x=x, param_names=param_names)

    # Check that each input is between 0 and 1
    self._check_scaling(scaled_x=scaled_x)

    if self._debug:
        print("Shape of training data:", scaled_x.shape)
        print("Param names:", self._used_param_names)
        print("First training sample\n", scaled_x[0])
        print("Encode:", self._encode)

    # Do a random search (the original hard-coded 100 iterations instead of
    # using the random_search argument).
    n_neighbors = self._random_search(random_iter=random_search, x=scaled_x, y=y)

    # Now train model
    knn = KNeighborsRegressor(n_neighbors=n_neighbors,
                              weights='uniform',
                              algorithm='auto',
                              leaf_size=30,
                              p=2,
                              metric='minkowski')
    knn.fit(scaled_x, y)
    self._model = knn

    duration = time.time() - start
    self._training_finished = True
    return duration
def calc_linear_regression(reg_training_path):
    # Note: despite the name, this evaluates a k-nearest-neighbors regressor
    # with 5-fold cross validation and returns the mean per-fold RMSE.
    dataset = read_reg_train_data(reg_training_path)
    rmse = 0
    n_folds = 5
    # KFold(n=..., n_folds=...) is the pre-0.18 scikit-learn API; the modern
    # form is KFold(n_splits=...) plus .split(dataset).
    folds = KFold(n_splits=n_folds, shuffle=False)
    fold = 0
    for train_indices, test_indices in folds.split(dataset):
        fold += 1
        training_set = [dataset[i] for i in train_indices]
        test_set = [dataset[i] for i in test_indices]
        training_dataframe = get_data_frame(training_set)
        test_dataframe = get_data_frame(test_set)
        column_names = ['cf_item', 'cf_user', 'svd', 'content_item', 'actual_rating']
        training_dataframe.columns = column_names
        test_dataframe.columns = column_names
        actual_rating_training_column = training_dataframe['actual_rating']
        training_dataframe = training_dataframe.drop('actual_rating', axis=1)
        test_dataframe = test_dataframe.drop('actual_rating', axis=1)
        neigh = KNeighborsRegressor(n_neighbors=10)
        neigh.fit(training_dataframe, actual_rating_training_column)
        predict_set = neigh.predict(test_dataframe)
        rmse += mean_squared_error([rec[4] for rec in test_set], predict_set) ** 0.5
        print("Fold (%d) finished with accumulated RMSE of (%f) (%s)"
              % (fold, rmse, time.strftime('%y_%m_%d_%H_%M_%S')))
    return rmse / float(n_folds)
def run_kNeighbors(distances, loadings, test_vars,
                   weightings=('uniform',), k_list=(3,)):
    """
    Run K-nearest neighbors using precomputed distances to create an
    ontological mapping

    Args:
        distances: square distance matrix to pass to KNeighborsRegressor
        loadings: loading matrix for training
        test_vars: variables to reconstruct
        weightings: (optional) list of weightings to pass to KNeighbors
        k_list: list of k values to pass to KNeighbors as n_neighbors
    """
    # Note: the original default k_list=(3) is an int, not a tuple; (3,)
    # makes the inner loop iterate as intended.
    train_distances = distances.loc[loadings.index, loadings.index]
    test_distances = distances.loc[test_vars, loadings.index]
    to_return = pd.DataFrame()
    for weighting in weightings:
        for k in k_list:
            clf = KNeighborsRegressor(metric='precomputed', n_neighbors=k,
                                      weights=weighting)
            clf.fit(train_distances, loadings)
            out = clf.predict(test_distances)
            out = pd.DataFrame(out, columns=loadings.columns)
            out['var'] = test_vars
            out['k'] = k
            out['weighting'] = weighting
            # add neighbors and distances
            neighbors = clf.kneighbors(test_distances)
            out['distances'] = tuple(neighbors[0])
            out['neighbors'] = tuple(test_distances.columns[neighbors[1]])
            to_return = pd.concat([to_return, out], sort=False)
    return to_return
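For reference, a minimal self-contained sketch of the metric='precomputed' API the function above relies on (toy data, illustrative only):

import numpy as np
from sklearn.neighbors import KNeighborsRegressor

rng = np.random.RandomState(0)
X = rng.rand(6, 4)
y = rng.rand(6)
# pairwise L1 distances between training points
dist = np.abs(X[:, None, :] - X[None, :, :]).sum(axis=-1)

clf = KNeighborsRegressor(n_neighbors=2, metric='precomputed')
clf.fit(dist, y)              # fit on an (n_train, n_train) distance matrix
print(clf.predict(dist[:2]))  # predict from (n_query, n_train) distances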
def apply_knn():
    regr = KNeighborsRegressor()
    regr.fit(Xtr, Ytr)
    pred = regr.predict(Xte)
    temp = mean_squared_error(Yte, pred)
    return pred, temp
def kNN(X_train, y_train, X_test, y_test, uselog=False):
    '''
    :param X_train:
    :param y_train:
    :param X_test:
    :param y_test:
    :return:
    '''
    scaler = StandardScaler()
    print(X_train.shape)
    print(X_test.shape)
    X = scaler.fit_transform(X_train)
    test = scaler.transform(X_test)
    clf = KNeighborsRegressor(n_neighbors=550)
    clf.fit(X, y_train)
    result = clf.predict(test)
    if uselog:
        # map() returns a lazy iterator in Python 3; materialize the result.
        result = [math.log(1 + x) for x in result]
    return result
def transform(self, X, y=None):
    """
    Impute missing values by regressing each incomplete feature on the
    complete features with k-nearest neighbors.

    :param X: multidimensional numpy array like.
    """
    mask = np.isnan(X).any(axis=1)
    criteria_for_bad = np.where(mask)[0]
    criteria_for_good = np.where(~mask)[0]
    X_bad = X[criteria_for_bad]
    X_good = X[criteria_for_good]
    knn = KNeighborsRegressor(n_neighbors=self.k)
    for idx, x_bad in zip(criteria_for_bad.tolist(), X_bad):
        missing = np.isnan(x_bad)
        bad_dim = np.where(missing)[0]
        good_dim = np.where(~missing)[0]
        for d in bad_dim:
            x = X_good[:, good_dim]
            y = X_good[:, d]
            knn.fit(x, y)
            # predict expects a 2-D array, hence the reshape.
            X[idx, d] = knn.predict(x_bad[good_dim].reshape(1, -1))
    return X
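Modern scikit-learn ships a built-in alternative for this pattern; a short sketch with KNNImputer (toy array, illustrative only):

import numpy as np
from sklearn.impute import KNNImputer

X = np.array([[1.0, 2.0],
              [3.0, np.nan],
              [5.0, 6.0],
              [7.0, 8.0]])
print(KNNImputer(n_neighbors=2).fit_transform(X))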
def smooth(self, X, y):
    # KNN regression used for smoothing
    nbrs = KNeighborsRegressor(n_neighbors=20)
    X = X.reshape(-1, 1)
    nbrs.fit(X, y)
    proba = nbrs.predict(X)
    return proba
def k_nearest_neighbours():
    filepath = "bondchanges.arff"
    all_data = arff_read_to_array(filepath)
    X_data = all_data["data"]
    Y_data = all_data["target"]
    Y_data_map = {}
    new_Y_data = np.array([])
    i = 1
    for index, data in enumerate(Y_data):
        data1 = data.split('_')[0]
        split_data = ".".join(data1.split('.')[:1])
        if split_data not in Y_data_map:
            Y_data_map[split_data] = i
            i += 1
            print(split_data)
        new_Y_data = np.append(new_Y_data, [Y_data_map[split_data]], 0)
    # Create a 90/10 train/test split (slice indices must be ints in Python 3).
    split = int(0.9 * len(X_data))
    X_training = X_data[:split]
    Y_training = new_Y_data[:split]
    print(X_training)
    print()
    print(Y_training)
    X_test = X_data[split:]
    Y_test = new_Y_data[split:]
    knn = KNeighborsClassifier()
    knnr = KNeighborsRegressor(n_neighbors=20000)
    print(knnr.fit(X_training, Y_training).score(X_test, Y_test))
def Round2(X, y):
    # Set parameters
    min_score = {}
    for neigh in [5, 10, 20, 50, 100, 200, 500, 1000, 2000, 5000]:
        model = KNeighborsRegressor(n_neighbors=neigh)
        # Perform 5-fold cross validation
        scores = []
        # KFold(n, n_folds=...) is the pre-0.18 API; use n_splits and .split().
        kf = KFold(n_splits=5, shuffle=True)
        # Calculate RMSE for the train/test split of each fold
        for train_idx, test_idx in kf.split(X):
            X_train, X_test = X[train_idx], X[test_idx]
            y_train, y_test = y[train_idx], y[test_idx]
            model.fit(X_train, y_train)
            prediction = model.predict(X_test)
            rmse = np.sqrt(mean_squared_error(y_test, prediction))
            scores.append(rmse)
        if len(min_score) == 0:
            min_score['neighbor'] = neigh
            min_score['scores'] = scores
        else:
            if np.mean(scores) < np.mean(min_score['scores']):
                min_score['neighbor'] = neigh
                min_score['scores'] = scores
        print("Neighbors:", neigh)
        print(scores)
        print(np.mean(scores))
    return min_score
def __init__(self, dataFrame):
    self.dataFrameKNN = {}
    self.KNNWeightage = {'Avg-High Ratio': 100, 'Avg-Low Ratio': 100,
                         'Deliverable Qty': 300, 'Turnover': 100,
                         'Growth': 150, 'Trend': 100, 'Output': 100}
    self.valid = True
    self.KNNModelHash = {}
    self.dataFrameKNN = pd.DataFrame()
    # Despite the "Ratio" names, these features are price differences.
    self.dataFrameKNN['Avg-High Ratio'] = dataFrame['High Price'][1:] - dataFrame['Average Price'][1:]
    self.dataFrameKNN['Avg-Low Ratio'] = dataFrame['Average Price'][1:] - dataFrame['Low Price'][1:]
    self.dataFrameKNN['Deliverable Qty'] = dataFrame['Deliverable Qty'][1:]
    self.dataFrameKNN['Turnover'] = dataFrame['Turnover in Lacs'][1:]
    self.dataFrameKNN['Growth'] = dataFrame['Close Price'][1:] - dataFrame['Prev Close'][1:]
    self.dataFrameKNN['Trend'] = dataFrame['Turnover in Lacs'][1:]
    self.dataFrameKNN['Output'] = dataFrame['High Price'][1:] - dataFrame['Prev Close'][1:]
    self.KNNModelHash['mean'] = self.dataFrameKNN['Output'].mean()
    self.KNNModelHash['std'] = self.dataFrameKNN['Output'].std()
    for key in self.dataFrameKNN:
        self.normalizeKNNModel(key)
    # trainData holds the data to be trained on; the last row is the test data.
    trainData = self.dataFrameKNN[['Avg-High Ratio', 'Avg-Low Ratio', 'Deliverable Qty', 'Growth']][:-1].values
    testData = self.dataFrameKNN[['Avg-High Ratio', 'Avg-Low Ratio', 'Deliverable Qty', 'Growth']][-1:].values
    # trainOutput contains the output corresponding to trainData, but the
    # first entry is garbage.
    trainOutput = self.dataFrameKNN['Output'][1:].values
    KNNModel = KNeighborsRegressor(n_neighbors=3, weights='distance')
    KNNModel.fit(trainData[100:400], trainOutput[100:400])
    prediction = KNNModel.predict(trainData[400:450])
    weightage = self.KNNWeightage['Output']
    for i in range(50):
        prediction[i] = ((prediction[i] * self.KNNModelHash['std']) + self.KNNModelHash['mean']) / weightage
        trainOutput[400 + i] = ((trainOutput[400 + i] * self.KNNModelHash['std']) + self.KNNModelHash['mean']) / weightage
        print("%-40s %-40s " % (prediction[i], trainOutput[400 + i]))
def calculateKNearestNeighborsModel(data, numberOfNeighbors):
    # Select input variables as x and typecast to numpy array
    x = np.array(data.iloc[0:, 0:11])
    # Select output variable (quality) as y and typecast to numpy array
    y = np.array(data.quality)
    neighbors = KNeighborsRegressor(n_neighbors=numberOfNeighbors)
    neighbors.fit(x, y)
    return neighbors
def predictDayType(self, week, day):
    knn = KNeighborsRegressor(n_neighbors=5)
    knn.fit(self.rawData, self.dayType)
    # predict expects a 2-D array of samples, so wrap the single query row.
    X = np.array([[week, day]])
    predictions = knn.predict(X)
    return predictions
class ModelNNReg(ScikitPredictor):
    '''Nearest neighbor regression'''

    def generate_model(self):
        self.model = KNeighborsRegressor(**self.model_kwargs)

    def fit_model(self, x, y):
        self.model.fit(x, y)
def nnVerify_2(city_data, x, y):
    """
    Using SKLearn's KNeighborsRegressor
    """
    X, Y = city_data.data, city_data.target
    clf = KNeighborsRegressor(n_neighbors=2)
    clf.fit(X, Y)
    y_pred = clf.predict(x)
    print("KNeighborsRegressor")
    print("Y pred(KNN) : ", y_pred)
def main():
    # read the images
    image_from = io.imread(name_from) / 256
    image_to = io.imread(name_to) / 256

    # change to hsv domain (if requested)
    if args.use_hsv:
        image_from[:] = rgb2hsv(image_from)
        image_to[:] = rgb2hsv(image_to)

    # get shapes
    shape_from = image_from.shape
    shape_to = image_to.shape

    # flatten
    X_from = im2mat(image_from)
    X_to = im2mat(image_to)

    # number of pixels
    n_pixels_from = X_from.shape[0]
    n_pixels_to = X_to.shape[0]

    # subsample
    X_from_ss = X_from[np.random.randint(0, n_pixels_from - 1, n_pixels), :]
    X_to_ss = X_to[np.random.randint(0, n_pixels_to - 1, n_pixels), :]

    if save_col_distribution:
        import matplotlib.pyplot as plt
        import seaborn as sns
        sns.set_style('white')

        fig, axes = plt.subplots(nrows=2, figsize=(5, 10))
        for ax, X in zip(axes, [X_from_ss, X_to_ss]):
            ax.scatter(X[:, 0], X[:, 1], color=X)
            # The original called set_xhsvel/set_yhsvel, apparently a stray
            # "lab" -> "hsv" string replacement over set_xlabel/set_ylabel.
            if args.use_hsv:
                ax.set_xlabel('hue')
                ax.set_ylabel('value')
            else:
                ax.set_xlabel('red')
                ax.set_ylabel('green')
        axes[0].set_title('distr. from')
        axes[1].set_title('distr. to')
        fig.tight_layout()
        fig.savefig('color_distributions.png')

    # optimal transportation
    ot_color = OptimalTransport(X_to_ss, X_from_ss, lam=lam,
                                distance_metric=distance_metric)

    # model transfer
    transfer_model = KNeighborsRegressor(n_neighbors=n_neighbors)
    transfer_model.fit(X_to_ss, n_pixels * ot_color.P @ X_from_ss)
    X_transferred = transfer_model.predict(X_to)

    image_transferred = minmax(mat2im(X_transferred, shape_to))
    if args.use_hsv:
        image_transferred[:] = hsv2rgb(image_transferred)
    io.imsave(name_out, image_transferred)
class Knn(ContextEngineBase):
    y_Test = np.empty([0])
    # Knn object
    knnRegressor = None

    def __init__(self, numInputs, outputClassifier, inputClassifiers, appFieldsDict):
        ContextEngineBase.__init__(self, numInputs, outputClassifier,
                                   inputClassifiers, appFieldsDict)
        # Passed parameters
        self.n_neighbors = appFieldsDict['n_neighbors']
        self.weights = appFieldsDict['weights']
        self.algorithm = appFieldsDict['algorithm']
        self.n_jobs = appFieldsDict['n_jobs']
        # Defining a Knn object with given parameters
        self.knnRegressor = KNeighborsRegressor(n_neighbors=self.n_neighbors,
                                                weights=self.weights,
                                                algorithm=self.algorithm,
                                                n_jobs=self.n_jobs)

    # Add a set of training observations, with the newInputObsMatrix being a
    # matrix of doubles, where the row magnitude must match the number of inputs,
    # and the column magnitude must match the number of observations,
    # and newOutputVector being a column vector of doubles.
    def addBatchObservations(self, newInputObsMatrix, newOutputVector):
        if (len(newInputObsMatrix.shape) == 2
                and newInputObsMatrix.shape[1] == self.numInputs
                and newOutputVector.shape[0] == newInputObsMatrix.shape[0]):
            newOutputVector = newOutputVector.ravel()
            i = 0
            for newInputVector in newInputObsMatrix:
                newOutputValue = newOutputVector[i]
                self.addSingleObservation(newInputVector, newOutputValue)
                i += 1
        else:
            print("Wrong dimensions!")

    # Train the coefficients on the existing observation matrix if there are
    # enough observations.
    def train(self):
        if self.numObservations > 0:
            self.knnRegressor.fit(self.observationMatrix, self.outputVector)
            return True
        else:
            print("Not enough observations to train!")
            return False

    # Execute the trained matrix against the given input observation;
    # inputObsVector is a row vector of doubles.
    def execute(self, inputObsVector):
        if len(inputObsVector) == self.numInputs:
            x_Test = np.reshape(inputObsVector, (1, self.numInputs))
            self.y_Test = self.knnRegressor.predict(x_Test)
            return self.y_Test[0]
        else:
            print("Wrong dimensions, fail to execute")
            return None
def knn_regressor(features, solutions, verbose=0):
    columns = solutions.columns
    clf = KNeighborsRegressor(n_neighbors=5, weights='distance')
    print('Training Model... ')
    clf.fit(features, solutions)
    print('Done Training')
    return (clf, columns)
def impute_KNN(df, var, features, k):
    var_imputer = KNeighborsRegressor(n_neighbors=k)
    # .copy() avoids SettingWithCopyWarning when assigning into the slices.
    df_full = df[df[var].isnull() == False].copy()
    df_null = df[df[var].isnull() == True].copy()
    var_imputer.fit(df_full[features], df_full[var])
    impute = var_imputer.predict(df_null[features])
    df_null[var] = impute
    # DataFrame.append was removed in pandas 2.0; pd.concat is the replacement.
    df = pd.concat([df_full, df_null])
    return df
class kNN():
    '''
    kNN regressor
    -------------
    '''

    def __init__(self, N_i, N_o, k=5, n=20):
        # note: N_o=1 assumed for now
        self.N_i = N_i
        self.n = n
        self.i = 0
        self.k = k
        self.X = zeros((self.n, N_i))
        self.y = zeros((self.n))
        self.h = KNeighborsRegressor(n_neighbors=k, weights='distance')
        self.c = 0

    def predict(self, x):
        '''
        Predict
        --------------
        '''
        if self.c < 1.:
            print("[Warning!] No training examples!")
            return 0.0
        elif self.c <= self.k:
            dist, ind = self.h.kneighbors(self.X[0:self.c], n_neighbors=1)
            i_max = argmax(ind)
            return self.y[i_max]
        return self.h.predict(x)

    def update(self, x, y):
        '''
        Update
        --------------
        '''
        self.X[self.i, :] = x
        self.y[self.i] = y
        self.i = (self.i + 1) % self.n
        if self.c < self.n:
            self.c = self.c + 1
        self.h.fit(self.X[0:self.c, :], self.y[0:self.c])
def addJKRegionLabels(self):
    # zip() returns a lazy iterator in Python 3; materialize it for scikit-learn.
    data = list(zip(self.data['RA'], self.data['DEC']))
    randoms = list(zip(self.randoms['RA'], self.randoms['DEC']))

    finder = KMeans(n_clusters=self.config['n_jackknife'])
    self.data_jk_indices = finder.fit_predict(data)

    nbrs = KNeighborsRegressor(n_neighbors=1)
    nbrs.fit(data, self.data_jk_indices)
    self.random_jk_indices = nbrs.predict(randoms)
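Since jackknife region indices are categorical, a 1-nearest-neighbor classifier is arguably the more natural tool; a hedged alternative sketch of the last three lines:

from sklearn.neighbors import KNeighborsClassifier

# returns integer labels directly, with no float predictions to round
nbrs = KNeighborsClassifier(n_neighbors=1)
nbrs.fit(data, self.data_jk_indices)
self.random_jk_indices = nbrs.predict(randoms)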
def nearest_neighbors_impute(df, coordinate_columns, data_columns, knr_params={}):
    from sklearn.neighbors import KNeighborsRegressor
    for column in data_columns:
        not_null = df[column].notnull()
        if (~not_null).sum() == 0:
            continue
        knr = KNeighborsRegressor(**knr_params)
        knr.fit(df.loc[not_null, coordinate_columns], df.loc[not_null, [column]])
        predicted = knr.predict(df.loc[~not_null, coordinate_columns])
        df.loc[(~not_null), [column]] = predicted
def compute_mse(regressor, horizon):
    # get wind park and corresponding target. forecast is for the target
    # turbine
    park_id = NREL.park_id['tehachapi']
    windpark = NREL().get_windpark(park_id, 3, 2004, 2005)
    target = windpark.get_target()

    # use power mapping for pattern-label mapping. Feature window length
    # is 3 time steps and time horizon (forecast) is 3 time steps.
    feature_window = 3
    mapping = PowerMapping()
    X = mapping.get_features_park(windpark, feature_window, horizon)
    Y = mapping.get_labels_turbine(target, feature_window, horizon)

    # train roughly for the year 2004.
    train_to = int(math.floor(len(X) * 0.5))
    # test roughly for the year 2005.
    test_to = len(X)
    # train and test only every fifth pattern, for performance.
    train_step, test_step = 5, 5

    if regressor == 'linear':
        # fitting the pattern-label pairs
        reg = linear_model.LinearRegression()
        reg = reg.fit(X[0:train_to:train_step], Y[0:train_to:train_step])
        y_hat = reg.predict(X[train_to:test_to:test_step])
    elif regressor == 'knn':
        k_neighbors = 10
        # weights is keyword-only in current scikit-learn.
        reg = KNeighborsRegressor(k_neighbors, weights='uniform')
        # fitting the pattern-label pairs
        reg = reg.fit(X[0:train_to:train_step], Y[0:train_to:train_step])
        y_hat = reg.predict(X[train_to:test_to:test_step])
    else:
        raise Exception("No regressor set.")

    # naive is also known as the persistence model.
    naive_hat = zeros(len(y_hat), dtype=float32)
    for i in range(0, len(y_hat)):
        # the naive label is the label as it was horizon time steps before.
        # we have to consider to use only every fifth label here, too.
        naive_hat[i] = Y[train_to + (i * test_step) - horizon]

    # computing the mean squared errors of linear and naive prediction.
    mse_y_hat, mse_naive_hat = 0, 0
    for i in range(0, len(y_hat)):
        y = Y[train_to + (i * test_step)]
        mse_y_hat += (y_hat[i] - y) ** 2
        mse_naive_hat += (naive_hat[i] - y) ** 2
    mse_y_hat /= float(len(y_hat))
    mse_naive_hat /= float(len(y_hat))

    return mse_y_hat, mse_naive_hat
def knn(X, Y):
    neigh = KNeighborsRegressor()
    neigh.fit(X, Y)

    def explore(x):
        score = -1 * neigh.predict([x])
        return score

    minimized = differential_evolution(explore,
                                       ((0, 1), (0, 1), (0, 1), (0, 1), (0, 1)))
    return {
        'X_min': list(minimized.x),
        'score': neigh.score(X, Y)
    }
def kNN(X, Y=[], k=2, algorithm="brute", radius=0.65,
        filename='graph-output.pdf', do_regression=True):
    graph = pydot.Dot(graph_type='digraph')
    knn_model = NearestNeighbors(n_neighbors=k, algorithm=algorithm)
    nbrs = knn_model.fit(X)
    print('-' * 80)
    indices = nbrs.kneighbors(X, 2, return_distance=False)
    radius_indices = nbrs.radius_neighbors(X, radius, return_distance=False)
    k_mapping = zip(X, indices, radius_indices)
    print('-' * 80)
    nodes, misses, hits = {}, [], []
    for i, kmap in enumerate(k_mapping):
        sample = kmap[0]
        nneigh = kmap[1]
        rneigh = kmap[2]
        ypred = Y[nneigh[1]]
        ytrue = Y[i]
        nodes[i] = pydot.Node(str(i))
        if ypred != ytrue:
            misses.append(i)
        else:
            hits.append(i)
        for j in nneigh:
            if j not in nodes:
                nodes[j] = pydot.Node(str(j))
            if i == j:
                color = "black"
                if ytrue != ypred:
                    color = "red"
                label = "%s: %s" % (ypred, ytrue)
                graph.add_edge(pydot.Edge(str(i), str(j), label=label,
                                          labelfontcolor="#009933",
                                          fontsize="7.0", color=color))
            if i != j:
                color = "black"
                if ypred != Y[j]:
                    color = "red"
                label = "%s: %s" % (ypred, Y[j])
                graph.add_edge(pydot.Edge(str(i), str(j), label=label,
                                          labelfontcolor="#009933",
                                          fontsize="7.0", color=color))
        print("%s : %s... \n\t%s:%s \n\t%s \n\t%s"
              % (i, sample[:10], ypred, ytrue, nneigh, rneigh))
    print('-' * 80)
    graph.write_pdf(filename)
    print("[%s] misses: %s" % (len(misses), misses))
    print("[%s] hits: %s" % (len(hits), hits))
    # yp must exist for the return below even when regression is skipped.
    yp = None
    if do_regression:
        neigh = KNeighborsRegressor(n_neighbors=2)
        neigh.fit(X, Y)
        yp = neigh.predict(X + X * random.random() * .1)
        m = len(yp)
        mse = sum([(x - y) * (x - y) for (x, y) in zip(Y, yp) if x - y]) / float(m)
        print("REGRESSION mean squared error: ", mse)
        print("REGRESSION mse(i)!=0: ",
              [("%s:" % i, "%.5f" % ((x - y) * (x - y) / float(m)))
               for (i, x, y) in zip(range(m), Y, yp) if x - y])
    return nbrs, yp
def k_nearest_neighbor(self, X, y):
    # Renamed from k_nearest_neigbor (typo); parameters renamed from
    # input/output to avoid shadowing the built-in input().
    model = KNeighborsRegressor(n_neighbors=2, p=1)
    model.fit(X, y)
    return model
plt.ylabel('Actual Income')
plt.title('Multiple Linear Regression Results')
plt.show()

lr_coefs = pd.DataFrame({'col': list(features.columns),
                         'coef': lr.coef_,
                         'abs_coef': abs(lr.coef_)}).sort_values(
                             by='abs_coef', ascending=False).reset_index(drop=True)
lr_coefs

from sklearn.neighbors import KNeighborsRegressor

k_opt = 0
max_score = 0
scores = []
print('Iteration: k = ', end='')
for k in range(1, 21):
    print('{}, '.format(k), end='', flush=True)
    knr = KNeighborsRegressor(n_neighbors=k, weights='distance')
    knr.fit(train_features, train_targets)
    score = knr.score(test_features, test_targets)
    scores.append(score)
    if abs(score) > abs(max_score):
        k_opt = k
        max_score = score

fig, ax = plt.subplots(figsize=(10, 7))
plt.plot(range(1, 21), scores)
plt.xlabel('n_neighbors (k)')
plt.ylabel('R2 Score')
plt.title('K Neighbors Regressor')
ax.annotate('k={}, R2={}'.format(k_opt, max_score), (k_opt, scores[k_opt - 1]))
plt.show()
# In[11]:

# Decision Tree Regressor
reg = DecisionTreeRegressor().fit(X_train, y_train)
y_train_pred = reg.predict(X_train)
y_test_pred = reg.predict(X_test)
tr_err = mean_squared_error(y_train_pred, y_train)
ts_err = mean_squared_error(y_test_pred, y_test)
print(tr_err, ts_err)

# In[12]:

# KNN
reg = KNeighborsRegressor()
params = {'kneighborsregressor__n_neighbors': [1, 5, 10, 20, 25]}
pipe = make_pipeline(reg)
# 'mean_squared_error' was renamed to 'neg_mean_squared_error' in scikit-learn,
# and the iid argument was removed in 0.24.
grid = GridSearchCV(pipe, param_grid=params, scoring='neg_mean_squared_error',
                    n_jobs=-1, cv=5)
reg = grid.fit(X_train, y_train)
print('Best MSE: ', grid.best_score_)
print('Best Parameters: ', grid.best_estimator_)
y_train_pred = reg.predict(X_train)
y_test_pred = reg.predict(X_test)
tr_err = mean_squared_error(y_train_pred, y_train)
ts_err = mean_squared_error(y_test_pred, y_test)
print(tr_err, ts_err)

# In[13]:
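The `kneighborsregressor__n_neighbors` key above follows scikit-learn's `<step name>__<parameter>` convention for pipeline parameters; a minimal sketch:

from sklearn.pipeline import make_pipeline
from sklearn.neighbors import KNeighborsRegressor

pipe = make_pipeline(KNeighborsRegressor())
# make_pipeline names each step after its lowercased class name,
# so the grid-search key is 'kneighborsregressor__n_neighbors'.
pipe.set_params(kneighborsregressor__n_neighbors=10)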
# read in csv files
X_train = pd.read_csv('train.csv')
y_train = pd.read_csv('y_train.csv', names=['price'])
X_test = pd.read_csv('test.csv')
y_test = pd.read_csv('y_test.csv', names=['price'])

# some exploratory data analyses
print(X_train.shape, y_train.shape)
print(X_test.shape, y_test.shape)
#print(X_train.info())
#print(X_train.describe())
#print(X_train.head())

# preprocess data
X_train = preprocessing.scale(X_train)
X_test = preprocessing.scale(X_test)

# fit regressor to training data
knn_reg = KNeighborsRegressor(n_neighbors=5)
knn_reg.fit(X_train, y_train)

# make predictions
y_pred = knn_reg.predict(X_test)

# compute metrics
regression_results(y_test, y_pred)
def stack_NN_model(X, y, nn_obj, n, mod, cb=cb, verbose=True, crit="mse", **kwargs):
    X = np.copy(X)
    y = np.copy(y)

    permute_indices = np.random.permutation(np.arange(len(y)))
    X = X[permute_indices]
    y = y[permute_indices]

    X = nn_obj.predict(X)
    xtr, xte, ytr, yte = train_test_split(X, y, random_state=0)

    if nn_obj.err_type == "AVL":
        crit = "mae"
    else:
        crit = "mse"

    if mod == "RF":
        try:
            model = RandomForestRegressor(n_estimators=n,
                                          max_depth=kwargs["max_depth"],
                                          criterion=crit)
        except KeyError:
            model = RandomForestRegressor(n_estimators=n, criterion=crit)
    else:
        model = KNeighborsRegressor(n_neighbors=n)

    model = model.fit(xtr, ytr.ravel())
    print("{} with {}: Score = {}".format(mod, n, model.score(xte, yte.ravel())))

    # The original format string had five placeholders for six arguments;
    # one was added so the scaler appears in the label.
    dev_lab = "{}_augmented_Pred_lr_{}_{}_{}_{}_Maxepoch_{}"\
              .format(mod, nn_obj.lr, nn_obj.train_mod, nn_obj.activation,
                      nn_obj.scaler, nn_obj.max_epoch)

    test_line = range(len(xte))
    mod_test_preds = model.predict(xte)
    deviation = np.array([abs(mod_test_preds[j] - yte[j]) for j in test_line])
    error_estimation = sum(deviation)

    # (figure titles and messages translated from French)
    if plt.fignum_exists("Stacking comparison on the test set"):
        plt.figure("Stacking comparison on the test set")
        plt.plot(test_line, mod_test_preds, label=dev_lab, marker='+',
                 fillstyle='none', linestyle='none', c=nn_obj.kwargs["color"])
    else:
        plt.figure("Stacking comparison on the test set")
        plt.plot(test_line, yte, label="Expected value", marker='o',
                 fillstyle='none', linestyle='none', c='k')
        plt.plot(test_line, mod_test_preds, label=dev_lab, marker='+',
                 fillstyle='none', linestyle='none', c=nn_obj.kwargs["color"])
    plt.legend(loc="best", prop={'size': 7})

    if plt.fignum_exists("Stacking deviation of the prediction"):
        plt.figure("Stacking deviation of the prediction")
        plt.plot(yte, mod_test_preds, c=nn_obj.kwargs["color"], marker='o',
                 linestyle='none', label=dev_lab, ms=3)
    else:
        plt.figure("Stacking deviation of the prediction")
        plt.plot(yte, yte, c='navy', marker='+', label="wanted value")
        plt.plot(yte, mod_test_preds, c=nn_obj.kwargs["color"], marker='o',
                 linestyle='none', label=dev_lab, ms=3)
    plt.legend(loc="best", prop={'size': 7})

    print("Result after stacking NN_%s, estimators or neighbors = %d" % (mod, n))
    print("Activation function: {}\nCost function: {}\n"
          "Optimization method: {}".format(nn_obj.activation, nn_obj.err_type,
                                           nn_obj.train_mod))
    print("Mean of the summed deviations on the test set = {}\n".format(error_estimation))

    plt.show()
    return model
    # (tail of a get_best_rnn_radius-style helper; its def was truncated above)
    best_radius = radii[np.argmin(mae_rnn)]

    fig, ax = plt.subplots()
    ax.set_title('Parameter evaluation for RNN')
    ax.set_xlabel('Radius')
    ax.set_ylabel('Mean absolute error')
    ax.set_xlim(low, high)
    ax.set_xticks(list(ax.get_xticks()) + [best_radius])
    ax.plot(radii, mae_rnn, c='orange', linewidth=2)
    fig.savefig('rnn_param.png')
    return best_radius


knn_regressor = KNeighborsRegressor(n_neighbors=get_best_knn_n_neighbors(1, 100),
                                    weights='distance')
knn_regressor.fit(train_df[['temperatura', 'vacuo']], train_df[['energia']])

rnn_regressor = RadiusNeighborsRegressor(radius=get_best_rnn_radius(1.7, 3.0, 0.05),
                                         weights='distance')
rnn_regressor.fit(train_df[['temperatura', 'vacuo']], train_df[['energia']])

lr_regressor = LinearRegression()
lr_regressor.fit(train_df[['temperatura', 'vacuo']], train_df[['energia']])

energia_knn = knn_regressor.predict(test_df[['temperatura', 'vacuo']])
energia_rnn = rnn_regressor.predict(test_df[['temperatura', 'vacuo']])
energia_lr = lr_regressor.predict(test_df[['temperatura', 'vacuo']])
# alg evaluation metrics
# Practice using the Accuracy and LogLoss metrics on a classification problem.
# Practice generating a confusion matrix and a classification report.
# Practice using RMSE and RSquared metrics on a regression problem.
scoring = 'neg_log_loss'
results = cross_val_score(model, X, Y, cv=kfold, scoring=scoring)
print("Logloss: %.3f (%.3f)" % (results.mean(), results.std()))

# Spot-check linear algorithms on a dataset (e.g. linear regression, logistic
# regression and linear discriminant analysis).
# Spot-check some nonlinear algorithms on a dataset (e.g. KNN, SVM and CART).
# Spot-check some sophisticated ensemble algorithms on a dataset (e.g. random
# forest and stochastic gradient boosting).

# KNN Regression
model = KNeighborsRegressor()
scoring = 'neg_mean_squared_error'
results = cross_val_score(model, X, Y, cv=kfold, scoring=scoring)
print("nonlinear regression check " + str(results.mean()))

# Spot-check a nonlinear regression algorithm.
# how to check more models at once
# prepare models
models = []
models.append(('LR', LogisticRegression()))
models.append(('LDA', LinearDiscriminantAnalysis()))

# evaluate each model in turn
results = []
names = []
scoring = 'accuracy'
for name, model in models:
            nan_flag=[],
            zero=[
                'crim', 'zn', 'nox', 'indus', 'rm', 'age', 'tax',
                'ptratio', 'b', 'dis'
            ])
    },
    'score': 5.5088106991425985,
    'std': 0.293662905734789
},
'KNeighborsRegressor': {
    'params': {
        'predictor': KNeighborsRegressor(algorithm='auto', leaf_size=30,
                                         metric='minkowski', metric_params=None,
                                         n_jobs=1, n_neighbors=7, p=2,
                                         weights='uniform'),
        'scaler': RobustScaler(copy=True, quantile_range=(25.0, 75.0),
                               with_centering=True, with_scaling=True),
        'simple_imputer': FillNaTransformer(from_dict={},
                                            mean=[
                                                'crim', 'zn', 'nox', 'indus',
                                                'rm', 'age', 'tax', 'ptratio',
                                                'b', 'dis'
                                            ],
                                            median=[],
def train(self):
    if self.config['method'] == 'regression':
        print('Building regression model')
        print('Fetching data')
        self.get_df_reg()
        print('Data fetched')
        print('Splitting data')
        df_x = self.df_reg.iloc[:, 3:]
        df_y = self.df_reg.iloc[:, 1]
        x_train, x_test, y_train, y_test = train_test_split(df_x, df_y,
                                                            test_size=0.2,
                                                            random_state=1)
        print('Data split')
        print('Size of x_train', x_train.shape)
        print('Size of y_train', y_train.shape)
        print('Size of x_test', x_test.shape)
        print('Size of y_test', y_test.shape)
        if self.config['model'] == 'svr':
            print('Support vector regressor')
            model = SVR(kernel=self.config['svr_kernel'])
        if self.config['model'] == 'knr':
            print('K-nearest neighbors regressor')
            model = KNeighborsRegressor(n_jobs=12)
        if self.config['model'] == 'dtr':
            print('Decision tree regressor')
            model = DecisionTreeRegressor()
        if self.config['model'] == 'rf':
            print('Random forest regressor')
            model = RandomForestRegressor(n_jobs=12)
        if self.config['model'] == 'et':
            print('Extra trees regressor')
            model = ExtraTreesRegressor(n_jobs=12)
        if self.config['model'] == 'gbr':
            print('Gradient boosting regressor')
            model = GradientBoostingRegressor()
        try:
            model
        except BaseException:
            print('Invalid model configuration. Check config.ini')
            return
        model.fit(x_train, y_train)
        pred = pd.Series(model.predict(df_x))
        self.df_reg.insert(2, 'Predicted_current', pred)
        print('R^2 score', model.score(x_test, y_test))
        print('Converting to binary classification')
        y_test_list, y_pred_list, _, _ = self.to_bin_cl(x_test, y_test, model)
        _, _, bin_y, bin_y_pred = self.to_bin_cl(df_x, df_y, model)
        conf_mat = confusion_matrix(y_true=y_test_list, y_pred=y_pred_list)
        print('Converted to binary classification')
        self.df_reg.insert(3, 'Actual_class', bin_y)
        self.df_reg.insert(4, 'Predicted_class', bin_y_pred)
        print('Confusion matrix:\n', conf_mat)
        p = conf_mat[0][0] / (conf_mat[0][0] + conf_mat[1][0])
        r = conf_mat[0][0] / (conf_mat[0][0] + conf_mat[0][1])
        print('Accuracy is',
              np.sum(np.array(y_test_list) == y_pred_list) / len(y_pred_list))
        print('Precision is', p)
        print('Recall is', r)
        print('F1-score is', self.get_f_score(p, r, 1))
        print('F0.5-score is', self.get_f_score(p, r, 0.5))
        print('F2-score is', self.get_f_score(p, r, 2))
        # joblib.dump(model, 'models/' + self.config['model'] + '.model')
        self.save_result()
def fit(self, X_train, y_train, X_val, y_val):
    model = KNeighborsRegressor(n_neighbors=35)
    model.fit(X_train, y_train)
    self.model = model
corr_matrix = df.corr()
corr_matrix['MEDV']

sns.heatmap(corr_matrix)
plt.show()

print(boston['DESCR'])

dat1 = df.loc[:, ['CRIM', 'ZN', 'INDUS', 'NOX', 'RM', 'AGE', 'RAD',
                  'TAX', 'PTRATIO', 'B', 'LSTAT']]
X_train, X_test, y_train, y_test = train_test_split(dat1, target,
                                                    test_size=0.2,
                                                    random_state=42)
y_train = y_train.values.ravel()

models = []
models.append(('SVR', SVR()))
models.append(('KNN', KNeighborsRegressor()))
models.append(('DT', DecisionTreeRegressor()))
models.append(('RF', RandomForestRegressor()))
models.append(('L', Lasso()))
models.append(('EN', ElasticNet()))
models.append(('R', Ridge()))
models.append(('BR', BayesianRidge()))
models.append(('GBR', GradientBoostingRegressor()))
# The original reused the 'RF' label for AdaBoost; 'AB' avoids the collision.
models.append(('AB', AdaBoostRegressor()))
models.append(('ET', ExtraTreesRegressor()))
models.append(('BgR', BaggingRegressor()))

scoring = 'neg_mean_squared_error'
results = []
names = []
from tpot.builtins import StackingEstimator
from xgboost import XGBRegressor

# NOTE: Make sure that the outcome column is labeled 'target' in the data file
tpot_data = pd.read_csv('PATH/TO/DATA/FILE', sep='COLUMN_SEPARATOR', dtype=np.float64)
features = tpot_data.drop('target', axis=1)
training_features, testing_features, training_target, testing_target = \
    train_test_split(features, tpot_data['target'], random_state=None)

# Average CV score on the training set was: -3.354881802745745
exported_pipeline = make_pipeline(
    SelectPercentile(score_func=f_regression, percentile=92),
    StackingEstimator(
        estimator=KNeighborsRegressor(n_neighbors=48, p=1, weights="uniform")),
    StackingEstimator(estimator=XGBRegressor(learning_rate=0.001, max_depth=1,
                                             min_child_weight=3,
                                             n_estimators=100, n_jobs=1,
                                             objective="reg:squarederror",
                                             subsample=0.9500000000000001,
                                             verbosity=0)),
    MinMaxScaler(),
    StackingEstimator(estimator=SGDRegressor(alpha=0.01, eta0=0.01,
                                             fit_intercept=False, l1_ratio=0.0,
                                             learning_rate="constant",
                                             loss="huber", penalty="elasticnet",
def _select_estimator(estimator, n_jobs, n_estimators, random_state=None):
    '''Select estimator and parameters from argument name.'''
    # Regressors
    if estimator == 'RandomForestRegressor':
        param_dist = {**parameters['ensemble'], **parameters['bootstrap']}
        estimator = RandomForestRegressor(n_jobs=n_jobs,
                                          n_estimators=n_estimators,
                                          random_state=random_state)
    elif estimator == 'ExtraTreesRegressor':
        param_dist = {**parameters['ensemble'], **parameters['bootstrap']}
        estimator = ExtraTreesRegressor(n_jobs=n_jobs,
                                        n_estimators=n_estimators,
                                        random_state=random_state)
    elif estimator == 'GradientBoostingRegressor':
        param_dist = parameters['ensemble']
        estimator = GradientBoostingRegressor(n_estimators=n_estimators,
                                              random_state=random_state)
    elif estimator == 'SVR':
        param_dist = {**parameters['svm'], 'epsilon': [0.0, 0.1]}
        estimator = SVR(kernel='rbf')
    elif estimator == 'LinearSVR':
        param_dist = {**parameters['svm'], 'epsilon': [0.0, 0.1]}
        # Note: this branch builds SVR(kernel='linear') rather than
        # sklearn.svm.LinearSVR, mirroring the original code.
        estimator = SVR(kernel='linear')
    elif estimator == 'Ridge':
        param_dist = parameters['linear']
        estimator = Ridge(solver='auto', random_state=random_state)
    elif estimator == 'Lasso':
        param_dist = parameters['linear']
        estimator = Lasso(random_state=random_state)
    elif estimator == 'ElasticNet':
        param_dist = parameters['linear']
        estimator = ElasticNet(random_state=random_state)
    elif estimator == 'KNeighborsRegressor':
        param_dist = parameters['kneighbors']
        estimator = KNeighborsRegressor(algorithm='auto')
    # Classifiers
    elif estimator == 'RandomForestClassifier':
        param_dist = {**parameters['ensemble'], **parameters['bootstrap'],
                      **parameters['criterion']}
        estimator = RandomForestClassifier(n_jobs=n_jobs,
                                           n_estimators=n_estimators,
                                           random_state=random_state)
    elif estimator == 'ExtraTreesClassifier':
        param_dist = {**parameters['ensemble'], **parameters['bootstrap'],
                      **parameters['criterion']}
        estimator = ExtraTreesClassifier(n_jobs=n_jobs,
                                         n_estimators=n_estimators,
                                         random_state=random_state)
    elif estimator == 'GradientBoostingClassifier':
        param_dist = parameters['ensemble']
        estimator = GradientBoostingClassifier(n_estimators=n_estimators,
                                               random_state=random_state)
    elif estimator == 'LinearSVC':
        param_dist = parameters['linear_svm']
        estimator = LinearSVC(random_state=random_state)
    elif estimator == 'SVC':
        param_dist = parameters['svm']
        estimator = SVC(kernel='rbf', random_state=random_state)
    elif estimator == 'KNeighborsClassifier':
        param_dist = parameters['kneighbors']
        estimator = KNeighborsClassifier(algorithm='auto')

    return param_dist, estimator
for i in range(0, 32):
    data['file_name'] = data['file_name'].replace('File_' + str(i), i)
data.drop('time', axis=1, inplace=True)

#=================================================================
# using the cross_val_predict function
Feature = data[['week', 'day_of_week', 'start_time', 'work_flow', 'file_name']]
Result = data['size']

#========== using the best parameter we found
# using best parameter, RMSE is: 0.0129735729554
knn = KNeighborsRegressor(n_neighbors=2, p=1, weights='distance')
knn.fit(Feature, Result)
predicted_target = cross_val_predict(knn, Feature, Result, cv=10)
print('using best parameter, RMSE is: ')
print(sp.sqrt(mean_squared_error(predicted_target, Result)))

fig, ax = plt.subplots()
ax.scatter(range(0, len(Result)), Result, c='b', s=8, label='true value')
ax.scatter(range(0, len(predicted_target)), predicted_target, c='r', s=8,
           label='fitted value')
row.DecisionTreeMSE = metrics.mean_squared_error(y_test, y_pred)

## Random Forest
regr = RandomForestRegressor(max_depth=2)
regr.fit(X_train, y_train)
y_pred = regr.predict(X_test)
row.RandomForestMSE = metrics.mean_squared_error(y_test, y_pred)

### Boosting
params = {'n_estimators': 100, 'max_depth': 2}
clf = GradientBoostingRegressor(**params)
clf.fit(X_train, y_train)
# The original skipped this predict call, so the boosting row silently reused
# the random-forest predictions.
y_pred = clf.predict(X_test)
row.BoostingMSE = metrics.mean_squared_error(y_test, y_pred)

### KNN
neigh = KNeighborsRegressor(n_neighbors=3)
neigh.fit(X_train, y_train)
y_pred = neigh.predict(X_test)
row.KNeighbourMSE = metrics.mean_squared_error(y_test, y_pred)

### SVR
svr = SVR(gamma='auto')
svr = svr.fit(X_train, y_train.values.ravel())
y_pred = svr.predict(X_test)
row.SVR_MSE = metrics.mean_squared_error(y_test, y_pred)

result = result.append(row.toDict(), ignore_index=True)
result

# %% [markdown]
# MAGIC %md Read the dataset using the `fetch_california_housing` function and then split it into train and test using the `train_test_split` function.

# COMMAND ----------

dataset = fetch_california_housing()
X_full, y_full = dataset.data, dataset.target
X_train, X_test, y_train, y_test = train_test_split(X_full, y_full,
                                                    test_size=0.2,
                                                    random_state=20)

# COMMAND ----------

# MAGIC %md Here we use a k-nearest neighbors regressor as part of a pipeline that includes scaling, and for the purposes of comparison, a knn regressor trained on the unscaled data has been provided in the following code cell.

# COMMAND ----------

steps = [('scaler', StandardScaler()), ('knn', KNeighborsRegressor())]
pipeline = Pipeline(steps)

# COMMAND ----------

# MAGIC %md Fit the pipeline using `X_train` as training data and `y_train` as target values, and pass the computed parameters to an object `knn_scaled`. Also, fit a knn regressor using unscaled training data and pass the computed parameters to the object `knn_unscaled`.

# COMMAND ----------

knn_scaled = pipeline.fit(X_train, y_train)
knn_unscaled = KNeighborsRegressor().fit(X_train, y_train)

# COMMAND ----------

# MAGIC %md Compute and print metrics.
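# COMMAND ----------

# MAGIC %md A possible metrics cell for the step above (an assumption, not from the original notebook): compare the R^2 score of the scaled pipeline with the unscaled regressor.

# COMMAND ----------

print('R^2 with scaling: {}'.format(knn_scaled.score(X_test, y_test)))
print('R^2 without scaling: {}'.format(knn_unscaled.score(X_test, y_test)))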
n_faces = 5
rng = check_random_state(4)
face_ids = rng.randint(test.shape[0], size=(n_faces, ))
test = test[face_ids, :]

n_pixels = data.shape[1]
# Slice indices must be ints (NumPy no longer accepts float indices).
X_train = train[:, :int(np.ceil(0.5 * n_pixels))]   # Upper half of the faces
y_train = train[:, int(np.floor(0.5 * n_pixels)):]  # Lower half of the faces
X_test = test[:, :int(np.ceil(0.5 * n_pixels))]
y_test = test[:, int(np.floor(0.5 * n_pixels)):]

# Fit estimators
ESTIMATORS = {
    "Extra trees": ExtraTreesRegressor(n_estimators=10, max_features=32,
                                       random_state=0),
    "K-nn": KNeighborsRegressor(),
    "Linear regression": LinearRegression(),
    "Ridge": RidgeCV(),
}

y_test_predict = dict()
for name, estimator in ESTIMATORS.items():
    estimator.fit(X_train, y_train)
    y_test_predict[name] = estimator.predict(X_test)

# Plot the completed faces
image_shape = (64, 64)
n_cols = 1 + len(ESTIMATORS)
plt.figure(figsize=(2. * n_cols, 2.26 * n_faces))
plt.suptitle("Face completion with multi-output estimators", size=16)
def nnr(datapath):
    # load mat
    datafile = os.path.join(datapath, 'data_numpy.mat')
    if not os.path.exists(datafile):
        print('Data file %s not found.' % datafile)
        # Bail out early; the original fell through to loadmat anyway.
        return
    data_numpy = sio.loadmat(datafile)

    # get training and test data
    train_x_raw = data_numpy['trainX_raw']
    train_x_smooth = data_numpy['trainX_smooth']
    train_y = data_numpy['trainY']
    test_x_raw = data_numpy['testX_raw']
    test_x_smooth = data_numpy['testX_smooth']
    test_y = data_numpy['testY']
    base_y = data_numpy['baseY']
    train_y = train_y.ravel()

    t_start = time.perf_counter()
    x_fft = np.fft.fft(train_x_raw)
    raw_fft_time = time.perf_counter() - t_start
    train_x_raw_fft = np.concatenate((np.imag(x_fft), np.real(x_fft)), axis=1)
    x_fft = np.fft.fft(test_x_raw)
    test_x_raw_fft = np.concatenate((np.imag(x_fft), np.real(x_fft)), axis=1)

    t_start = time.perf_counter()
    x_fft = np.fft.fft(train_x_smooth)
    smooth_fft_time = time.perf_counter() - t_start
    train_x_smooth_fft = np.concatenate((np.imag(x_fft), np.real(x_fft)), axis=1)
    x_fft = np.fft.fft(test_x_smooth)
    test_x_smooth_fft = np.concatenate((np.imag(x_fft), np.real(x_fft)), axis=1)

    # NNR on raw data stream
    neighbor_num = 10
    nnr_raw = KNeighborsRegressor(n_neighbors=neighbor_num, weights='distance')
    t_start = time.perf_counter()
    nnr_raw.fit(train_x_raw, train_y)
    nnr_raw_time = time.perf_counter() - t_start
    pred_y = nnr_raw.predict(test_x_raw)
    np.savetxt(os.path.join(datapath, 'nnr_raw.txt'), pred_y)

    nnr_raw_fft = KNeighborsRegressor(n_neighbors=neighbor_num, weights='distance')
    t_start = time.perf_counter()
    nnr_raw_fft.fit(train_x_raw_fft, train_y)
    nnr_raw_fft_time = time.perf_counter() - t_start
    pred_y = nnr_raw_fft.predict(test_x_raw_fft)
    np.savetxt(os.path.join(datapath, 'nnr_raw_fft.txt'), pred_y)

    nnr_smooth = KNeighborsRegressor(n_neighbors=neighbor_num, weights='distance')
    t_start = time.perf_counter()
    nnr_smooth.fit(train_x_smooth, train_y)
    nnr_smooth_time = time.perf_counter() - t_start
    pred_y = nnr_smooth.predict(test_x_smooth)
    np.savetxt(os.path.join(datapath, 'nnr_smooth.txt'), pred_y)

    nnr_smooth_fft = KNeighborsRegressor(n_neighbors=neighbor_num, weights='distance')
    t_start = time.perf_counter()
    nnr_smooth_fft.fit(train_x_smooth_fft, train_y)
    nnr_smooth_fft_time = time.perf_counter() - t_start
    pred_y = nnr_smooth_fft.predict(test_x_smooth_fft)
    np.savetxt(os.path.join(datapath, 'nnr_smooth_fft.txt'), pred_y)

    with open(os.path.join(datapath, 'nnr_time.txt'), 'w') as f_time:
        f_time.write(str(raw_fft_time) + '\n')
        f_time.write(str(smooth_fft_time) + '\n')
        f_time.write(str(nnr_raw_time) + '\n')
        f_time.write(str(nnr_raw_fft_time) + '\n')
        f_time.write(str(nnr_smooth_time) + '\n')
        f_time.write(str(nnr_smooth_fft_time) + '\n')
#K Nearest Neighbour
gnlist = list(knn_grid['Gene_Name'])
# Make sure the gene exists in the KNN best-grid dataframe.
if gene_name in gnlist:
    k = knn_grid[knn_grid['Gene_Name'] == gene_name].iloc[0, 3]
    weight = knn_grid[knn_grid['Gene_Name'] == gene_name].iloc[0, 4]
    knn = KNeighborsRegressor(n_neighbors=k, weights=weight)
    knn.fit(cis_gt, adj_exp.ravel())
    ypred = knn.predict(test_cis_gt)

    # write out ypred quickly
    outfile = (output + trn_pop + "_2_" + tst_pop.upper() + "_chr" + chrom +
               "_chunk" + chunk + "_knn.txt")
    with open(outfile, "a") as out:
        out.write("\n")
        out.write(str(gene))
        for j in range(len(ypred)):
            out.write("\t" + str(ypred[j]))
"Dataset3/4/(b)/Neural", verbose=False) if RMSE_test_cur < RMSE_test_best: RMSE_test_best = RMSE_test_cur a_best = a h_best = h print() model_neural = MLPRegressor((h_best, ), activation=a_best) print_performance_log(model_neural, X_1b, y, "Dataset3/4/(b)/Neural") print("Best activation: {}".format(a_best)) print("Best number of hidden layers: {}".format(h_best)) neighbors = range(1, 51) n_best = None RMSE_test_best = float('inf') for i, n in enumerate(neighbors): progressBar(i + 1, len(neighbors)) model_knn = KNeighborsRegressor(n_neighbors=n) RMSE_test_cur = print_performance(model_knn, X_1b, y, "Dataset3/4/(b)/KNN", verbose=False) if RMSE_test_cur < RMSE_test_best: RMSE_test_best = RMSE_test_cur n_best = n print() model_knn = KNeighborsRegressor(n_neighbors=n_best) print_performance(model_knn, X_1b, y, "Dataset3/4/(b)/KNN") print("Best number of neighbors: {}".format(n_best))
    'neighbourhood_group', 'neighbourhood', 'room_type', 'availability_365',
    'vacancy'
], axis=1)

# Drop price outliers beyond three standard deviations.
df = df[np.abs(df.price - df.price.mean()) <= (3 * df.price.std())]
x = df.drop('price', axis=1)
y = df.price
x_train, x_test, y_train, y_test = train_test_split(x, y, test_size=0.2,
                                                    random_state=10)

#%% applying regression models
# define models
lasso = Lasso(alpha=0.5)
knn1 = KNeighborsRegressor()
lr = LinearRegression()
svr = SVR(kernel='linear', gamma='auto')

# fitting models
lasso.fit(x_train, y_train)
svr.fit(x_train, y_train)
lr.fit(x_train, y_train)
knn1.fit(x_train, y_train)

#%% model evaluation
mae_ln = mean_absolute_error(y_train, lr.predict(x_train))
mae_knn1 = mean_absolute_error(y_train, knn1.predict(x_train))
mae_svr = mean_absolute_error(y_train, svr.predict(x_train))
mae_lasso = mean_absolute_error(y_train, lasso.predict(x_train))
print(" training mae = ", mae_ln, mae_lasso, mae_svr, mae_knn1)

#%% on testing data
# z = np.polyfit(x, y, deg=3)
# p = np.poly1d(z)
# ## Plot
# xp = np.linspace(-2, 6, 100)
# plt.figure(figsize=(6.5,4))
# plt.plot(x,y,'o',label='data')
# plt.plot(xp, p(xp),label='polyfit')
# plt.show()

import numpy as np
from sklearn.neighbors import KNeighborsRegressor
import matplotlib.pyplot as plt

x = np.array([[0.0], [1.0], [2.0], [3.0], [4.0], [5.0]])
y = np.array([0.0, 0.8, 0.9, 0.1, -0.8, -1.0])

neigh = KNeighborsRegressor(n_neighbors=3)
neigh.fit(x, y)
print(neigh.predict([[3.5]]))

## Plot
xp = np.linspace(-2, 6, 100).reshape(-1, 1)
plt.figure(figsize=(6.5, 4))
plt.plot(x, y, 'o', label='data')
plt.plot(xp, neigh.predict(xp), label='nearest neighbor')
plt.show()
# Set the clf to the best combination of parameters
SVM_model_best = Random_obj.best_estimator_

# Fit the best algorithm to the data.
SVM_model_best.fit(X_train, Y_train)

SVM_model_score = cross_val_score(estimator=SVM_model_best, X=X_train,
                                  y=Y_train, cv=10,
                                  scoring='neg_mean_squared_error')
SVM_model_score = np.sqrt(np.abs(SVM_model_score))
SVM_model_score_mean = SVM_model_score.mean()
SVM_model_score_std = SVM_model_score.std()

# 6. KNN
KNN_model = KNeighborsRegressor()
KNN_model.fit(X_train, Y_train)

KNN_model_score = cross_val_score(estimator=KNN_model, X=X_train, y=Y_train,
                                  cv=10, scoring='neg_mean_squared_error')
KNN_model_score = np.sqrt(np.abs(KNN_model_score))
KNN_model_score_mean = KNN_model_score.mean()
KNN_model_score_std = KNN_model_score.std()

# Choose some parameter combinations to try
parameters = {
    'n_neighbors': np.arange(1, 31, 1),
    'metric': ["minkowski"]
}
    n_estimators=500,
    random_state=12,
    **{
        'max_depth': 5,
        'num_leaves': 60,
        'feature_fraction': '0.8',
        'bagging_fraction': '0.92'
    })

# level0_models['KNN_rougher_a'] = make_pipeline(scaler, KNeighborsRegressor(n_jobs=-1, **{'n_neighbors': 254, 'weights': 'distance', 'leaf_size': 16}))
scaler = make_pipeline(QuantileTransformer(output_distribution='normal'),
                       PCA(whiten=True))
level0_models_rougher['KNN_rougher_b'] = make_pipeline(
    scaler,
    KNeighborsRegressor(n_jobs=-1, **{
        'n_neighbors': 50,
        'weights': 'distance',
        'leaf_size': 18
    }))
level0_models_rougher['KNN_rougher_c'] = make_pipeline(
    scaler,
    KNeighborsRegressor(n_jobs=-1, **{
        'n_neighbors': 15,
        'weights': 'distance',
        'leaf_size': 30  # the original passed 30.0, but leaf_size must be an int
    }))
level0_models_rougher['KNN_rougher_d'] = make_pipeline(
    scaler,
    KNeighborsRegressor(n_jobs=-1, **{
        'n_neighbors': 5,
from sklearn.neighbors import KNeighborsRegressor

from src.config import PP_DICT, FULL_DATA_DICT
from src.model.utils import train_test_model

params = {'n_neighbors': 46, 'p': 1, 'weights': 'distance'}
pp_dict = PP_DICT
data_dict = FULL_DATA_DICT

pipeline, m_err, r2 = train_test_model(KNeighborsRegressor(), params,
                                       data_dict, pp_dict, save_model=True)
# same result with .iloc and .loc
dc_listings = dc_listings.iloc[numpy.random.permutation(len(dc_listings))]
split_one = dc_listings[0:1862]
split_two = dc_listings[1862:]

## 2. Holdout Validation ##
from sklearn.neighbors import KNeighborsRegressor
from sklearn.metrics import mean_squared_error

train_one = split_one
test_one = split_two
train_two = split_two
test_two = split_one

knn = KNeighborsRegressor(n_neighbors=5, algorithm='auto')
## here X must be of shape [n_samples, n_features]
## train_one['accommodates'] is a Series and has the weird shape [1862, ]
## train_one[['accommodates']] is a DataFrame and has the correct shape [1862, 1]
knn.fit(train_one[['accommodates']], train_one['price'])
prediction = knn.predict(test_one[['accommodates']])
iteration_one_mse = mean_squared_error(prediction, test_one['price'])
iteration_one_rmse = iteration_one_mse ** (1 / 2)

knn.fit(train_two[['accommodates']], train_two['price'])
prediction = knn.predict(test_two[['accommodates']])
iteration_two_mse = mean_squared_error(prediction, test_two['price'])
iteration_two_rmse = iteration_two_mse ** (1 / 2)

avg_rmse = numpy.mean([iteration_one_rmse, iteration_two_rmse])
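The Series-versus-DataFrame shape distinction noted in the comments above, shown directly (a tiny illustrative check, not part of the original):

import pandas as pd

toy = pd.DataFrame({'accommodates': [2, 4, 3]})
print(toy['accommodates'].shape)    # (3,)   -- 1-D Series
print(toy[['accommodates']].shape)  # (3, 1) -- 2-D DataFrame, what sklearn expects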
validation_size = 0.20
seed = 7
X_train, X_validation, Y_train, Y_validation = train_test_split(
    X, Y, test_size=validation_size, random_state=seed)

# Test options and evaluation metric
num_folds = 10
seed = 7
scoring = 'neg_mean_squared_error'

# Spot-check algorithms
models = []
models.append(('LR', LinearRegression()))
models.append(('LASSO', Lasso()))
models.append(('EN', ElasticNet()))
models.append(('KNN', KNeighborsRegressor()))
models.append(('CART', DecisionTreeRegressor()))
models.append(('SVR', SVR()))

# Evaluate each model in turn
results = []
names = []
for name, model in models:
    # shuffle=True is required when passing random_state to KFold in
    # current scikit-learn.
    kfold = KFold(n_splits=num_folds, shuffle=True, random_state=seed)
    cv_results = cross_val_score(model, X_train, Y_train, cv=kfold,
                                 scoring=scoring)
    results.append(cv_results)
    names.append(name)
best_leaf_size = gs.best_params_['leaf_size']
best_weights = gs.best_params_['weights']
best_p = gs.best_params_['p']

outF = open("output.txt", "w")
print('best_algorithm = ', best_algorithm, file=outF)
print('best_n_neighbors = ', best_n_neighbors, file=outF)
print('best_leaf_size = ', best_leaf_size, file=outF)
print('best_weights = ', best_weights, file=outF)
print('best_p = ', best_p, file=outF)
print('R2 score is {}'.format(test_score_r2))
outF.close()

kn = KNeighborsRegressor(n_neighbors=best_n_neighbors,
                         algorithm=best_algorithm,
                         leaf_size=best_leaf_size,
                         weights=best_weights,
                         p=best_p)

t0 = time.time()
kn.fit(x_train, y_train.ravel())
kn_fit = time.time() - t0
print("kNN complexity and bandwidth selected and model fitted in %.6f s" % kn_fit)

t0 = time.time()
y_kn = kn.predict(x_test)
kn_predict = time.time() - t0
print("kNN prediction for %d inputs in %.6f s" % (x_test.shape[0], kn_predict))

# open a file to append
face_ids = rng.randint(test.shape[0], size=(n_faces, ))
test = test[face_ids, :]

n_pixels = data.shape[1]
X_train = train[:, :int(np.ceil(0.5 * n_pixels))]   # Upper half of the faces
y_train = train[:, int(np.floor(0.5 * n_pixels)):]  # Lower half of the faces
X_test = test[:, :int(np.ceil(0.5 * n_pixels))]
y_test = test[:, int(np.floor(0.5 * n_pixels)):]

# Fit estimators
ESTIMATORS = {
    "Extra trees": ExtraTreesRegressor(n_estimators=10, max_features=32,
                                       random_state=0),
    "K-nn": KNeighborsRegressor(),
    "Linear regression": LinearRegression(),
    "Ridge": RidgeCV(),
    "Lasso": Lasso(),
    # "ElasticNet_0.5": ElasticNet(alpha=100000, l1_ratio=0.001),
    # "ElasticNet_0.1": ElasticNet(alpha=0.0001, l1_ratio=0.01),
}

y_test_predict = dict()
r2_scores = dict()
from numpy import load, save, zeros, nan_to_num
from sklearn.neighbors import KNeighborsRegressor
from open3d import read_point_cloud

vox1 = read_point_cloud('vox1.ply')
neigh = KNeighborsRegressor(1, n_jobs=-1)

pca = nan_to_num(load('pca_1.npy'))
pca_concat = zeros((len(pca), 9))
pca_concat[:, :3] = pca

vox = read_point_cloud('vox2.ply')
pca = nan_to_num(load('pca_2.npy'))
neigh.fit(vox.points, pca)
pca_concat[:, 3:6] = neigh.predict(vox1.points)

vox = read_point_cloud('vox4.ply')
pca = nan_to_num(load('pca_4.npy'))
neigh.fit(vox.points, pca)
pca_concat[:, 6:] = neigh.predict(vox1.points)

save('pca', pca_concat)