def fit_KNeighbors(features_train, labels_train, features_pred, n_neighbors=5):
    model = KNeighborsRegressor(n_neighbors=n_neighbors)
    model.fit(features_train, labels_train)
    labels_pred = model.predict(features_pred)
    # Note: this reports the training-set R^2, not the quality of labels_pred.
    score = model.score(features_train, labels_train)
    print("KNeighbors - coefficient of determination R^2 of the prediction: ", score)
    return labels_pred
def knn(X, Y):
    neigh = KNeighborsRegressor()
    neigh.fit(X, Y)

    def explore(x):
        # Negate the prediction so that minimizing `explore` maximizes the model
        # output; return a scalar, as differential_evolution expects.
        return -float(neigh.predict([x])[0])

    minimized = differential_evolution(explore, ((0, 1), (0, 1), (0, 1), (0, 1), (0, 1)))
    return {'X_min': list(minimized.x), 'score': neigh.score(X, Y)}
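# A minimal hedged usage sketch for the function above. The bounds tuple implies
# 5-dimensional features in [0, 1]; the data here is synthetic, not from the source.
import numpy as np
from scipy.optimize import differential_evolution
from sklearn.neighbors import KNeighborsRegressor

rng = np.random.default_rng(0)
X_demo = rng.random((50, 5))   # 50 samples, 5 features in [0, 1]
Y_demo = X_demo.sum(axis=1)    # toy target
result = knn(X_demo, Y_demo)
print(result['X_min'], result['score'])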
class PriceModel(object):
    """Regression model (linear regression or k-NN) used to predict future prices."""

    def __init__(self, algorithm='linear_regression'):
        if algorithm == 'knn':
            self.clf = KNeighborsRegressor(n_neighbors=2)
        else:
            self.clf = linear_model.LinearRegression()

    def train(self, X_train, y_train):
        self.clf.fit(X_train, y_train)

    def predict(self, x):
        return self.clf.predict(x)

    def score(self, X_test, y_test):
        return self.clf.score(X_test, y_test)
def _random_search(self, random_iter, x, y):
    # Default values
    n_neighbors = 5
    best_score = -float("inf")  # was -sys.maxint (Python 2)
    if random_iter > 0:
        sys.stdout.write("Do a random search %d times\n" % random_iter)
        # Candidate neighbor counts: powers of two up to half the data, plus 1..10.
        n_list = [1, ]
        while n_list[-1] * 2 < x.shape[0] / 2:
            n_list.append(n_list[-1] * 2)
        n_list.extend(range(1, 11))
        param_dist = {"n_neighbors": n_list}
        param_list = [{"n_neighbors": n_neighbors}, ]
        param_list.extend(list(ParameterSampler(param_dist,
                                                n_iter=random_iter - 1,
                                                random_state=self._rng)))
        for idx, d in enumerate(param_list):
            knn = KNeighborsRegressor(n_neighbors=int(d["n_neighbors"]),
                                      weights='uniform', algorithm='auto',
                                      leaf_size=30, p=2, metric='minkowski')
            train_x, test_x, train_y, test_y = \
                train_test_split(x, y, test_size=0.5, random_state=self._rng)
            knn.fit(train_x, train_y)
            sc = knn.score(test_x, test_y)
            # Tiny progress output: '#' every 10th trial, '<' on an improvement.
            m = "."
            if idx % 10 == 0:
                m = "#"
            if sc > best_score:
                m = "<"
                best_score = sc
                n_neighbors = d['n_neighbors']
            sys.stdout.write(m)
            sys.stdout.flush()
    sys.stdout.write("Using n_neighbors: %d\n" % n_neighbors)
    return n_neighbors
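# For comparison, a hedged sketch of the same search using scikit-learn's
# built-in RandomizedSearchCV instead of the hand-rolled loop above. The
# candidate grid and the rough 50/50 split are assumptions carried over from
# the code above, not part of the original source.
from sklearn.model_selection import RandomizedSearchCV
from sklearn.neighbors import KNeighborsRegressor

def random_search_cv(x, y, random_iter, rng):
    search = RandomizedSearchCV(
        KNeighborsRegressor(),
        param_distributions={"n_neighbors": list(range(1, 11))},
        n_iter=random_iter,
        cv=2,                # roughly mirrors the 50/50 holdout above
        random_state=rng,
    )
    search.fit(x, y)
    return search.best_params_["n_neighbors"]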
X = loadtxt(train_file, usecols=(1, 2, 3, 4, 5, 6, 7, 8, 9, 10),
            unpack=True, delimiter=',').T
Y = loadtxt(train_file, unpack=True, usecols=(11,), delimiter=',')
X_train, X_test, y_train, y_test = train_test_split(X, Y, test_size=0.2, random_state=42)
X_train, X_val, y_train, y_val = train_test_split(X_train, y_train, test_size=0.2, random_state=42)
print('Data loaded!')

plot_simple_table(X_train.T[:, :30])
best_params = find_best_params(X_train, y_train)
model = KNeighborsRegressor(n_neighbors=best_params['n_neighbors'])
model.fit(X_train, y_train)
score = model.score(X_val, y_val)
preds = model.predict(X_test)
pred = preds.reshape(len(preds))
real = y_test
plot_table(real, pred)
plot_scatter(X_train, y_train, X_val, y_val, X_test, y_test, preds)

# Compute the mean squared error of our predictions.
mse = ((pred - real) ** 2).sum() / len(pred)
print('Validation Score:', score)  # R^2 on the held-out validation split
print('Mean Squared Error:', mse)
plot_hm(real, pred)
# get importance
importance_k = results.importances_mean
# summarize feature importance
for i, v in enumerate(importance_k):
    print('Feature: %0d, -> %s Score: %.5f' % (i, carrear_feature_names[i], v))
# plot feature importance
plt.bar([x for x in range(len(importance_k))], importance_k)
plt.xticks(np.arange(len(carrear_feature_names)), carrear_feature_names, rotation='vertical')
plt.show()

# In[33]:

# Note: KNeighborsRegressor's score() is R^2, not classification accuracy.
print('R^2 of knn regressor on training set: {:.2f}'.format(knn.score(X_train, y_train)))
print('R^2 of knn regressor on test set: {:.2f}'.format(knn.score(X_test, y_test)))

# CART on the carrear features

# In[34]:

from sklearn.tree import DecisionTreeRegressor

# define the model
modelcart = DecisionTreeRegressor()
# fit the model
modelcart.fit(X_train, y_train)
# get importance
# df_avg == 3-year rolling average + yr4 stats
# The earlier assignments are alternative feature sets; only the last one is used.
X, y = df_avg[['all_avg']].values, df_avg['all_yr'].values
X, y = df_avg[['all_prev']].values, df_avg['all_yr'].values
X, y = df_avg[['all_avg', 'all_prev']].values, df_avg['all_yr'].values
X, y = df_avg[['1D_avg', '2D_avg', '3D_avg', 'all_avg',
               '1D_prev', '2D_prev', '3D_prev', 'all_prev']].values, df_avg['all_yr'].values

X_train, X_test, y_train, y_test = tts(X, y)

lin = LR(fit_intercept=False)
lin.fit(X, y)
lin.score(X, y)

knn = KNR(n_neighbors=5)
knn.fit(X_train, y_train)
print(knn.score(X_train, y_train))
print(knn.score(X_test, y_test))

ns = range(1, 30, 2)
scores = []
for n in ns:
    knn = KNR(n_neighbors=n)
    knn.fit(X_train, y_train)
    scores.append(knn.score(X_train, y_train))

rf = RFR(n_estimators=50)
rf.fit(X_train, y_train)
rf.score(X_train, y_train)
def Regression(self, columns, predictName, portion=0.5, **argv):
    if type(columns) != list:
        raise Exception("First parameter must be a list.")
    if type(predictName) != str:
        raise Exception("Second parameter must be a string.")
    unknown_columns = set(columns) - set(self.keys())
    if unknown_columns:
        raise Exception("Invalid columns: " + str(unknown_columns))

    # Resolve the regression method (defaults to LinearRegression).
    if argv.get('method') is None:
        method = 'LinearRegression'
    elif argv.get('method') in ("LinearRegression", "RandomForest", "SVM", "KNeighbors"):
        method = argv.get('method')
    else:
        raise Exception("Unknown regression method: " + argv.get('method'))

    # Resolve the train/test portion.
    if argv.get('portion') is None:
        portion = 0.5
    else:
        portion = argv.get('portion')

    ## Select data; drop rows containing NaN (an Imputer could be used instead).
    x_rows = self.get(columns)
    y_rows = self.get(predictName)
    for idx in x_rows.index:
        if numpy.any(numpy.isnan(x_rows.loc[idx])):
            print("Dropped: ", idx, x_rows.loc[idx])
            x_rows = x_rows.drop(idx)
            y_rows = y_rows.drop(idx)
    # Dropping by the same index keeps x_rows and y_rows aligned.

    # Clamp the portion to a sane range.
    if portion > 1:
        portion = 1
    elif portion < 0:
        portion = 0.5

    # Number of neighbors for the KNeighbors method (defaults to 3).
    if argv.get('n_neighbors') is None:
        n_neighbors = 3
    else:
        n_neighbors = argv.get('n_neighbors')

    x_train, x_test, y_train, y_test = train_test_split(x_rows, y_rows, test_size=1 - portion)

    # Fit the chosen model and store per-row predictions.
    if method == "LinearRegression":
        lin = lm.LinearRegression()
        lin.fit(x_rows, y_rows)
        self['linearPredict'] = ""
        for index in x_test.index:
            self['linearPredict'].loc[index] = lin.predict(x_test.loc[index])
        print("Method: ", method, "\tCoefficients: ", lin.coef_,
              "\tVariance score: %.2f" % lin.score(x_test, y_test))
    elif method == "RandomForest":
        ran = RandomForestRegressor()
        ran.fit(x_train, y_train)
        self['RandomForestClassifier'] = ""
        for index in x_test.index:
            self['RandomForestClassifier'].loc[index] = ran.predict(x_test.loc[index])
        print("Method: ", method, "\t Score: ", ran.score(x_test, y_test))
    elif method == "SVM":
        sssvm = svm.SVR()
        sssvm.fit(x_train, y_train)
        self['svmPredict'] = ""
        for index in x_test.index:
            self['svmPredict'].loc[index] = sssvm.predict(x_test.loc[index])
        print("Method: ", method, "\t Score: ", sssvm.score(x_test, y_test))
    elif method == "KNeighbors":
        knn = KNeighborsRegressor(n_neighbors)
        knn.fit(x_train, y_train)
        self['KNeighborsPredict'] = ""
        for index in x_test.index:
            self['KNeighborsPredict'].loc[index] = knn.predict(x_test.loc[index])
        print("Method: ", method, "\t Score: ", knn.score(x_test, y_test))
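# A hedged usage sketch for the Regression method above. The table object and
# column names are hypothetical stand-ins, not from the source.
# table = MyDataTable(...)  # whatever class this method belongs to
# table.Regression(['sqft', 'beds'], 'price',
#                  method='KNeighbors', portion=0.7, n_neighbors=5)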
# Using the mglearn library and k-nearest-neighbor regression
# Generate dataset
import mglearn
import matplotlib.pyplot as plt
import numpy as np
from sklearn.neighbors import KNeighborsRegressor
from sklearn.model_selection import train_test_split

X, y = mglearn.datasets.make_wave(n_samples=40)
# split the wave dataset into a training and a test set
X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=0)

# instantiate the model and set the number of neighbors to consider to 3
reg = KNeighborsRegressor(n_neighbors=3)
# fit the model using the training data and training targets
reg.fit(X_train, y_train)
print("Test set predictions: \n{}".format(reg.predict(X_test)))
print("Test set R^2: {:.2f}".format(reg.score(X_test, y_test)))

fig, axes = plt.subplots(1, 3, figsize=(15, 4))
# create 1,000 data points, evenly spaced between -3 and 3
clfpoly3.fit(X_train, y_train)

## KNN (k nearest neighbour) uses similar features to predict values of a datapoint
# KNN Regression
clfknn = KNeighborsRegressor(n_neighbors=2)
clfknn.fit(X_train, y_train)

# EVALUATION (done using the score method of each trained model)
# For regressors, score returns the R^2 of self.predict(X_test) against y_test.
confidencereg = clfreg.score(X_test, y_test)      # linear regression
confidencereg2 = clfpoly2.score(X_test, y_test)   # quadratic regression
confidencereg3 = clfpoly3.score(X_test, y_test)   # cubic regression
confidenceregknn = clfknn.score(X_test, y_test)   # knn regression
print(confidencereg, confidencereg2, confidencereg3, confidenceregknn)
# Scores are high (>0.95) for most of the models. However, this does not mean we
# can blindly place our trades. There are still many issues to consider,
# especially with different companies that have different price trajectories over time.

forecast_set = clfreg.predict(X_lately)  # was `clf`, which is undefined here
dfreg['Forecast'] = np.nan  # result should be an array

## Prediction plot
last_date = dfreg.iloc[-1].name
last_unix = last_date
next_unix = last_unix + datetime.timedelta(days=1)
##y_train = y_train[:1800]
##print(x_train.column[1])
##print(y_train.column[1])

####y_train_300.astype(dtype=np.float64)
####y_train_300 = np.dtype('f8')
####reg = DecisionTreeRegressor(max_depth=5)
####reg.fit(x_train, y_train)
####predicts = reg.predict(x_test)
####print("total_error:", metrics.mean_squared_error(y_test, predicts))
####print("accuracy:", reg.score(x_train, y_train))

reg2 = KNR()
reg2.fit(x_all, y_all)
####predicts2 = reg2.predict(x_test)
####print("total_error:", metrics.mean_squared_error(y_test, predicts2))
print("accuracy:", reg2.score(x_all, y_all))

####reg3 = svm.SVR()
####reg3.fit(x_train, y_train)
####SVR(C=1.0, cache_size=200, coef0=0.0, degree=3, epsilon=0.1, gamma='auto',
####    kernel='rbf', max_iter=-1, shrinking=True, tol=0.001, verbose=False)
####predicts3 = reg3.predict(x_test)
####print("total_error:", metrics.mean_squared_error(y_test, predicts3))
####print("accuracy:", reg3.score(x_train, y_train))

##predictions = reg.predict(test_x_all)
##print(predicts)
###print("total error:", total_error)

for d in [test_data]:
    d['Image'] = test_data['Image'].apply(lambda im: np.fromstring(im, sep=' '))

# stack all test images into one numpy array
MapMean_list = np.array(new_model.get_MapMean_list()[:ratio])
RedMean_list = np.array(new_model.get_RedMean_list()[:ratio])
target_list = np.array(new_model.get_target_list()[:ratio])

interData_list = []
for i in range(len(MapMean_list)):
    interData_list.append(np.hstack((MapMean_list[i], RedMean_list[i])))

from sklearn.neighbors import KNeighborsRegressor
knn = KNeighborsRegressor()
knn.fit(MapFeature_list, MapMean_list)
with open('Map.model', 'wb') as MP:
    cPickle.dump(knn, MP)
print(knn.score(MapFeature_list, MapMean_list))

from sklearn.tree import DecisionTreeRegressor
regressor = DecisionTreeRegressor()
regressor.fit(RedFeature_list, RedMean_list)
with open('Reduce.model', 'wb') as RP:
    cPickle.dump(regressor, RP)
print(regressor.score(RedFeature_list, RedMean_list))

knn2 = KNeighborsRegressor(p=1)  # p=1: Manhattan distance
knn2.fit(interData_list, target_list)
with open('Job.model', 'wb') as JP:
    cPickle.dump(knn2, JP)
print(knn2.score(interData_list, target_list))
res = mod_fit.resid
fig = sm.qqplot(res)
#plt.show()

#################
average_score = []
# knn result (sklearn.cross_validation is the legacy module; model_selection in newer versions)
for k in [5, 10, 50, 100, 150, 200, 1000]:
    X_train, X_test, y_train, y_test = cross_validation.train_test_split(
        data, y, test_size=0.05, random_state=0)
    neigh = KNeighborsRegressor(n_neighbors=k)
    neigh.fit(X_train, y_train)
    average_score.append(neigh.score(X_test, y_test))
# Comparatively, k = 10 is best
X_train, X_test, y_train, y_test = cross_validation.train_test_split(
    data, y, test_size=0.05, random_state=0)
neigh = KNeighborsRegressor(n_neighbors=10)
neigh.fit(X_train, y_train)
predict_result1 = neigh.predict(data)

############################# classification
subre_list = ['videos', 'todayilearned', 'nba', 'funny', 'DestinyTheGame', 'AdviceAnimals',
              'hockey', 'WTF', 'worldnews', 'pcmasterrace', 'soccer', 'anime', 'gaming',
              'serialpodcast', 'GlobalOffensive', 'leagueoflegends', 'news', 'nfl', 'CFB',
              'pics', 'movies', 'AskReddit', 'DotA2']
def knn(df1, features, pred_var, df2):
    cl = KNeighborsRegressor(n_neighbors=3)
    cl.fit(df1[features], df1[pred_var])
    print('KNN Score: ', cl.score(df2[features], df2[pred_var]))
    'runs', 'wickets', 'overs', 'runs_last_5', 'wickets_last_5',
    'striker', 'non-striker', 'total'
]
for i in l:
    outliers_removed = remove_outliers(dataset, i)
print(outliers_removed)
#dropeddata.to_csv('dropeddata.csv', index=False)
outliers_removed.to_csv('outliers_removed.csv', index=False)

# R-squared = 88.95 and custom accuracy is 86.42
from sklearn.neighbors import KNeighborsRegressor
neigh = KNeighborsRegressor(n_neighbors=1)
neigh.fit(X_train, y_train)
y_pred = neigh.predict(X_test)
score9 = neigh.score(X_train, y_train) * 100
print("R Square value:", score9)
print("Custom accuracy for KNeighborsRegressor:", custom_accuracy(y_test, y_pred, 20))

# Testing with a custom input
import numpy as np
new_prediction = neigh.predict(sc.transform(np.array([[100, 0, 13, 50, 50]])))
print("Prediction score:", new_prediction)

models = [
    'RandomForestRegression', 'LinearRegression', 'Lasso', 'GaussianNB',
    'DecisionTreeRegressor', 'KNeighborsRegression', 'SupportVectorMachine'
]
acc_score = [0.77, 0.43, 0.27, 0.37, 0.78, 0.87, 0.49]
plt.rcParams['figure.figsize'] = (15, 7)
plt.bar(models,
    if labels_train[ii] == 1
]
bumpy_slow = [
    features_train[ii][1]
    for ii in range(0, len(features_train))
    if labels_train[ii] == 1
]

#### initial visualization
plt.xlim(0.0, 1.0)
plt.ylim(0.0, 1.0)
plt.scatter(bumpy_fast, grade_fast, color="b", label="fast")
plt.scatter(grade_slow, bumpy_slow, color="r", label="slow")
plt.legend()
plt.xlabel("bumpiness")
plt.ylabel("grade")
plt.show()

################################################################################
### your code here! name your classifier object clf if you want the
### visualization code (prettyPicture) to show you the decision boundary

from sklearn.neighbors import KNeighborsRegressor
# Named clf so that prettyPicture below can find it, per the comment above.
# Note: this is a regressor applied to 0/1 labels, so score() reports R^2,
# not classification accuracy; KNeighborsClassifier is the usual choice here.
clf = KNeighborsRegressor(n_neighbors=2)
clf.fit(features_train, labels_train)
accuracy = clf.score(features_test, labels_test)
print(accuracy)

try:
    prettyPicture(clf, features_test, labels_test)
except NameError:
    pass
# create two random wine sets
wine_set_1 = numpredict.wine_set_1()
wine_set_2 = numpredict.wine_set_2()

# break these sets into training data and testing data
train1, test1 = numpredict.divide_data(wine_set_1, test=0.07)
train2, test2 = numpredict.divide_data(wine_set_2, test=0.07)

# format the sets into numpy arrays suitable for scikit-learn
train1_X, train1_y = get_pair(train1)
test1_X, test1_y = get_pair(test1)
train2_X, train2_y = get_pair(train2)
test2_X, test2_y = get_pair(test2)  # was get_pair(train2), which reused training data

# create two regressors
knn1 = KNeighborsRegressor()
knn2 = KNeighborsRegressor()

# train them using the training sets
knn1.fit(train1_X, train1_y)
knn2.fit(train2_X, train2_y)

# check out their scores (R^2, shown as a percentage)
print("Score for predictions made on the first wine set: %0.2f%%"
      % (knn1.score(test1_X, test1_y) * 100))
print("Score for predictions made on the second wine set: %0.2f%%"
      % (knn2.score(test2_X, test2_y) * 100))
X, y = mglearn.datasets.make_wave(n_samples=100)
plt.scatter(X, y)
X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=0)

# Instantiate the model, set the number of neighbors to consider to 5:
reg = KNeighborsRegressor(n_neighbors=5)
# Fit the model using the training data and training targets:
reg.fit(X_train, y_train)
# (repr of the fitted estimator, as echoed in the notebook:)
# KNeighborsRegressor(algorithm='auto', leaf_size=30, metric='minkowski',
#                     metric_params=None, n_jobs=1, n_neighbors=5, p=2, weights='uniform')
reg.score(X_test, y_test)

fig, axes = plt.subplots(1, 3, figsize=(15, 4))
# create 1,000 data points, evenly spaced between -3 and 3
line = np.linspace(-3, 3, 1000).reshape(-1, 1)
plt.suptitle("nearest_neighbor_regression")
for n_neighbors, ax in zip([1, 3, 9], axes):
    # make predictions using 1, 3, or 9 neighbors
    reg = KNeighborsRegressor(n_neighbors=n_neighbors).fit(X, y)
    ax.plot(X, y, 'o')
    ax.plot(X, -3 * np.ones(len(X)), 'o')
    ax.plot(line, reg.predict(line))
    ax.set_title("%d neighbor(s)" % n_neighbors)
# Same loop as before; for a regressor, score() is R^2, not accuracy.
training_score = []
test_score = []
neighbors_settings = range(1, 51)
for n_neighbors in neighbors_settings:
    # build the model
    reg = KNeighborsRegressor(n_neighbors=n_neighbors)
    reg.fit(X_train, y_train)
    # record training set R^2
    training_score.append(reg.score(X_train, y_train))
    # record generalization R^2
    test_score.append(reg.score(X_test, y_test))

plt.plot(neighbors_settings, training_score, label="training R^2")
plt.plot(neighbors_settings, test_score, label="test R^2")
plt.ylabel("R^2")
plt.xlabel("n_neighbors")
plt.legend()

# Print the n_neighbors value with the highest test R^2
print(test_score.index(max(test_score)) + 1)
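# A hedged alternative for picking n_neighbors: cross-validate on the training
# set instead of reading the test curve, so the test set stays untouched. This
# is a sketch, not from the original source; it assumes X_train/y_train from above.
from sklearn.model_selection import cross_val_score
from sklearn.neighbors import KNeighborsRegressor
import numpy as np

cv_means = []
for n_neighbors in range(1, 51):
    reg = KNeighborsRegressor(n_neighbors=n_neighbors)
    # mean R^2 across 5 folds of the training data
    cv_means.append(cross_val_score(reg, X_train, y_train, cv=5).mean())
print("best n_neighbors by CV:", int(np.argmax(cv_means)) + 1)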
# print(mean_squared_error(y_test, predictColumn))

# PCA
pca = decomposition.PCA()
pca.fit(X_train)
print(pca.explained_variance_)
pca.n_components = 5
reducedTrainSet = pca.fit_transform(X_train)
pcaClf = linear_model.LinearRegression()
pcaClf.fit(reducedTrainSet, y_train)
reducedTestSet = pca.transform(X_test)
print(pcaClf.score(reducedTestSet, y_test))
predictColumn = pcaClf.predict(reducedTestSet)
print(mean_squared_error(y_test, predictColumn))

print('KNN')
neighReg = KNRegressor(n_neighbors=10)
neighReg.fit(X_train, y_train)
print(neighReg.score(X_test, y_test))
predictColumn = neighReg.predict(X_test)
print(mean_squared_error(y_test, predictColumn))

# # Linear Discriminant Analysis
# linDA = LDA()
# linDA.fit(X_train, y_train)
def get_predictability(X, y, dtype='continuous'):
    """Returns scores for various models when given a dataframe and target set

    Arguments:
        X (dataframe)
        y (series)
        dtype (str): categorical or continuous

    Note: X and y must have the same number of rows

    Returns:
        results (dataframe)
    """
    M = pd.concat([X, y], axis=1)
    fortrain = M.dropna()
    X_ft = fortrain.iloc[:, :-1]
    y_ft = fortrain.iloc[:, -1]
    X_train, X_test, y_train, y_test = train_test_split(X_ft, y_ft, test_size=0.1)

    # use the mean as the prediction
    y_train_mean = y_train.mean()
    y_pred_mean = np.zeros(len(y_test))
    y_pred_mean.fill(y_train_mean)

    # use the median as the prediction
    y_train_median = y_train.median()
    y_pred_median = np.zeros(len(y_test))
    y_pred_median.fill(y_train_median)

    # use the mode as the prediction
    # zero index is required to return the first most common value
    y_train_mode = y_train.mode()[0]
    y_pred_mode = np.zeros(len(y_test))
    y_pred_mode.fill(y_train_mode)

    lm = LinearRegression()
    print("Fitting linear regression model")
    lm.fit(X_train, y_train)

    rf = RandomForestRegressor()
    print("Fitting random forest model")
    rf.fit(X_train, y_train)

    kN = KNeighborsRegressor()
    print("Fitting kNN model")
    kN.fit(X_train, y_train)

    # get the r2 score for each model
    mean_score = r2_score(y_test, y_pred_mean)
    median_score = r2_score(y_test, y_pred_median)
    mode_score = r2_score(y_test, y_pred_mode)
    lm_score = lm.score(X_test, y_test)
    rf_score = rf.score(X_test, y_test)
    kN_score = kN.score(X_test, y_test)

    # get the mse for each model
    mean_mse = mean_squared_error(y_test, y_pred_mean)
    median_mse = mean_squared_error(y_test, y_pred_median)
    mode_mse = mean_squared_error(y_test, y_pred_mode)
    lm_y_pred = lm.predict(X_test)
    rf_y_pred = rf.predict(X_test)
    kN_y_pred = kN.predict(X_test)
    lm_mse = mean_squared_error(y_test, lm_y_pred)
    rf_mse = mean_squared_error(y_test, rf_y_pred)
    kN_mse = mean_squared_error(y_test, kN_y_pred)

    # construct the dataframe to return to the user
    names = ['mean', 'median', 'mode', 'LinearRegression',
             'RandomForestRegressor', 'KNeighborsRegressor']
    scores = [mean_score, median_score, mode_score, lm_score, rf_score, kN_score]
    losses = [mean_mse, median_mse, mode_mse, lm_mse, rf_mse, kN_mse]
    results = pd.DataFrame(data=list(zip(names, scores, losses)),
                           columns=['names', 'r2 score', 'loss'])
    # round R^2 to 2 decimals; rounding to 0 decimals would erase the scores
    results['r2 score'] = results['r2 score'].apply(lambda x: round(x, 2))
    results['loss'] = results['loss'].apply(lambda x: round(x, 0))
    return results
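# A hedged usage sketch for get_predictability; the DataFrame below is a toy
# stand-in, not from the source.
import numpy as np
import pandas as pd

rng = np.random.default_rng(0)
X_demo = pd.DataFrame({'a': rng.normal(size=200), 'b': rng.normal(size=200)})
y_demo = pd.Series(3 * X_demo['a'] - X_demo['b'] + rng.normal(scale=0.1, size=200),
                   name='target')
print(get_predictability(X_demo, y_demo))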
import numpy as np
import matplotlib.pyplot as plt
from sklearn.neighbors import KNeighborsRegressor

n_dots = 80
X = 5 * np.random.rand(n_dots, 1)
y = np.cos(X).ravel()
y += 0.2 * np.random.rand(n_dots) - 0.1

k = 5
knn = KNeighborsRegressor(n_neighbors=k)  # was KNeighborsRegressor(), which ignored k
knn.fit(X, y)

# Generate a sufficiently dense set of points and predict on them
T = np.linspace(0, 5, 500)[:, np.newaxis]  # np.newaxis adds an axis to the numpy.ndarray
y_pred = knn.predict(T)
knn.score(X, y)

# Plot the fitted curve
plt.figure(figsize=(8, 6))
plt.scatter(X, y, c='g', label='data', s=100)         # plot the training samples
plt.plot(T, y_pred, c='k', label='prediction', lw=4)  # plot the fitted curve
plt.axis('tight')
plt.title("KNeighborsRegressor (k = %i)" % k)
plt.show()
from sklearn.neighbors import KNeighborsRegressor
import mglearn
from sklearn.model_selection import train_test_split

X, y = mglearn.datasets.make_wave(n_samples=40)
# split data
X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=0)

# instantiate the model, set the number of neighbors to consider to 3
reg = KNeighborsRegressor(n_neighbors=3)
# Fit the model using the training data and training targets
reg.fit(X_train, y_train)
# (repr of the fitted estimator, as echoed in the notebook:)
# KNeighborsRegressor(algorithm='auto', leaf_size=30, metric='minkowski',
#                     metric_params=None, n_jobs=1, n_neighbors=3, p=2, weights='uniform')

print(reg.predict(X_test))
print(reg.score(X_test, y_test))
#res = DataFrame(res.toarray(), columns=features, index=observations)
#print(len(res))
#print(len(values))

clf = KNeighborsRegressor(n_neighbors=5, weights='distance')
clf.fit(df, values)
Y_train = clf.predict(df)
Y = clf.predict(test_df)
Y = [element for sublist in Y for element in sublist]  # flatten nested predictions
print(clf.score(df, values))

#output_file.write('Id,Hazard\n')
output_file.write('Actual,Predicted\n')
diff_count = 0
for id, predicted in zip(list(test_df_id.values.flatten()), Y):
    output_file.write(str(id) + ',' + str(predicted) + '\n')
#for id, predicted in zip(list(test_df_id.values.flatten()), Y_train):
#    output_file.write(str(id) + ',' + str(predicted) + '\n')
plot_two_class_knn(X_train, y_train, 1, 'uniform', X_test, y_test)
plot_two_class_knn(X_train, y_train, 3, 'uniform', X_test, y_test)
plot_two_class_knn(X_train, y_train, 11, 'uniform', X_test, y_test)

# ### Regression

# In[ ]:

from sklearn.neighbors import KNeighborsRegressor

X_train, X_test, y_train, y_test = train_test_split(X_R1, y_R1, random_state=0)
knnreg = KNeighborsRegressor(n_neighbors=5).fit(X_train, y_train)
print(knnreg.predict(X_test))
print('R-squared test score: {:.3f}'.format(knnreg.score(X_test, y_test)))

# In[ ]:

fig, subaxes = plt.subplots(1, 2, figsize=(8, 4))
X_predict_input = np.linspace(-3, 3, 50).reshape(-1, 1)
X_train, X_test, y_train, y_test = train_test_split(X_R1[0::5], y_R1[0::5], random_state=0)
for thisaxis, K in zip(subaxes, [1, 3]):
    knnreg = KNeighborsRegressor(n_neighbors=K).fit(X_train, y_train)
    y_predict_output = knnreg.predict(X_predict_input)
    thisaxis.set_xlim([-2.5, 0.75])
    thisaxis.plot(X_predict_input, y_predict_output,
# Quadratic Regression (degree 2)
clfpoly2 = make_pipeline(PolynomialFeatures(2), Ridge())
clfpoly2.fit(X_train, y_train)

# Cubic Regression (degree 3)
clfpoly3 = make_pipeline(PolynomialFeatures(3), Ridge())
clfpoly3.fit(X_train, y_train)

# KNN Regression
clfknn = KNeighborsRegressor(n_neighbors=2)
clfknn.fit(X_train, y_train)

confidencereg = clfreg.score(X_test, y_test)
confidencepoly2 = clfpoly2.score(X_test, y_test)
confidencepoly3 = clfpoly3.score(X_test, y_test)
confidenceknn = clfknn.score(X_test, y_test)

# Testing the different models with the 80-20 split to check each model's confidence
print('The linear regression confidence is ', confidencereg)
print('The quadratic regression confidence is ', confidencepoly2)
print('The cubic regression confidence is ', confidencepoly3)
print('The knn regression confidence is ', confidenceknn)

# this could be any prediction model.
forecast_set = clfreg.predict(X_lately)
dfreg['Forecast'] = np.nan

last_date = dfreg.iloc[-1].name
print(last_date)
last_unix = last_date
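# A hedged sketch of how the forecast walk typically continues in this kind of
# stock tutorial: step one day at a time and append each forecast as a new row.
# It assumes dfreg is indexed by date and last_unix is a datetime, as set up
# above; it is not verbatim from the source.
import datetime
import numpy as np

next_unix = last_unix + datetime.timedelta(days=1)
for i in forecast_set:
    next_date = next_unix
    next_unix += datetime.timedelta(days=1)
    # NaN for every existing column, the forecast value in the 'Forecast' column
    dfreg.loc[next_date] = [np.nan for _ in range(len(dfreg.columns) - 1)] + [i]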
l_svr = SVR(kernel='linear')
l_svr.fit(X_train, Y_train)
print(l_svr.score(X_test, Y_test))

n_svr = SVR(kernel="poly")
n_svr.fit(X_train, Y_train)
print(n_svr.score(X_test, Y_test))

r_svr = SVR(kernel="rbf")
r_svr.fit(X_train, Y_train)
print(r_svr.score(X_test, Y_test))

from sklearn.neighbors import KNeighborsRegressor
knn = KNeighborsRegressor(weights="uniform")
knn.fit(X_train, Y_train)
print(knn.score(X_test, Y_test))

from sklearn.ensemble import RandomForestRegressor
rfr = RandomForestRegressor()
rfr.fit(X_train, Y_train)
print(rfr.score(X_test, Y_test))

lr = LinearRegression()
lr.fit(X_train, Y_train)
print(lr.score(X_test, Y_test))

'''
# load the models
models = {}
models['LR'] = LogisticRegression()
models['LDA'] = LinearDiscriminantAnalysis()
models['KNN'] = KNeighborsClassifier()
#############################################################
# Part 4

# In[] implementing knn regression
from sklearn.neighbors import KNeighborsRegressor

x_train80, x_test80, y_train80, y_test80 = train_test_split(x_home, y_home, random_state=0)
knnreg = KNeighborsRegressor(n_neighbors=5).fit(x_train80, y_train80)
print(knnreg.predict(x_test80))
print('R-squared test score: {:.3f}'.format(knnreg.score(x_test80, y_test80)))

# In[]: selecting the best value of k
# plot k-NN regression on the sample dataset for different values of K
fig, subaxes = plt.subplots(5, 1, figsize=(5, 20))
#X_predict_input = np.linspace(-3, 3, 500).reshape(-1, 1)
X_train98, X_test98, y_train98, y_test98 = train_test_split(x_home, y_home,
                                                            test_size=0.20, random_state=0)
X_train99, X_test99, y_train99, y_test99 = train_test_split(x_home, y_home,
                                                            test_size=0.25, random_state=0)
X_train100, X_test100, y_train100, y_test100 = train_test_split(x_home,
#!/usr/bin/env python
# -*- coding: utf-8 -*-
import numpy as np
import pandas as pd
import mglearn
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split

X, y = mglearn.datasets.make_wave(n_samples=40)
X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=0)

# Analyzing KNeighborsRegressor
from sklearn.neighbors import KNeighborsRegressor
fig, axes = plt.subplots(1, 3, figsize=(15, 4))
# Create 1,000 data points, evenly spaced between -3 and 3 (np.linspace)
line = np.linspace(-3, 3, 1000).reshape(-1, 1)
for n_neighbors, ax in zip([1, 3, 9], axes):
    # Predict using 1, 3, and 9 neighbors
    reg = KNeighborsRegressor(n_neighbors=n_neighbors)
    reg.fit(X_train, y_train)  # fit
    ax.plot(line, reg.predict(line))  # predict
    ax.plot(X_train, y_train, '^', c=mglearn.cm2(0), markersize=8)
    ax.plot(X_test, y_test, 'v', c=mglearn.cm2(1), markersize=8)
    ax.set_title("{} neighbor(s)\n train score:{:.2f} test score:{:.2f}".format(
        n_neighbors, reg.score(X_train, y_train), reg.score(X_test, y_test)))
    ax.set_xlabel("Feature")
    ax.set_ylabel("Target")
axes[0].legend(["Model predictions", "Training data/target", "Test data/target"],
               loc="best")
plt.show()
import warnings
import numpy as np
from sklearn.model_selection import train_test_split, GridSearchCV, RandomizedSearchCV
from sklearn.metrics import r2_score


class AutoEnRegressor:
    def __init__(self, LA=True, SVR=False, RF=True, AB=False, KNN=False,
                 random_state=0, GridSearch=False, scoring='r2'):
        self.__LA = LA
        self.__SVR = SVR
        self.__RF = RF
        self.__AB = AB
        self.__KNN = KNN
        self.__random_state = random_state
        self.__GridSearch = GridSearch
        if not GridSearch:
            warnings.warn('model will use RandomizedSearch')
        self.__scoring = scoring

    def fit(self, X_train, y_train, validation_split=0.2, validation_data=False):
        self.__storing_model_names = []
        self.__X_train = X_train
        self.__y_train = y_train
        if validation_data:
            self.__X_test = validation_data[0]
            self.__y_test = validation_data[1]
        else:
            self.__X_train, self.__X_test, self.__y_train, self.__y_test = train_test_split(
                X_train, y_train, test_size=validation_split,
                random_state=self.__random_state)
        if self.__LA:
            AutoEnRegressor.LA_model_fit(self, param_grid=None)
            self.__storing_model_names.append('LA_score')
        if self.__SVR:
            AutoEnRegressor.SVR_model_fit(self, param_grid=None)
            self.__storing_model_names.append('SVR_score')
        if self.__RF:
            AutoEnRegressor.RF_model_fit(self, param_grid=None)
            self.__storing_model_names.append('RF_score')
        if self.__AB:
            AutoEnRegressor.AB_model_fit(self, param_grid=None)
            self.__storing_model_names.append('AB_score')
        if self.__KNN:
            AutoEnRegressor.KNN_model_fit(self, list_neighbors=None)
            self.__storing_model_names.append('KNN_score')
        AutoEnRegressor.find_best(self)

    def LA_model_fit(self, param_grid=None):
        from sklearn.linear_model import Lasso
        LA_model = Lasso()
        if param_grid is None:
            param_grid = {'alpha': [0.01, 0.5, 1, 2, 5]}
        if self.__GridSearch:
            self.__LA_model = GridSearchCV(estimator=LA_model, param_grid=param_grid,
                                           cv=5, scoring=self.__scoring, n_jobs=-1)
        else:
            self.__LA_model = RandomizedSearchCV(estimator=LA_model,
                                                 param_distributions=param_grid,
                                                 cv=5, scoring=self.__scoring, n_jobs=-1)
        self.__LA_model.fit(self.__X_train, self.__y_train)
        print(f'LA_score : {r2_score(self.__y_test, self.__LA_model.predict(self.__X_test))}')

    def SVR_model_fit(self, param_grid=None):
        from sklearn.svm import SVR
        SVR_model = SVR()
        if param_grid is None:
            param_grid = [{'kernel': ['rbf', 'poly'],
                           'gamma': [1e-3, 1e-4],
                           'C': [1, 10, 100, 1000]}]
        if self.__GridSearch:
            self.__SVR_model = GridSearchCV(estimator=SVR_model, param_grid=param_grid,
                                            cv=5, scoring=self.__scoring, n_jobs=-1)
        else:
            self.__SVR_model = RandomizedSearchCV(estimator=SVR_model,
                                                  param_distributions=param_grid,
                                                  cv=5, scoring=self.__scoring, n_jobs=-1)
        self.__SVR_model.fit(self.__X_train, self.__y_train)
        print(f'SVR_score : {r2_score(self.__y_test, self.__SVR_model.predict(self.__X_test))}')

    def RF_model_fit(self, param_grid=None):
        from sklearn.ensemble import RandomForestRegressor
        RF_model = RandomForestRegressor()
        if param_grid is None:
            param_grid = {'n_estimators': [10, 50, 100, 500],
                          'max_depth': [4, 8, 10, 12, 16],
                          'min_samples_leaf': [0.1, 0.2, 0.3, 0.4, 0.5]}
        if self.__GridSearch:
            self.__RF_model = GridSearchCV(estimator=RF_model, param_grid=param_grid,
                                           cv=5, scoring=self.__scoring, n_jobs=-1)
        else:
            self.__RF_model = RandomizedSearchCV(estimator=RF_model,
                                                 param_distributions=param_grid,
                                                 cv=5, scoring=self.__scoring, n_jobs=-1)
        self.__RF_model.fit(self.__X_train, self.__y_train)
        print(f'RF_score : {r2_score(self.__y_test, self.__RF_model.predict(self.__X_test))}')

    def AB_model_fit(self, param_grid=None):
        from sklearn.ensemble import AdaBoostRegressor
        AB_model = AdaBoostRegressor()
        if param_grid is None:
            param_grid = {'n_estimators': [10, 50, 100, 500],
                          'learning_rate': [0.01, 0.5, 0.1, 0.15, 0.2]}
        if self.__GridSearch:
            self.__AB_model = GridSearchCV(estimator=AB_model, param_grid=param_grid,
                                           cv=5, scoring=self.__scoring, n_jobs=-1)
        else:
            self.__AB_model = RandomizedSearchCV(estimator=AB_model,
                                                 param_distributions=param_grid,
                                                 cv=5, scoring=self.__scoring, n_jobs=-1)
        self.__AB_model.fit(self.__X_train, self.__y_train)
        print(f'AB_score : {r2_score(self.__y_test, self.__AB_model.predict(self.__X_test))}')

    def KNN_model_fit(self, list_neighbors=None):
        from sklearn.neighbors import KNeighborsRegressor
        if list_neighbors is None:
            list_neighbors = [3, 5, 7, 9, 11, 13, 15]
        # Track the best (n_neighbors, score, model) triple seen so far.
        n_neighbor_score_model = [None, 0, None]
        for neighbor in list_neighbors:
            self.__KNN_model = KNeighborsRegressor(n_neighbors=neighbor)
            self.__KNN_model = self.__KNN_model.fit(self.__X_train, self.__y_train)
            model_score = self.__KNN_model.score(self.__X_test, self.__y_test)
            if model_score > n_neighbor_score_model[1]:
                n_neighbor_score_model[0] = neighbor
                n_neighbor_score_model[1] = model_score
                n_neighbor_score_model[2] = self.__KNN_model
        self.__KNN_model = n_neighbor_score_model[2]
        y_predict = self.__KNN_model.predict(self.__X_test)
        print(f'KNN_score with {n_neighbor_score_model[0]} neighbors: '
              f'{r2_score(self.__y_test, y_predict)}')

    def find_best(self):
        global combinations
        combinations = []
        Total_models = self.__LA + self.__SVR + self.__RF + self.__KNN + self.__AB
        # find_all_combinations is defined elsewhere in this project.
        combinations = np.array(find_all_combinations(Total_models))
        all_proba = []
        count = 1
        self.__best_score = [0] + [None] * Total_models
        if self.__LA:
            all_proba.append(self.__LA_model.predict(self.__X_test))
            if self.__best_score[count] is None:
                count += 1
        if self.__SVR:
            all_proba.append(self.__SVR_model.predict(self.__X_test))
            if self.__best_score[count] is None:
                count += 1
        if self.__RF:
            all_proba.append(self.__RF_model.predict(self.__X_test))
            if self.__best_score[count] is None:
                count += 1
        if self.__AB:
            all_proba.append(self.__AB_model.predict(self.__X_test))
            if self.__best_score[count] is None:
                count += 1
        if self.__KNN:
            all_proba.append(self.__KNN_model.predict(self.__X_test))
            if self.__best_score[count] is None:
                count += 1
        all_proba = np.array(all_proba)
        # Weight each model's predictions by every candidate combination and sum.
        all_proba = np.sum(np.multiply(combinations.T, np.array([all_proba]).T).T, axis=1)
        for y_predict, comb in zip(all_proba, combinations):
            latest_score = r2_score(self.__y_test, y_predict)
            if latest_score > self.__best_score[0]:
                self.__best_score[0] = latest_score
                for i in range(0, len(comb)):
                    self.__best_score[i + 1] = comb[i]
        print(f'AutoEn_score : {self.__best_score[0]}')
        for i in range(len(self.__storing_model_names)):
            print(f'weight for {self.__storing_model_names[i]} : {self.__best_score[i + 1]}')

    def predict(self, X_test):
        all_proba = []
        count = 1
        try:
            if self.__LA:
                LA_model_y_predict = self.__LA_model.predict(X_test)
                all_proba.append(np.multiply(LA_model_y_predict, self.__best_score[count]))
                count += 1
            if self.__SVR:
                SVR_model_y_predict = self.__SVR_model.predict(X_test)
                all_proba.append(np.multiply(SVR_model_y_predict, self.__best_score[count]))
                count += 1
            if self.__RF:
                RF_model_y_predict = self.__RF_model.predict(X_test)
                all_proba.append(np.multiply(RF_model_y_predict, self.__best_score[count]))
                count += 1
            if self.__AB:
                AB_model_y_predict = self.__AB_model.predict(X_test)
                all_proba.append(np.multiply(AB_model_y_predict, self.__best_score[count]))
                count += 1
            if self.__KNN:
                KNN_model_y_predict = self.__KNN_model.predict(X_test)
                all_proba.append(np.multiply(KNN_model_y_predict, self.__best_score[count]))
                count += 1
            y_predict = np.sum(all_proba, axis=0)
        except AttributeError:
            print('model not fitted yet')
            return None
        except Exception:
            print('something went wrong')
            return None
        return y_predict
mlr_train_pred = mlr.predict(x_train)
mlr_test_pred = mlr.predict(x_test)

# Evaluate the MLR model
mlr_rsq = mlr.score(x_train, y_train)
mlr_test_rsq = mlr.score(x_test, y_test)
mlr_rmse = np.sqrt(mean_squared_error(y_train, mlr_train_pred))
mlr_test_rmse = np.sqrt(mean_squared_error(y_test, mlr_test_pred))

# Create the nonlinear model using KNN
knn = KNeighborsRegressor().fit(x_train, y_train)
knn_train_pred = knn.predict(x_train)
knn_test_pred = knn.predict(x_test)

# Evaluate the KNN model
knn_rsq = knn.score(x_train, y_train)
knn_test_rsq = knn.score(x_test, y_test)  # was scored on the training set by mistake
knn_rmse = np.sqrt(mean_squared_error(y_train, knn_train_pred))
knn_test_rmse = np.sqrt(mean_squared_error(y_test, knn_test_pred))

# Model evaluation using the R2 and the RMSE metrics
print('\nMLR R-Squared : {:.3f} for the training set, and {:.3f} for the testing set'
      .format(mlr_rsq, mlr_test_rsq))
print('MLR RMSE : {:.3f} for the training set, and {:.3f} for the testing set\n'
      .format(mlr_rmse, mlr_test_rmse))
print('KNN R-Squared : {:.3f} for the training set, and {:.3f} for the testing set'
      .format(knn_rsq, knn_test_rsq))
print('KNN RMSE : {:.3f} for the training set, and {:.3f} for the testing set'.
data = np.array(df)
inputs = data[:, 0:2]
outputs = data[:, 8]

# Split into train and test sets
# (sklearn.cross_validation is the legacy module; model_selection in newer versions)
from sklearn.cross_validation import train_test_split
X, X_test, y, y_test = train_test_split(inputs, outputs)

# Weighted KNN
from sklearn.neighbors import KNeighborsRegressor
reg = KNeighborsRegressor()
reg.fit(X, y)
y_pred = reg.predict(X_test)
print('R^2 Score : ', reg.score(X_test, y_test))
accuracy = (np.sum(1 - abs((y_test - y_pred) / y_test)) / y_test.size) * 100
print('Accuracy : ', accuracy)

# Plot of Cu variation along LAT and LONG
fig0 = plt.figure(figsize=(20, 10))
fig0.canvas.set_window_title('Variation Of Cu with Latitude and Longitude')
plt.subplots_adjust(hspace=0.5)
plt.subplot(2, 1, 1)
plt.scatter(X[:, 0], y)
plt.title('Variation of Cu with Latitude')
plt.xlabel('Latitude')
plt.ylabel('Cu')
plt.subplot(2, 1, 2)
# plt.plot(kneighbors, testscore, label='test')
# plt.legend()
# plt.show()
# The graph shows that k = 3 is a reasonable choice.

# forge dataset - KNN classification: assigns discrete labels such as 0 and 1
# X, y = mglearn.datasets.make_forge()
# mglearn.discrete_scatter(X[:, 0], X[:, 1], y)
# plt.show()

# wave dataset - KNN regression: predicts real values, i.e. the mean of the
# neighbors' targets becomes the prediction
X, y = mglearn.datasets.make_wave(n_samples=40)
plt.plot(X, y, 'o')
plt.show()
mglearn.plots.plot_knn_classification(n_neighbors=3)
plt.show()
mglearn.plots.plot_knn_regression(n_neighbors=3)
plt.show()

# A simple regression example
X = [[1], [2], [3], [4], [5]]
y = [0, 0, 1, 1, 1.5]
rgr = KNeighborsRegressor(n_neighbors=3)
rgr.fit(X, y)
print('Training R^2:', rgr.score(X, y))  # validate the fitted regression model
print(rgr.predict([[1.6], [1.7], [2.3], [3.5]]))
                       names=l3, na_values="?")
x3 = dataset3.iloc[:, 0:25].values
y3 = dataset3["Price"].values

# SimpleImputer (use before label encoding)
Imp = SimpleImputer(strategy="most_frequent")
x3[:, 0:25] = Imp.fit_transform(x3[:, 0:25])
y3 = Imp.fit_transform(y3.reshape(-1, 1))

# Label-encode the categorical columns.
labelencoder = LabelEncoder()
for col in (2, 3, 4, 5, 6, 7, 8, 14, 15, 17):
    x3[:, col] = labelencoder.fit_transform(x3[:, col].astype(str))

X_train, X_test, Y_train, Y_test = train_test_split(x3, y3)
df = KNeighborsRegressor(n_neighbors=3)
df.fit(X_train, Y_train)
df.score(X_train, Y_train)
df.score(X_test, Y_test)

mnist = fetch_mldata('MNIST original')
print(f"y_test[0:20]:{y_test[0:20]}") print(f"y_pre[0:20]:{y_pre[0:20]}") #모델구성-2 print("-"*30) print("KNeighborsRegressor") model2 = KNeighborsRegressor(1) #트레이닝 # model.compile(loss="categorical_crossentropy",metircs=["acc"]) model2.fit(x_train,y_train) score = model2.score(x_test,y_test) #test # loss,ac=model.evaluate(x_test,y_test) y_pre = model2.predict(x_test) # print(f'r2:{r2_score(y_test,y_pre)}') print(f"score:{score}") print(f"r2:{r2_score(y_test,y_pre)}") print(f"acc:{acc(y_test,y_pre)}") print(f"y_test[0:20]:{y_test[0:20]}") print(f"y_pre[0:20]:{y_pre[0:20]}") #모델구성-3
print(test_array.shape)
test_array = test_array.reshape(2, 2)
print(test_array.shape)

train_input = train_input.reshape(-1, 1)
test_input = test_input.reshape(-1, 1)
print(train_input.shape, test_input.shape)

# Coefficient of determination, R**2
from sklearn.neighbors import KNeighborsRegressor
knr = KNeighborsRegressor()
knr.fit(train_input, train_target)
knr.score(test_input, test_target)  # 0.9928

from sklearn.metrics import mean_absolute_error
test_prediction = knr.predict(test_input)
mae = mean_absolute_error(test_target, test_prediction)
print(mae)  # 19.157

# Overfitting vs. underfitting
print(knr.score(train_input, train_target))  # 0.9698

knr.n_neighbors = 3
knr.fit(train_input, train_target)
print(knr.score(train_input, train_target))  # 0.980
print(knr.score(test_input, test_target))
dt.fit(X_train, y_train)
dt.score(X_test, y_test)

# GradientBoost
from sklearn.ensemble import GradientBoostingRegressor
gb = GradientBoostingRegressor()
gb.fit(X_train, y_train)
gb.score(X_train, y_train)
gb.score(X_test, y_test)

# KNN (brute-force neighbor search; the earlier default instance was unused)
from sklearn.neighbors import KNeighborsRegressor
knn = KNeighborsRegressor(algorithm='brute')
knn.fit(X_train, y_train)
knn.score(X_train, y_train)
knn.score(X_test, y_test)

# VotingRegressor
from sklearn.ensemble import VotingRegressor
reg1 = GradientBoostingRegressor()
reg2 = RandomForestRegressor()
reg3 = LinearRegression()
reg4 = DecisionTreeRegressor()
reg5 = KNeighborsRegressor()
reg6 = AdaBoostRegressor()
ereg = VotingRegressor(estimators=[('gb', reg1), ('rf', reg2)])
ereg = ereg.fit(X_train, y_train)
ereg.score(X_train, y_train)
ereg.score(X_test, y_test)
from sklearn.model_selection import train_test_split
import matplotlib.pyplot as plt
import numpy as np

X_train, X_test, y_train, y_test = train_test_split(X, Y, test_size=0.3, random_state=11)

# instantiate the model and set the number of neighbors to consider to 3
reg_knn = KNeighborsRegressor(n_neighbors=3)
# fit the model using the training data and training targets
reg_knn.fit(X_train, y_train)
print("Test set predictions:\n", reg_knn.predict(X_test))
print("Test set R^2: {:.2f}".format(reg_knn.score(X_test, y_test)))

from sklearn.tree import DecisionTreeRegressor
tree = DecisionTreeRegressor(max_depth=10).fit(X_train, y_train)
print("Test set predictions:\n", tree.predict(X_test))
print("Test set R^2: {:.2f}".format(tree.score(X_test, y_test)))

check = X_test.copy()
check['Y_hat'] = tree.predict(X_test)
check['Y_test'] = y_test
check['diff'] = check['Y_test'] - check['Y_hat']
# k-Nearest Neighbor Regression
import numpy as np
from sklearn import datasets
from sklearn.neighbors import KNeighborsRegressor

# load the dataset
dataset = datasets.load_diabetes()
# fit a model to the data
model = KNeighborsRegressor()
model.fit(dataset.data, dataset.target)
print(model)
# make predictions
expected = dataset.target
predicted = model.predict(dataset.data)
# summarize the fit of the model
mse = np.mean((predicted - expected) ** 2)
print(mse)
print(model.score(dataset.data, dataset.target))
# instantiate the model and set the number of neighbors to consider to 3
reg = KNeighborsRegressor(n_neighbors=3)
# fit the model using the training data and training targets
reg.fit(X_train, y_train)
print('test set predictions : \n{}'.format(reg.predict(X_test)))
"""
We can also evaluate the model using the score method, which for regressors
returns the R^2 score. The R^2 score, also known as the coefficient of
determination, is a measure of goodness of a prediction for a regression
model. A value of 1 corresponds to a perfect prediction, and a value of 0
corresponds to a constant model that just predicts the mean of the training
set responses, y_train (it can even be negative for models that do worse
than predicting that mean).
"""
print('Test set R^2 : {:.2f}'.format(reg.score(X_test, y_test)))

# Analyzing KNeighborsRegressor------------------------------------------------
fig, axes = plt.subplots(1, 3, figsize=(15, 4))
# create 1,000 data points, evenly spaced between -3 and 3
line = np.linspace(-3, 3, 1000).reshape(-1, 1)
for n_neighbors, ax in zip([1, 3, 9], axes):
    # make predictions using 1, 3, or 9 neighbors
    reg = KNeighborsRegressor(n_neighbors=n_neighbors)
    reg.fit(X_train, y_train)
    ax.plot(line, reg.predict(line))
    ax.plot(X_train, y_train, '^', c=mglearn.cm2(0), markersize=8)
    ax.plot(X_test, y_test, 'v', c=mglearn.cm2(1), markersize=8)
    ax.set_title("{} neighbor(s)\n train score: {:.2f} test score: {:.2f}".format(
        n_neighbors, reg.score(X_train, y_train), reg.score(X_test, y_test)))
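# A small hedged sketch verifying what score() computes: R^2 equals
# 1 - SS_res / SS_tot on the test targets. It assumes reg, X_test, and
# y_test from above; the variable names here are illustrative.
import numpy as np

y_hat = reg.predict(X_test)
ss_res = np.sum((y_test - y_hat) ** 2)          # residual sum of squares
ss_tot = np.sum((y_test - y_test.mean()) ** 2)  # total sum of squares
r2_manual = 1 - ss_res / ss_tot
print(r2_manual, reg.score(X_test, y_test))     # the two values should match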
mse = mean_squared_error(y_test, lasso_pred)
print("Root Mean Squared Error: ", np.sqrt(mse))

fig = plt.figure(figsize=[10, 8])
ax = plt.subplot(111)
ax.plot(y_test.index, lasso_pred, label='Predicted')
ax.plot(y_test, label='Test')
ax.legend()
plt.show()

# Evaluation
confidence_lr = lr.score(X_test, y_test)
confidence_poly2 = poly2.score(X_test, y_test)
confidence_poly3 = poly3.score(X_test, y_test)
confidence_knn = knn.score(X_test, y_test)
confidence_lasso = lasso.score(X_test, y_test)
print("Results: ", confidence_lr, confidence_poly2, confidence_poly3,
      confidence_knn, confidence_lasso)

# all on one graph
fig = plt.figure(figsize=[10, 8])
ax = plt.subplot(111)
ax.plot(y_test.index, lasso_pred, label='Lasso', color='red')
ax.plot(y_test.index, knn_pred, label='KNN', color='blue')
ax.plot(y_test.index, poly2_pred, label='Poly2', color='green')
ax.plot(y_test.index, poly3_pred, label='Poly3', color='orange')
ax.plot(y_test.index, y_pred_lr, label='LR', color='cyan')
ax.plot(y_test, label='Test', color='magenta')
# Linear Regression
lin3 = LR()
#lin3.fit(df_3avg[threeYrXcol].values, df_3avg[threeYrycol].values)
lin3.fit(X_train, y_train)
print("Train: ", lin3.score(X_train, y_train))
print("Test: ", lin3.score(X_test, y_test))
print("Intercept: ", lin3.intercept_)
for k, v in enumerate(lin3.coef_[0]):
    print(threeYrXcol[k], ": ", v)

# KNeighborsRegressor
kn3 = KNReg(weights='uniform')
#kn3.fit(df_3avg[threeYrXcol].values, df_3avg[threeYrycol].values)
kn3.fit(X_train, y_train)
print("Train: ", kn3.score(X_train, y_train))
print("Test: ", kn3.score(X_test, y_test))
# print(kn3.score(df_3avg[threeYrXcol].values, df_3avg[threeYrycol].values))

# RadiusNeighborsRegressor
rn3 = RNReg(radius=7.0)
#rn3.fit(df_3avg[threeYrXcol].values, df_3avg[threeYrycol].values)
rn3.fit(X_train, y_train)
print("Train: ", rn3.score(X_train, y_train))
print("Test: ", rn3.score(X_test, y_test))
print(rn3.score(df_3avg[threeYrXcol].values, df_3avg[threeYrycol].values))

# Test 2010/11/12 stats and 2013 projections against 2013 actuals
y = 2013
y3 = [y - 1, y - 2, y - 3]
tms_include = np.intersect1d(df[df.Year == y3[0]].Team.values,
                             df[df.Year == y3[2]].Team.values)
ss = StandardScaler()
X_train = ss.fit_transform(X_train)
X_test = ss.transform(X_test)

# Initialize a k-NN regressor configured for uniform-weighted (averaging) regression
uni_knr = KNeighborsRegressor(weights='uniform')
uni_knr.fit(X_train, Y_train)
uni_knr_Y_predict = uni_knr.predict(X_test)

# Initialize a k-NN regressor configured for distance-weighted regression
dis_knr = KNeighborsRegressor(weights='distance')
dis_knr.fit(X_train, Y_train)
dis_knr_Y_predict = dis_knr.predict(X_test)

print('The R-squared value of uniform-weighted KNeighborsRegression is:',
      uni_knr.score(X_test, Y_test))
# Use the mean_squared_error module and print the result
print("The mean squared error of uniform-weighted KNeighborsRegression is:",
      mean_squared_error(Y_test, uni_knr_Y_predict))
# Use the mean_absolute_error module and print the result
print("The mean absolute error of uniform-weighted KNeighborsRegression is:",
      mean_absolute_error(Y_test, uni_knr_Y_predict))

print('The R-squared value of distance-weighted KNeighborsRegression is:',
      dis_knr.score(X_test, Y_test))
print("The mean squared error of distance-weighted KNeighborsRegression is:",
      mean_squared_error(Y_test, dis_knr_Y_predict))
print("The mean absolute error of distance-weighted KNeighborsRegression is:",
      mean_absolute_error(Y_test, dis_knr_Y_predict))
# Import KNeighborsRegressor (the k-nearest-neighbors regressor) from sklearn.neighbors.
from sklearn.neighbors import KNeighborsRegressor

# Initialize a k-NN regressor configured for averaging regression: weights='uniform'.
uni_knr = KNeighborsRegressor(weights='uniform')
uni_knr.fit(X_train, y_train)
uni_knr_y_predict = uni_knr.predict(X_test)

# Initialize a k-NN regressor configured for distance-weighted regression: weights='distance'.
dis_knr = KNeighborsRegressor(weights='distance')
dis_knr.fit(X_train, y_train)
dis_knr_y_predict = dis_knr.predict(X_test)

from sklearn.metrics import r2_score, mean_squared_error, mean_absolute_error

# Evaluate the uniform-weighted k-NN model on the test set with R-squared, MSE, and MAE.
print('R-squared value of uniform-weighted KNeighborRegression:',
      uni_knr.score(X_test, y_test))
print('The mean squared error of uniform-weighted KNeighborRegression:',
      mean_squared_error(ss_y.inverse_transform(y_test),
                         ss_y.inverse_transform(uni_knr_y_predict)))
print('The mean absolute error of uniform-weighted KNeighborRegression:',
      mean_absolute_error(ss_y.inverse_transform(y_test),
                          ss_y.inverse_transform(uni_knr_y_predict)))

# Evaluate the distance-weighted k-NN model on the test set with R-squared, MSE, and MAE.
print('R-squared value of distance-weighted KNeighborRegression:',
      dis_knr.score(X_test, y_test))
print('The mean squared error of distance-weighted KNeighborRegression:',
      mean_squared_error(ss_y.inverse_transform(y_test),
                         ss_y.inverse_transform(dis_knr_y_predict)))
print('The mean absolute error of distance-weighted KNeighborRegression:',
      mean_absolute_error(ss_y.inverse_transform(y_test),
                          ss_y.inverse_transform(dis_knr_y_predict)))
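# A hedged toy sketch of the difference between the two weighting schemes:
# with weights='distance', a query that coincides with a training point
# reproduces that point's target exactly, while uniform weighting averages
# the k neighbors. Synthetic data; not part of the original source.
import numpy as np
from sklearn.neighbors import KNeighborsRegressor

X_toy = np.array([[0.0], [1.0], [2.0], [3.0]])
y_toy = np.array([0.0, 1.0, 4.0, 9.0])
uni = KNeighborsRegressor(n_neighbors=2, weights='uniform').fit(X_toy, y_toy)
dis = KNeighborsRegressor(n_neighbors=2, weights='distance').fit(X_toy, y_toy)
print(uni.predict([[2.0]]))  # averages the two nearest targets
print(dis.predict([[2.0]]))  # returns 4.0, the exact training target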
# Setup
import numpy as np
import pandas as pd
import statsmodels.api as sm
from sklearn.neighbors import KNeighborsRegressor

my_data = sm.datasets.get_rdataset('cars', 'datasets').data
X, y = my_data[['speed']], my_data['dist']

# Training
my_model = KNeighborsRegressor()
my_model.fit(X, y)

# Prepare the visualization
tmp = pd.DataFrame(
    {'speed': np.linspace(min(my_data.speed), max(my_data.speed), 100)})
tmp['model'] = my_model.predict(tmp)
pd.concat([my_data, tmp]).plot(x='speed', style=['o', '-'])

y_ = my_model.predict(X)
((y - y_) ** 2).mean() ** 0.5
#> 13.087184571174962          # RMSE

my_model.score(X, y)
#> 0.7368165812204317          # coefficient of determination (method 1)

np.corrcoef(y, y_)[0, 1] ** 2
#> 0.7380949412509705          # coefficient of determination (method 6)
# find the most correlated variables
cols_x = ['number_of_open_credit_lines_and_loans', 'number_of_dependents',
          'number_real_estate_loans_or_lines']
train_not_null = train[cols_x][rows_train]
test_not_null = test[cols_x][rows_test]

lr = LinearRegression()
lr.fit(train_not_null, train['monthly_income'][rows_train])
print(lr.score(test_not_null, test['monthly_income'][rows_test]))
# score: 0.0478125755667

knn = KNeighborsRegressor(n_neighbors=120)
knn.fit(train_not_null, train['monthly_income'][rows_train])
print(knn.score(test_not_null, test['monthly_income'][rows_test]))
# score: 0.00680687486842

# use linear regression model as imputer

# In[10]:

train[rows_train].corr().ix[:, 5]

# In[11]:

train_null = train[cols_x][~rows_train]
test_null = test[cols_x][~rows_test]
def scicrossvalidate(data, k=3):
    # sklearn.cross_validation is the legacy module; model_selection in newer versions.
    X_train, X_test, y_train, y_test = cross_validation.train_test_split(
        [data[i]['input'] for i in range(len(data))],
        [data[i]['result'] for i in range(len(data))],
        test_size=0.4, random_state=0)
    neigh = KNeighborsRegressor(n_neighbors=k, weights=scigaussian)
    neigh.fit(X_train, y_train)
    return neigh.score(X_test, y_test)
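# `scigaussian` is not defined in this fragment; below is a plausible hedged
# sketch of such a callable. KNeighborsRegressor passes an array of neighbor
# distances to a weights callable and expects an array of weights of the same
# shape. The bandwidth sigma is a guess, not from the source.
import numpy as np

def scigaussian(distances, sigma=10.0):
    # Closer neighbors get exponentially larger weights.
    return np.exp(-np.asarray(distances) ** 2 / (2 * sigma ** 2))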
ax4.scatter(range(len(y_test)), y_test, label='data')
ax4.plot(range(len(y_test)), y_pred_gb, color='black', label='GB model')
ax4.legend()

f2, (ax5, ax6) = plt.subplots(1, 2, figsize=(30, 10))

# quadratic (degree-2 polynomial)
ax5.scatter(range(len(y_test)), y_test, label='data')
ax5.plot(range(len(y_test)), y_pred_qd, color='blue', label='Quadratic model')
ax5.legend()

# KNN
ax6.scatter(range(len(y_test)), y_test, label='data')
ax6.plot(range(len(y_test)), y_pred_knn, color='black', label='KNN model')
ax6.legend()

# Note: for regressors, score() reports R^2 rather than accuracy.
print("Score of Linear Regression Model:", clf_lr.score(x_test, y_test))
print("Score of SVM-RBF Model:", clf_svr.score(x_test, y_test))
print("Score of Random Forest Model:", clf_rf.score(x_test, y_test))
print("Score of Gradient Boosting Model:", clf_gb.score(x_test, y_test))
print("Score of quadratic model:", clfpoly2.score(x_test, y_test))
print("Score of knn Model:", clfknn.score(x_test, y_test))
# distance metrics: http://scikit-learn.org/stable/modules/generated/sklearn.neighbors.DistanceMetric.html#sklearn.neighbors.DistanceMetric
# KNN: http://scikit-learn.org/stable/modules/generated/sklearn.neighbors.KNeighborsClassifier.html#sklearn.neighbors.KNeighborsClassifier
# http://scikit-learn.org/stable/modules/generated/sklearn.neighbors.KNeighborsRegressor.html

print("Regression test...")
games_train, games_test, regress_train, regress_test = train_test_split(
    games_matrix, regress_vec, test_size=TRAIN_SPLIT)
n = KNeighborsRegressor(
    n_neighbors=NEIGHBORS,   # saw highest accuracy with 20
    algorithm='kd_tree',
    weights='uniform',       # saw highest accuracy with uniform
    #weights='distance',
    #metric='minkowski',
    p=2,
    n_jobs=3,                # number of CPU cores (-1 for all)
)
n.fit(games_train, regress_train)
print("Accuracy training data:", n.score(games_train, regress_train))
print("Accuracy test data:", n.score(games_test, regress_test))
print()

if not RUN_TOURNAMENT:
    print("Some predictions:")
    for i in range(0, 10):
        print(" ", n.predict([games_test[i]]), regress_test[i])
    print()

print("Classification test...")
games_train, games_test, class_train, class_test = train_test_split(
    games_matrix, class_vec, test_size=TRAIN_SPLIT)
n = KNeighborsClassifier(
    n_neighbors=NEIGHBORS,
    algorithm='kd_tree',
    weights='uniform',       # saw highest accuracy with uniform
class DataHandler(object):
    """Handle the diabetes data set."""

    def __init__(self):
        self.times = []
        self.labels = None
        self.features = None
        self.data_list = parse_all()
        self.user = self.data_list[0]['data']
        self.activities = []
        self.dates = []
        self.glucose = []
        self.k_fold = []
        self.classifier = KNeighborsRegressor()

    def parse_data(self):
        """Parse the data_list into our features."""
        train_data = []
        test_data = []
        for user in self.data_list:
            for data in user['data']:
                # I want to compress the whole dataset into a 24-hour timeframe
                time = data['date'].replace(year=2000, month=1, day=1)
                # Save the features
                self.dates.append(time)
                self.activities.append(data['activity'])
                self.glucose.append(data['glucose'])
                train_data.append([int(time.strftime("%H")),
                                   int(time.strftime("%M")),
                                   data['activity']])
                test_data.append([float(data['glucose'])])
        self.features = np.array(train_data)
        self.labels = np.array(test_data)

    def load_k_fold(self):
        """Load the test cases into K folds (legacy KFold(n, n_folds=...) API)."""
        self.k_fold = KFold(len(self.data_list), n_folds=int(len(self.data_list) * 0.2))

    def fit(self):
        """Train the regressor with the training data."""
        self.classifier.fit(self.features, self.labels)

    def print_score(self):
        """Calculate the score on the given k-fold test cases."""
        for train, test in self.k_fold:
            print(self.classifier.score(self.features[test], self.labels[test]))

    def predict(self, date, activity):
        """Predict the glucose rate for a given activity and date."""
        # predict expects a 2D array of samples
        return self.classifier.predict([[int(date.strftime('%H')),
                                         int(date.strftime('%M')),
                                         activity]])

    def plot(self):
        """Matplotlib it!"""
        plt.plot(self.dates, self.glucose, 'go')
        plt.plot(self.dates, self.activities, 'ro')
        plt.show()
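# A hedged usage sketch for DataHandler; parse_all() and its data format are
# defined elsewhere in the project, so this only illustrates the intended call
# order, under the assumption the data files are available.
# handler = DataHandler()
# handler.parse_data()     # build feature/label arrays
# handler.load_k_fold()    # prepare the folds
# handler.fit()            # train the k-NN regressor
# handler.print_score()    # R^2 per fold
# handler.plot()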