import numpy as np
from sklearn import neighbors, preprocessing
from sklearn.model_selection import train_test_split

import getFeatures


def basicPredictor(vecnumber):
    procvec, diagvec, diagdict = getFeatures.getFeatures()
    procvec1 = procvec["Medicare"]
    diagvec1 = diagvec["Medicare"]
    procvec2 = procvec["Medicaid"]
    diagvec2 = diagvec["Medicaid"]
    procvec3 = procvec["Uninsured"]
    diagvec3 = diagvec["Uninsured"]

    keylist = sorted(procvec1.keys())  # dict.keys() has no .sort() in Python 3
    veclist = []
    diaglist = []
    statenamelist = []
    i = vecnumber
    for key in keylist:
        veclist.append(procvec1[key][i])
        diaglist.append(diagvec1[key])
        try:
            veclist.append(procvec2[key][i])
            diaglist.append(diagvec2[key])
        except KeyError:
            pass
        try:
            veclist.append(procvec3[key][i])
            diaglist.append(diagvec3[key])
        except KeyError:
            pass
        statenamelist.append(key)
    veclist = np.asarray(veclist)
    diaglist = np.asarray(diaglist)

    '''
    # Encodes only the top 5 diagnoses for each input vector; does not give good accuracy!
    diagindex = np.argpartition(diaglist, -5, axis=1)[:, -5:]
    hotlist = []
    for i in range(len(diaglist)):
        hot = np.zeros((len(diaglist[0]),), dtype=float)
        hot[diagindex[i]] = 1.0
        hotlist.append(hot)
    hotlist = np.asarray(hotlist)
    hotlist = preprocessing.normalize(hotlist)
    '''
    # print(diagindex)
    hotlist = preprocessing.normalize(diaglist)

    standardscaler = preprocessing.StandardScaler()
    vecs = standardscaler.fit_transform(veclist)

    # Test how the size of the training data affects accuracy.
    sizevec = [0.11, 0.22, 0.33, 0.44, 0.55, 0.66]
    scores = []  # R^2 scores, not errors: KNeighborsRegressor.score() returns R^2
    for test_size in sizevec:
        xtrain, xtest, ytrain, ytest = train_test_split(
            vecs, hotlist, test_size=test_size, random_state=17)
        knn = neighbors.KNeighborsRegressor()
        knn.fit(xtrain, ytrain)
        scores.append(knn.score(xtest, ytest))
    print(scores)
hyper_params = [{
    'n_neighbors': (..., 100, 1000),  # leading values lost to truncation in the source
    'weights': ('uniform', 'distance'),
    'p': (1, 2),
}]

# Choose the regressor est (= estimator).
# Change this line to try different regressors.
est = neighbors.KNeighborsRegressor()

# Use GridSearchCV to find the best combination of hyper-parameters
gs = GridSearchCV(est, cv=10, param_grid=hyper_params,
                  verbose=2, n_jobs=n_jobs, scoring='r2')

# Train the MLA and time it
t0 = time.time()
gs.fit(x_train, y_train.ravel())
runtime = time.time() - t0
print("kNN complexity and bandwidth selected and model fitted in %.6f s" % runtime)
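# The fitted search above is never inspected; a minimal follow-up, assuming
# `gs` is the fitted GridSearchCV object from the snippet above:
print("best params:", gs.best_params_)
print("best CV r2: %.4f" % gs.best_score_)
best_knn = gs.best_estimator_  # refit on the full training set by default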
def SVR_train(*data):
    X, Y = data

    #### 3.1 Decision tree regression ####
    from sklearn import tree
    model_DecisionTreeRegressor = tree.DecisionTreeRegressor()
    #### 3.2 Linear regression ####
    from sklearn import linear_model
    model_LinearRegression = linear_model.LinearRegression()
    #### 3.3 SVM regression ####
    from sklearn import svm
    model_SVR = svm.SVR()
    model_SVR2 = svm.SVR(kernel='rbf', C=100, gamma=0.1)
    #### 3.4 KNN regression ####
    from sklearn import neighbors
    model_KNeighborsRegressor = neighbors.KNeighborsRegressor()
    #### 3.5 Random forest regression ####
    from sklearn import ensemble
    model_RandomForestRegressor = ensemble.RandomForestRegressor(
        n_estimators=20)  # 20 trees
    #### 3.6 AdaBoost regression ####
    model_AdaBoostRegressor = ensemble.AdaBoostRegressor(
        n_estimators=50)  # 50 estimators
    #### 3.7 GBRT regression ####
    model_GradientBoostingRegressor = ensemble.GradientBoostingRegressor(
        n_estimators=100)  # 100 trees
    #### 3.8 Bagging regression ####
    from sklearn.ensemble import BaggingRegressor
    model_BaggingRegressor = BaggingRegressor()
    #### 3.9 ExtraTree (extremely randomized tree) regression ####
    from sklearn.tree import ExtraTreeRegressor
    model_ExtraTreeRegressor = ExtraTreeRegressor()

    # Create the (parametrised) models
    models = [
        ("model_DecisionTreeRegressor", model_DecisionTreeRegressor),
        ("model_LinearRegression", model_LinearRegression),
        ("model_SVR", model_SVR2),  # or model_SVR
        ("model_KNeighborsRegressor", model_KNeighborsRegressor),
        ("model_RandomForestRegressor", model_RandomForestRegressor),
        ("model_AdaBoostRegressor", model_AdaBoostRegressor),
        ("model_GradientBoostingRegressor", model_GradientBoostingRegressor),
        ("model_BaggingRegressor", model_BaggingRegressor),
        ("model_ExtraTreeRegressor", model_ExtraTreeRegressor),
    ]

    from sklearn.model_selection import KFold
    from sklearn.metrics import r2_score
    import datetime
    import matplotlib.pyplot as plt

    for m in models:
        kf = KFold(n_splits=2, shuffle=False)
        for train_index, test_index in kf.split(X):
            # X_train, y_train are the training set of this fold;
            # X_test, y_test are the validation set.
            X_train, X_test = X[train_index], X[test_index]
            y_train, y_test = Y[train_index], Y[test_index]
            print('======================================')
            starttime = datetime.datetime.now()
            print("Training model %s:" % m[0])
            m[1].fit(X_train, y_train)
            # Make an array of predictions on the test set
            pred = m[1].predict(X_test)
            # Output the score for each model
            score = m[1].score(X_test, y_test)
            print("%s:\n%0.3f" % (m[0], score))
            r2 = r2_score(y_test, pred)
            print('r2: ', r2)
            endtime = datetime.datetime.now()
            print('%s train + predict time (seconds):' % m[0],
                  (endtime - starttime).seconds)
            plt.figure()
            plt.plot(np.arange(len(pred)), y_test, 'go-', label='true value')
            plt.plot(np.arange(len(pred)), pred, 'ro-', label='predict value')
            plt.title('score: %f' % score)
            plt.legend()
            plt.show()
from sklearn.model_selection import train_test_split
from sklearn import neighbors
from sklearn.datasets import make_regression
from matplotlib import pyplot as plt
import numpy as np

# ----------------- Generate Synthetic Data ---------------#
X_R, y_R = make_regression(n_samples=100, n_features=1,
                           n_informative=1, bias=150.0, noise=30)
fig, subaxes = plt.subplots(5, 1, figsize=(11, 8), dpi=100)
X = np.linspace(-3, 3, 500).reshape(-1, 1)
X_train, X_test, y_train, y_test = train_test_split(X_R, y_R, random_state=0)

# --------------------------- KNN -------------------------#
# One subplot per neighbor count: unpack (axis, K) pairs.
for ax, K in zip(subaxes, [1, 3, 7, 15, 59]):
    knn_reg = neighbors.KNeighborsRegressor(n_neighbors=K)
    knn_reg.fit(X_train, y_train)
    y_predict_output = knn_reg.predict(X)
    ax.plot(X, y_predict_output)
    ax.plot(X_train, y_train, 'o', alpha=0.9, label='Train')
    ax.plot(X_test, y_test, '^', alpha=0.9, label='Test')
    ax.set_xlabel('Input feature')
    ax.set_ylabel('Target value')
    ax.set_title('KNN Regression (K={})'.format(K))
    ax.legend()
plt.tight_layout()
plt.show()
import numpy as np
import matplotlib.pyplot as plt
from sklearn import neighbors

# Setup (reconstructed from the grid definition below): sample input points
# uniformly in [-amplitude/2, amplitude/2].
amplitude = 10
num_points = 100
X = amplitude * np.random.rand(num_points, 1) - 0.5 * amplitude

# Compute target and add noise
y = np.sinc(X).ravel()
y += 0.2 * (0.5 - np.random.rand(y.size))

# Plot input data
plt.figure()
plt.scatter(X, y, s=40, c='k', facecolors='none')
plt.title('Input data')

# Create the 1D grid with 10 times the density of the input data
x_values = np.linspace(-0.5 * amplitude, 0.5 * amplitude,
                       10 * num_points)[:, np.newaxis]

# Number of neighbors to consider
n_neighbors = 8

# Define and train the regressor
knn_regressor = neighbors.KNeighborsRegressor(n_neighbors, weights='distance')
y_values = knn_regressor.fit(X, y).predict(x_values)

plt.figure()
plt.scatter(X, y, s=40, c='k', facecolors='none', label='input data')
plt.plot(x_values, y_values, c='k', linestyle='--', label='predicted values')
plt.xlim(X.min() - 1, X.max() + 1)
plt.ylim(y.min() - 0.2, y.max() + 0.2)
plt.axis('tight')
plt.legend()
plt.title('K Nearest Neighbors Regressor')
plt.show()
def knn(data, startAt, stopAt=None):
    """
    Forecasts the points between startAt and stopAt with the k-nearest
    neighbors method. Automatically finds the best number of neighbors.
    If stopAt is not provided, it defaults to the length of data.

    Parameters:
        data (pandas.DataFrame): Data returned by prepare_data (may be differentiated)
        startAt (int): Index where the forecast starts
        stopAt (int): Index where the forecast stops (default is None)

    Returns:
        predictions (list): The forecast from startAt up to stopAt
    """
    data_copy = data.copy()
    if stopAt is None:
        stopAt = len(data_copy)
    periods = stopAt - startAt

    from fastai.tabular import add_datepart
    add_datepart(data_copy, 'Date')
    data_copy.drop('Elapsed', axis=1, inplace=True)

    # Flag days adjacent to weekends: we assume Fridays and Mondays matter more.
    # Dayofweek: 0 is Monday, 1 is Tuesday, ..., 4 is Friday.
    data_copy['mon_fri'] = 0
    data_copy['mon_fri'].mask(data_copy['Dayofweek'].isin([0, 4]), 1, inplace=True)

    train = data_copy[:startAt]
    valid = data_copy[startAt:stopAt]

    from sklearn import neighbors
    from sklearn.model_selection import GridSearchCV
    from sklearn.preprocessing import MinMaxScaler

    scaler = MinMaxScaler(feature_range=(0, 1))
    x_train_scaled = scaler.fit_transform(train.drop('Close', axis=1))
    x_train = pd.DataFrame(x_train_scaled)
    y_train = train['Close']
    # Reuse the statistics fitted on the training set (transform, not
    # fit_transform) so no validation data leaks into the scaler.
    x_valid_scaled = scaler.transform(valid.drop('Close', axis=1))
    x_valid = pd.DataFrame(x_valid_scaled)
    y_valid = valid['Close']

    params = {'n_neighbors': [2, 3, 4, 5, 6, 7, 8, 9]}
    knn = neighbors.KNeighborsRegressor()
    model = GridSearchCV(knn, params, cv=5)  # the `iid` argument was removed in newer sklearn
    model.fit(x_train, y_train)
    predictions = model.predict(x_valid)
    return predictions
features = ['GrLivArea']
#features = ['TotalBsmtSF']
viz_cont_cont(house_train, features, target)

features_to_filter = ['Id']
filter_features(house_train, features_to_filter)

# Do one-hot encoding for all the categorical features
house_train1 = one_hot_encode(house_train)
house_train1.shape
house_train1.info()

#filter_features(house_train1, ['SalePrice','log_sale_price'])
filter_features(house_train1, ['SalePrice'])

X_train = house_train1
y_train = house_train['SalePrice']
X_train.shape

# Step 1 model: use a regressor, since SalePrice is continuous
# (a classifier is the wrong estimator for model-based feature selection here)
rf_estimator = ensemble.RandomForestRegressor(n_estimators=100)
feature_imp_df, X_train1 = feature_selection_from_model(
    rf_estimator, X_train, y_train)
X_train1.shape

scaled_model = get_scale_model(X_train1)
X_train1 = scaled_model.transform(X_train1)

knn_estimator = neighbors.KNeighborsRegressor()
knn_grid = {'n_neighbors': [15, 20]}
model = fit_model(knn_estimator, knn_grid, X_train1, y_train)
#Best score: 0.182414942974
#.score: 0.17392101913392907
df["enc_state"] = state_encoder.fit_transform(df["State"]) df["enc_state"] ################ df.head() df.drop("State", axis=1, inplace=True) df.info() X = df.drop("Profit", axis=1) y = df["Profit"] Xtrain, Xtest, ytrain, ytest = model_selection.train_test_split( X, y, test_size=0.15, random_state=42) Xtrain.info() knnmodel = neighbors.KNeighborsRegressor(n_neighbors=11) knnmodel.fit(Xtrain, ytrain) #fit doesnt create model ,but it create a data structure which help us to search easier #kdtree #balltree #brute #alogorithm="........." prediction = knnmodel.predict(Xtest) print(np.sqrt(metrics.mean_squared_error(ytest, prediction))) X[:3] #standard scaling avg = df.rs.mean() sd = df.rs.std()
# result = clf.predict(x_test)
# score = mse(y_test, result)
# plt.figure()
# plt.plot(np.arange(len(result)), y_test, 'go-', label='true value')
# plt.plot(np.arange(len(result)), result, 'ro-', label='predict value')
# plt.title('score: %f' % score)
# plt.legend()
# plt.show()

from sklearn import neighbors
from sklearn import svm
from sklearn import ensemble

# rf = ensemble.RandomForestRegressor(n_estimators=20)  # 20 trees
# svr = svm.SVR()
knn = neighbors.KNeighborsRegressor(n_neighbors=4)

from sklearn.model_selection import KFold

cv = KFold(n_splits=5, shuffle=True, random_state=42)
results = []
sub_array = []
train = train.values
y_train = train_Y.values
test1 = test1.values

# model xgb _ cv
for model in [knn]:
    for traincv, testcv in cv.split(train, y_train):
        model.fit(train[traincv], y_train[traincv])
        y_tmp = model.predict(train[testcv])
def k_nearest_neighbors(other_args: List[str], s_ticker: str, df_stock: pd.DataFrame):
    """
    Train KNN model

    Parameters
    ----------
    other_args: List[str]
        List of argparse arguments
    s_ticker: str
        Ticker
    df_stock: pd.DataFrame
        Dataframe of stock prices

    Returns
    -------
    """
    parser = argparse.ArgumentParser(
        add_help=False,
        prog="knn",
        description="""
            K nearest neighbors is a simple algorithm that stores all available
            cases and predicts the numerical target based on a similarity
            measure (e.g. distance functions).
        """,
    )
    parser.add_argument("-i", "--input", action="store", dest="n_inputs",
                        type=check_positive, default=40,
                        help="number of days to use as input for prediction.")
    parser.add_argument("-d", "--days", action="store", dest="n_days",
                        type=check_positive, default=5,
                        help="prediction days.")
    parser.add_argument("-j", "--jumps", action="store", dest="n_jumps",
                        type=check_positive, default=1,
                        help="number of jumps in training data.")
    parser.add_argument("-n", "--neighbors", action="store", dest="n_neighbors",
                        type=check_positive, default=20,
                        help="number of neighbors to use on the algorithm.")
    parser.add_argument("-e", "--end", action="store", type=valid_date,
                        dest="s_end_date", default=None,
                        help="The end date (format YYYY-MM-DD) to select for testing")
    parser.add_argument("-t", "--test_size", default=0.2, dest="valid_split",
                        type=float,
                        help="Percentage of data to validate in sample")
    parser.add_argument("-p", "--pp", action="store", dest="s_preprocessing",
                        default="none",
                        choices=["normalization", "standardization", "minmax", "none"],
                        help="pre-processing data.")

    try:
        ns_parser = parse_known_args_and_warn(parser, other_args)
        if not ns_parser:
            return

        (
            X_train,
            X_valid,
            y_train,
            y_valid,
            _,
            _,
            _,
            y_dates_valid,
            forecast_data_input,
            dates_forecast_input,
            scaler,
            is_error,
        ) = prepare_scale_train_valid_test(df_stock["5. adjusted close"], ns_parser)
        if is_error:
            print("Error preparing data")
            return

        print(
            f"Training on {X_train.shape[0]} sequences of length {X_train.shape[1]}. "
            f"Using {X_valid.shape[0]} sequences of length {X_valid.shape[1]} for validation"
        )
        future_dates = get_next_stock_market_days(dates_forecast_input[-1],
                                                  n_next_days=ns_parser.n_days)

        # Machine Learning model
        knn = neighbors.KNeighborsRegressor(n_neighbors=ns_parser.n_neighbors)
        knn.fit(
            X_train.reshape(X_train.shape[0], X_train.shape[1]),
            y_train.reshape(y_train.shape[0], y_train.shape[1]),
        )

        preds = knn.predict(X_valid.reshape(X_valid.shape[0], X_valid.shape[1]))
        forecast_data = knn.predict(forecast_data_input.reshape(1, -1))
        forecast_data_df = pd.DataFrame(
            [i if i > 0 else 0 for i in forecast_data.T], index=future_dates)

        print_pretty_prediction(forecast_data_df[0],
                                df_stock["5. adjusted close"].values[-1])
        plot_data_predictions(
            df_stock, preds, y_valid, y_dates_valid, scaler,
            f"KNN Model with {ns_parser.n_neighbors} Neighbors on {s_ticker}",
            forecast_data_df, 1,
        )

    except Exception as e:
        print(e)
        print("")
from sklearn import model_selection
X_train, X_test, y_train, y_test = model_selection.train_test_split(X, y, test_size=0.3)

# In[30]:

# In[55]:

from sklearn import neighbors, metrics
from matplotlib import pyplot as plt

knn = neighbors.KNeighborsRegressor(n_neighbors=12)
knn.fit(X_train, y_train)
y_pred = knn.predict(X_test)

# In[57]:

sizes = {}  # key: (true, predicted) coordinates; value: number of points at those coordinates
for (yt, yp) in zip(list(y_test), list(y_pred)):
    if (yt, yp) in sizes.keys():
        sizes[(yt, yp)] += 1
    else:
        sizes[(yt, yp)] = 1
import pandas as pd
import numpy as np
import sklearn.model_selection as model_selection  # sklearn.cross_validation was removed
import sklearn.neighbors as neighbors
from sklearn.preprocessing import scale
from sklearn.datasets import load_boston

boston = load_boston()
features = scale(boston.data)
target = boston.target

kf = model_selection.KFold(n_splits=5, shuffle=True, random_state=42)

res = []
for par in np.linspace(1.0, 10.0, num=200):
    print('p = %f' % par)
    estimator = neighbors.KNeighborsRegressor(n_neighbors=5, weights='distance',
                                              p=par, metric='minkowski')
    score = model_selection.cross_val_score(
        estimator, features, target, cv=kf,
        scoring='neg_mean_squared_error').mean()
    res.append(score)
    print('score = %f' % score)
print(sorted(res))
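# For intuition on what the p-grid above sweeps: Minkowski distance reduces to
# Manhattan distance at p=1 and Euclidean at p=2. A quick check using only scipy:
from scipy.spatial.distance import minkowski

u, v = [0.0, 0.0], [3.0, 4.0]
print(minkowski(u, v, p=1))  # 7.0 (Manhattan: |3| + |4|)
print(minkowski(u, v, p=2))  # 5.0 (Euclidean: sqrt(9 + 16))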
def train_models(data, attrs, Target) -> None:
    warnings.filterwarnings("ignore", category=FutureWarning,
                            module="sklearn", lineno=166)

    # Machine Learning Algorithm (MLA) Selection and Initialization
    MLA = [
        #ensemble.AdaBoostRegressor(),
        #ensemble.BaggingRegressor(),
        #ensemble.ExtraTreesRegressor(n_estimators=10),
        #ensemble.GradientBoostingRegressor(),
        #XGBRegressor(),
        #gaussian_process.GaussianProcessRegressor(),
        ensemble.RandomForestRegressor(n_estimators=30, max_depth=5),
        linear_model.Ridge(alpha=0.0001),
        linear_model.Lasso(alpha=0.0001, selection='random'),
        neighbors.KNeighborsRegressor(),
        svm.SVR(kernel='rbf', gamma='auto'),
        tree.DecisionTreeRegressor(max_depth=5),
    ]

    # Split dataset in cross-validation with this splitter class:
    # http://scikit-learn.org/stable/modules/generated/sklearn.model_selection.ShuffleSplit.html#sklearn.model_selection.ShuffleSplit
    # Note: this is an alternative to train_test_split.
    cv_split = model_selection.ShuffleSplit(
        n_splits=10, test_size=.2, train_size=.8,
        random_state=0)  # runs each model 10x with an 80/20 train/test split

    # Create table to compare MLA metrics
    MLA_columns = [
        'MLA Name', 'MLA Train Accuracy Mean', 'MLA Test Accuracy Mean',
        'MLA Test Accuracy 3*STD', 'MLA Time'
    ]
    MLA_compare = pd.DataFrame(columns=MLA_columns)

    # Create table to compare MLA predictions
    MLA_predict = data[Target]
    data_target = utils.column_or_1d(MLA_predict.values.ravel(), warn=True)
    data_features = data[attrs]
    pd.options.mode.chained_assignment = None

    # Index through MLA and save performance to table
    row_index = 0
    for alg in MLA:
        # Set name and parameters
        MLA_name = alg.__class__.__name__
        MLA_compare.loc[row_index, 'MLA Name'] = MLA_name
        # MLA_compare.loc[row_index, 'MLA Parameters'] = str(alg.get_params())
        # print(MLA_name)

        # Score model with cross validation:
        # http://scikit-learn.org/stable/modules/generated/sklearn.model_selection.cross_validate.html#sklearn.model_selection.cross_validate
        cv_results = model_selection.cross_validate(
            alg, data_features, data_target, cv=cv_split,
            scoring='neg_mean_absolute_error', return_train_score=True)
        MLA_compare.loc[row_index, 'MLA Time'] = cv_results['fit_time'].mean()
        MLA_compare.loc[
            row_index, 'MLA Train Accuracy Mean'] = cv_results['train_score'].mean()
        MLA_compare.loc[
            row_index, 'MLA Test Accuracy Mean'] = cv_results['test_score'].mean()
        # If this is a non-biased random sample, then +/-3 standard deviations (std)
        # from the mean should statistically capture 99.7% of the subsets.
        MLA_compare.loc[row_index, 'MLA Test Accuracy 3*STD'] = cv_results[
            'test_score'].std() * 3  # let's know the worst that can happen!

        # Save MLA predictions - see section 6 for usage
        alg.fit(data_features, data_target)
        MLA_predict[MLA_name] = alg.predict(data_features)
        row_index += 1

    MLA_compare.sort_values(by=['MLA Test Accuracy Mean'],
                            ascending=False, inplace=True)
    print(MLA_compare)
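# Because scoring='neg_mean_absolute_error', the "Accuracy" columns above hold
# negated MAE (closer to 0 is better). A self-contained sketch of recovering
# plain MAE from cross_validate (the toy data is generated here, not the project's):
from sklearn import neighbors
from sklearn.datasets import make_regression
from sklearn.model_selection import cross_validate

Xd, yd = make_regression(n_samples=200, n_features=4, noise=10, random_state=0)
cv_res = cross_validate(neighbors.KNeighborsRegressor(), Xd, yd,
                        scoring='neg_mean_absolute_error', cv=5)
print("mean MAE: %.3f" % (-cv_res['test_score'].mean()))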
n_splits = 5
kf = KFold(n_splits=n_splits)
for i, (train_ind, val_ind) in enumerate(kf.split(train)):
    # '血糖' = blood glucose (the target column)
    tra = train.drop(['血糖'], axis=1).values[train_ind]
    tra_label = train['血糖'].values[train_ind]
    val = train.drop(['血糖'], axis=1).values[val_ind]
    val_label = train['血糖'].values[val_ind]

    pred = pd.DataFrame()

    # lasso
    lasso = linear_model.Lasso(alpha=0.005455)
    lasso.fit(tra, tra_label)
    pred['lasso'] = lasso.predict(val)

    # knn
    knn = neighbors.KNeighborsRegressor(n_neighbors=25, weights='uniform',
                                        n_jobs=-1)
    knn.fit(tra, tra_label)
    pred['knn'] = knn.predict(val)

    # svr
    svr = svm.SVR(kernel='rbf', C=10, gamma=0.01)
    svr.fit(tra, tra_label)
    pred['svr'] = svr.predict(val)

    # xgboost
    dtrain = xgb.DMatrix(tra, label=tra_label)
    dval = xgb.DMatrix(val)
    base_score = train['血糖'].sum() / train['血糖'].shape[0]
    param_gbtree = {
        'booster': 'gbtree',
        'eta': 0.01,
        'gamma': 0,
#print header
#X, y = preprocessDataWithoutScale(data)

joblib.dump(X_scaler, 'pickle-final/X_scaler.pkl')
joblib.dump(y_scaler, 'pickle-final/y_scaler.pkl')
joblib.dump(imp, 'pickle-final/Imputer.pkl')
joblib.dump(vec, 'pickle-final/Vector.pkl')

estimators = []

# K-Nearest Neighbors
estimators.append({
    "name": "KNN",
    "model": neighbors.KNeighborsRegressor(weights="uniform", n_neighbors=5)
})

# Gradient Boosting Regressor
estimators.append({
    "name": "GBR",
    "model": ensemble.GradientBoostingRegressor(max_features=0.1,
                                                n_estimators=2100,
                                                max_depth=6,
                                                min_samples_leaf=1,
                                                learning_rate=0.02)
})

# Random Forest
def dict_method_reg():
    dict_method = {}

    # 1st part
    """1 SVR"""
    me1 = SVR(kernel='rbf', gamma='auto', degree=3, tol=1e-3, epsilon=0.1,
              shrinking=False, max_iter=2000)
    cv1 = 5
    scoring1 = 'r2'
    param_grid1 = [{'C': [1, 0.75, 0.5, 0.25, 0.1],
                    'epsilon': [0.01, 0.001, 0.0001]}]
    dict_method.update({"SVR-set": [me1, cv1, scoring1, param_grid1]})

    """2 BayesianRidge"""
    me2 = BayesianRidge(alpha_1=1e-06, alpha_2=1e-06, compute_score=False,
                        copy_X=True, fit_intercept=True, lambda_1=1e-06,
                        lambda_2=1e-06, n_iter=300, normalize=False,
                        tol=0.01, verbose=False)
    cv2 = 5
    scoring2 = 'r2'
    param_grid2 = [{'alpha_1': [1e-07, 1e-06, 1e-05],
                    'alpha_2': [1e-07, 1e-05, 1e-03]}]
    dict_method.update({'BayR-set': [me2, cv2, scoring2, param_grid2]})

    """3 SGDR (L2)"""
    me3 = SGDRegressor(alpha=0.0001, average=False, epsilon=0.1, eta0=0.01,
                       fit_intercept=True, l1_ratio=0.15,
                       learning_rate='invscaling', loss='squared_loss',
                       max_iter=1000, penalty='l2', power_t=0.25,
                       random_state=0, shuffle=True, tol=0.01, verbose=0,
                       warm_start=False)
    cv3 = 5
    scoring3 = 'r2'
    param_grid3 = [{'alpha': [100, 10, 1, 0.1, 0.01, 0.001, 0.0001, 1e-05]}]
    dict_method.update({'SGDRL2-set': [me3, cv3, scoring3, param_grid3]})

    """4 KNR"""
    me4 = neighbors.KNeighborsRegressor(n_neighbors=5, weights='uniform',
                                        algorithm='auto', leaf_size=30, p=2,
                                        metric='minkowski')
    cv4 = 5
    scoring4 = 'r2'
    param_grid4 = [{'n_neighbors': [3, 4, 5, 6]}]
    dict_method.update({"KNR-set": [me4, cv4, scoring4, param_grid4]})

    """5 KernelRidge"""
    kernel = 1.0 * RBF(1.0)
    me5 = kernel_ridge.KernelRidge(alpha=1, kernel=kernel, gamma="scale",
                                   degree=3, coef0=1, kernel_params=None)
    cv5 = 5
    scoring5 = 'r2'
    param_grid5 = [{'alpha': [100, 10, 1, 0.1, 0.01, 0.001]}]
    dict_method.update({'KRR-set': [me5, cv5, scoring5, param_grid5]})

    """6 GPR"""
    # kernel = 1.0 * RBF(1.0)
    kernel = Matern(length_scale=0.1, nu=0.5)
    me6 = gaussian_process.GaussianProcessRegressor(
        kernel=kernel, alpha=1e-10, optimizer='fmin_l_bfgs_b',
        n_restarts_optimizer=10, normalize_y=False, copy_X_train=True,
        random_state=0)
    cv6 = 5
    scoring6 = 'r2'
    param_grid6 = [{'alpha': [1e-11, 1e-10, 1e-9, 1e-8, 1e-7]}]
    dict_method.update({"GPR-set": [me6, cv6, scoring6, param_grid6]})

    # 2nd part
    """7 RFR"""
    me7 = ensemble.RandomForestRegressor(
        n_estimators=100, max_depth=None, min_samples_split=2,
        min_samples_leaf=1, min_weight_fraction_leaf=0.0, max_leaf_nodes=None,
        min_impurity_decrease=0.0, min_impurity_split=None, bootstrap=True,
        oob_score=False, random_state=None, verbose=0, warm_start=False)
    cv7 = 5
    scoring7 = 'r2'
    param_grid7 = [{'max_depth': [3, 4, 5, 6]}]
    dict_method.update({"RFR-em": [me7, cv7, scoring7, param_grid7]})

    """8 GBR"""
    me8 = ensemble.GradientBoostingRegressor(
        loss='ls', learning_rate=0.1, n_estimators=100, subsample=1.0,
        criterion='friedman_mse', min_samples_split=2, min_samples_leaf=1,
        min_weight_fraction_leaf=0., max_depth=3, min_impurity_decrease=0.,
        min_impurity_split=None, init=None, random_state=None,
        max_features=None, alpha=0.9, verbose=0, max_leaf_nodes=None,
        warm_start=False, presort='auto')
    cv8 = 5
    scoring8 = 'r2'
    param_grid8 = [{'max_depth': [3, 4, 5, 6]}]
    dict_method.update({'GBR-em': [me8, cv8, scoring8, param_grid8]})

    """9 AdaBR"""
    dt = DecisionTreeRegressor(criterion="mae", splitter="best",
                               max_features=None, max_depth=3,
                               min_samples_split=2)
    me9 = AdaBoostRegressor(dt, n_estimators=100, learning_rate=1,
                            loss='square', random_state=0)
    cv9 = 5
    scoring9 = 'r2'
    param_grid9 = [{'n_estimators': [50, 120, 100, 200]}]
    dict_method.update({"AdaBR-em": [me9, cv9, scoring9, param_grid9]})

    """10 TreeR"""
    me10 = DecisionTreeRegressor(
        criterion='mse', splitter='best', max_depth=None, min_samples_split=2,
        min_samples_leaf=1, min_weight_fraction_leaf=0.0, max_features=None,
        random_state=0, max_leaf_nodes=None, min_impurity_decrease=0.0,
        min_impurity_split=None, presort=False)
    cv10 = 5
    scoring10 = 'r2'
    param_grid10 = [{'max_depth': [3, 4, 5, 6], 'min_samples_split': [2, 3, 4]}]
    dict_method.update({'TreeC-em': [me10, cv10, scoring10, param_grid10]})

    """11 ElasticNet"""
    me11 = ElasticNet(alpha=1.0, l1_ratio=0.7, fit_intercept=True,
                      normalize=False, precompute=False, max_iter=1000,
                      copy_X=True, tol=0.0001, warm_start=False,
                      positive=False, random_state=None)
    cv11 = 5
    scoring11 = 'r2'
    param_grid11 = [{'alpha': [0.0001, 0.001, 0.01, 0.1, 1],
                     'l1_ratio': [0.3, 0.5, 0.8]}]
    dict_method.update({"ElasticNet-L1": [me11, cv11, scoring11, param_grid11]})

    """12 Lasso"""
    me12 = Lasso(alpha=1.0, fit_intercept=True, normalize=False,
                 precompute=False, copy_X=True, max_iter=1000, tol=0.001,
                 warm_start=False, positive=False, random_state=None)
    cv12 = 5
    scoring12 = 'r2'
    param_grid12 = [{'alpha': [0.0001, 0.0005, 0.001, 0.005, 0.01, 0.05,
                               0.1, 0.5, 1, 10, 100, 1000]}]
    dict_method.update({"Lasso-L1": [me12, cv12, scoring12, param_grid12]})

    """13 SGDR (L1)"""
    me13 = SGDRegressor(alpha=0.0001, average=False, epsilon=0.1, eta0=0.01,
                        fit_intercept=True, l1_ratio=0.15,
                        learning_rate='invscaling', loss='squared_loss',
                        max_iter=1000, penalty='l1', power_t=0.25,
                        random_state=0, shuffle=True, tol=0.01, verbose=0,
                        warm_start=False)
    cv13 = 5
    scoring13 = 'r2'
    param_grid13 = [{'alpha': [100, 10, 1, 0.1, 0.01, 0.001, 0.0001,
                               1e-5, 1e-6, 1e-7],
                     "epsilon": [0.1, 0.2, 1]}]
    dict_method.update({'SGDR-L1': [me13, cv13, scoring13, param_grid13]})

    """14 LinearSVR"""
    me14 = LinearSVR(epsilon=0.0, tol=1e-4, C=1.0,
                     loss='epsilon_insensitive', fit_intercept=True,
                     intercept_scaling=1., dual=True, verbose=0,
                     random_state=3, max_iter=1000)
    cv14 = 5
    scoring14 = 'r2'
    param_grid14 = [{'C': [10, 6, 5, 3, 2.5, 1, 0.75, 0.5, 0.25, 0.1],
                     'epsilon': [0.0, 0.1]}]
    dict_method.update({"LinearSVR-set": [me14, cv14, scoring14, param_grid14]})

    return dict_method
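# A hedged sketch of how the [estimator, cv, scoring, param_grid] entries above
# might be consumed; the toy data is generated here and is not from the
# original project:
from sklearn.datasets import make_regression
from sklearn.model_selection import GridSearchCV

X_toy, y_toy = make_regression(n_samples=150, n_features=5, noise=5, random_state=1)
for name, (estimator, cv, scoring, param_grid) in dict_method_reg().items():
    gs = GridSearchCV(estimator, param_grid=param_grid, cv=cv, scoring=scoring)
    gs.fit(X_toy, y_toy)
    print(name, "best %s: %.3f" % (scoring, gs.best_score_), gs.best_params_)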
from sklearn import neighbors, linear_model
from graphic import census_2011, census_2015, land_area
import numpy as np

model_1 = linear_model.LinearRegression()
model_2 = neighbors.KNeighborsRegressor(n_neighbors=2)

# Model learning: model_1 learns from all the data at once, while the second
# model predicts from the k nearest neighbors, which can give more accurate
# local estimates.
model_1.fit(np.c_[census_2011], np.c_[land_area])
model_2.fit(np.c_[census_2011], np.c_[land_area])

# In the end we can predict the area of any region (from its properties).
print(model_1.predict([[100_000]]))
print(model_2.predict([[100_000]]))
def knn_cv_search(X_train, y_train, list_neighbors=None, cv_parameter=5,
                  scoring_parameter='accuracy', limit_list=(3, 11)):
    '''Search for the best neighbor count for a KNN regressor.
    The best number is the one with the best mean score over all
    cross-validation folds.
    '''
    # ------------------------------------------------------------------------
    # Build a list of odd neighbor counts
    # ------------------------------------------------------------------------
    if list_neighbors is None:
        myList = list(range(limit_list[0], limit_list[1]))
        filtered_neighbors = filter(lambda x: x % 2 != 0, myList)
        list_neighbors = list(filtered_neighbors)

    # ------------------------------------------------------------------------
    # List holding the mean cross-validation (CV) scores
    # ------------------------------------------------------------------------
    list_cv_mean_scores = list()
    min_index = 0

    import time
    t0 = time.time()

    # ------------------------------------------------------------------------
    # Search for the best neighbor count over the folds
    # ------------------------------------------------------------------------
    for neighbor in list_neighbors:
        knn_clf = neighbors.KNeighborsRegressor(n_neighbors=neighbor)
        # knn_clf = KNeighborsClassifier(n_neighbors=neighbor)
        # ----------------------------------------------------------------
        # Get all scores over all cross-validation folds
        # ----------------------------------------------------------------
        scores = cross_val_score(knn_clf, X_train, y_train,
                                 cv=cv_parameter, scoring=scoring_parameter)
        # ----------------------------------------------------------------
        # Get the mean of these scores for the given neighbor count
        # ----------------------------------------------------------------
        list_cv_mean_scores.append(scores.mean())

    print("KNN: elapsed time for searching the best neighbor count = %0.3fs"
          % (time.time() - t0))

    # ------------------------------------------------------------------------
    # Minimal classification error
    # ------------------------------------------------------------------------
    if scoring_parameter == 'accuracy' or scoring_parameter == 'r2':
        # --------------------------------------------------------------------
        # The best score is the one closest to 1, indicating the highest
        # accuracy, so convert scores to (1 - score) before taking the min.
        # --------------------------------------------------------------------
        list_score = [1 - x for x in list_cv_mean_scores]
    else:
        # --------------------------------------------------------------------
        # The best score is the lowest value, indicating the smallest loss.
        # --------------------------------------------------------------------
        list_score = list_cv_mean_scores

    min_index = list_score.index(min(list_score))

    # ------------------------------------------------------------------------
    # Extract the best number of neighbors
    # ------------------------------------------------------------------------
    best_neighbors = list_neighbors[min_index]
    print("Optimal number of neighbors = %d" % best_neighbors)

    return best_neighbors, list_neighbors, list_score
## 4.5 Support vector machine
from sklearn.svm import SVC
model = SVC(C=1.0, kernel='rbf', gamma='auto')
"""Parameters
---
C: penalty parameter of the error term
gamma: kernel coefficient (float). If gamma is 'auto' then 1/n_features
    will be used instead.
"""

## 4.6 k-nearest neighbors (KNN)
from sklearn import neighbors
# Define a kNN model
model = neighbors.KNeighborsClassifier(n_neighbors=5, n_jobs=1)  # classification
model = neighbors.KNeighborsRegressor(n_neighbors=5, n_jobs=1)   # regression
"""Parameters
---
n_neighbors: number of neighbors to use
n_jobs: number of parallel jobs
"""

## 4.7 Multi-layer perceptron
from sklearn.neural_network import MLPClassifier
# Define a multi-layer perceptron classifier
model = MLPClassifier(activation='relu', solver='adam', alpha=0.0001)
"""Parameters
---
hidden_layer_sizes: tuple
activation: activation function
solver: optimizer, one of {'lbfgs', 'sgd', 'adam'}
"""
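# To make section 4.6 concrete, a minimal fit/predict round-trip for the KNN
# regressor (the toy arrays are invented for illustration):
from sklearn import neighbors

X = [[0], [1], [2], [3]]
y = [0.0, 0.5, 1.0, 1.5]

model = neighbors.KNeighborsRegressor(n_neighbors=2)
model.fit(X, y)
print(model.predict([[1.5]]))  # mean of the two nearest targets: 0.75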
with open('x2_data.json') as f:
    x2_data = json.load(f)
with open('x3_data.json') as f:
    x3_data = json.load(f)

x1_norm = [(i - min(x1_data)) / (max(x1_data) - min(x1_data)) for i in x1_data]
x2_norm = [(i - min(x2_data)) / (max(x2_data) - min(x2_data)) for i in x2_data]
x3_norm = [(i - min(x3_data)) / (max(x3_data) - min(x3_data)) for i in x3_data]

# create training data
x_train = []
for i in range(len(x1_norm)):
    x_train.append([x1_norm[i], x2_norm[i], x3_norm[i]])

with open('y_data_k80.json') as f:
    y_train_K80 = json.load(f)
model_K80 = neighbors.KNeighborsRegressor(n_neighbors=3, weights='distance')
model_K80.fit(x_train, y_train_K80)

with open('y_data_p100.json') as f:
    y_train_P100 = json.load(f)
model_P100 = neighbors.KNeighborsRegressor(n_neighbors=3, weights='distance')
model_P100.fit(x_train, y_train_P100)

####################################################################

def send_signal(node, cmd):
    # Create a TCP/IP socket
    sock = socket.socket(socket.AF_INET, socket.SOCK_STREAM)
    port = 10000
    # Connect the socket to the port where the server is listening
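# Design note: the list comprehensions above recompute min()/max() on every
# element. An equivalent vectorized form (a sketch using only numpy) computes
# them once per feature:
import numpy as np

def min_max_scale(values):
    """Scale a 1-D sequence to [0, 1], computing min/max a single time."""
    arr = np.asarray(values, dtype=float)
    lo, hi = arr.min(), arr.max()
    return (arr - lo) / (hi - lo)

# e.g. x1_norm = min_max_scale(x1_data)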
features = ['Feature 1', 'Feature 2', 'Feature 3', 'Feature 4',
            'Feature 5 (meaningless but please still use it)',
            'Feature 6', 'Feature 7', 'Feature 8', 'Feature 9', 'Feature 10']

le = preprocessing.LabelEncoder()
df = pd.read_csv("/Users/markloughman/Desktop/Machine Learning/DATA/TheSumDataSetWithNoise",
                 sep=";", nrows=10000)
catnum = df["Noisy Target Class"].tolist()
X = df.loc[:, features]
y = df["Noisy Target"]

n_neighbors = 5
for i, weights in enumerate(['uniform', 'distance']):
    lr = neighbors.KNeighborsRegressor(n_neighbors, weights=weights)
    # Needed to avoid members of the target class being fewer than the number of folds
    NMSE_results = cross_val_score(lr, X, y, cv=10,
                                   scoring="neg_mean_squared_error")
    # Choose another regression metric
    NMSE_results = NMSE_results * -1
    RMS_results = np.sqrt(NMSE_results)
    mean_error = RMS_results.mean()

    abs_mean_error = cross_val_score(lr, X, y, cv=10,
                                     scoring="neg_mean_absolute_error")
    abs_mean_error = abs_mean_error * -1
    abs_mean_error = abs_mean_error.mean()
print "Out-of-sample variance: %0.3f" % numpy.var(out_sample_errors) print "Out-of-sample mean: %0.3f" % numpy.mean(out_sample_errors) return (numpy.mean(out_sample_errors) + numpy.mean(in_sample_errors)) / 2 if __name__ == '__main__': dataset = utils.dict_to_numpy( utils.read_data_from_csv('data/winequality-red.csv'), columns_to_exclude = ['fixed acidity', 'chlorides', 'free sulfur dioxide']) data = dataset['data'] target = dataset['target'] attributes = dataset['attributes'] X_train = data[:-100] X_test = data[-100:] Y_train = target[:-100] Y_test = target[-100:] print 'Linear regression' regression_model = linear_model.LinearRegression() regression(regression_model, X_train, X_test, Y_train, Y_test, 'linear') print for i in range(1, 9): print 'kNn regression for %s neighbors' % i regression_model = neighbors.KNeighborsRegressor(i) print 'Avg error %0.4f' % regression(regression_model, X_train, X_test, Y_train, Y_test, 'knn_%s' % i) print
# plt.show()

model = MLPRegressor()
predict_y = model.fit(train_X, train_gpa_y).predict(test_X)
mse = mean_squared_error(test_gpa_y, predict_y)
print("MLP: %f" % mse)
plt.scatter(test_gpa_y, predict_y)
plt.show()

model = tree.DecisionTreeRegressor()
predict_y = model.fit(train_X, train_gpa_y).predict(test_X)
mse = mean_squared_error(test_gpa_y, predict_y)
print("Decision tree: %f" % mse)

model = neighbors.KNeighborsRegressor()
predict_y = model.fit(train_X, train_gpa_y).predict(test_X)
mse = mean_squared_error(test_gpa_y, predict_y)
print("KNN: %f" % mse)

model = ensemble.RandomForestRegressor(n_estimators=20, random_state=1)
predict_y = model.fit(train_X, train_gpa_y).predict(test_X)
mse = mean_squared_error(test_gpa_y, predict_y)
print("Random forest: %f" % mse)

model = ensemble.GradientBoostingRegressor(n_estimators=100, random_state=1)
predict_y = model.fit(train_X, train_gpa_y).predict(test_X)
mse = mean_squared_error(test_gpa_y, predict_y)
print("GBRT: %f" % mse)

model = ensemble.BaggingRegressor(random_state=1)
#%% 7. KNN on reputation.
## KNN model to predict reputation.
## Variables: skill scores
xfifa = fifa.iloc[:, 53:87]
yfifa = fifa['International_Reputation']
xsfifa = pd.DataFrame(scale(xfifa), columns=xfifa.columns)
ysfifa = yfifa.copy()
X_train, X_test, y_train, y_test = train_test_split(xsfifa, ysfifa,
                                                    test_size=0.25,
                                                    random_state=2019)

#%%
rmse_val = []
for K in range(1, 16):
    model = neighbors.KNeighborsRegressor(n_neighbors=K)
    model.fit(X_train, y_train)
    pred = model.predict(X_test)
    error = sqrt(mean_squared_error(y_test, pred))
    rmse_val.append(error)
    print('RMSE value for k =', K, 'is:', error)

#%%
curve = pd.DataFrame(rmse_val)
curve.plot()

#%%
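#%%
# Reading the best k off rmse_val instead of eyeballing the elbow curve:
# a small follow-up sketch that assumes the rmse_val list built above.
import numpy as np

best_k = int(np.argmin(rmse_val)) + 1  # +1 because k started at 1
print('best k by validation RMSE:', best_k, '; RMSE:', rmse_val[best_k - 1])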
def dict_method_reg():
    dict_method = {}
    # NOTE: `ker` (a list of candidate kernels) and `kernel` are assumed to be
    # defined at module level; they are not part of this snippet.

    # 1st part
    """4 KNR"""
    me4 = neighbors.KNeighborsRegressor(n_neighbors=5, weights='distance',
                                        algorithm='auto', leaf_size=30, p=2,
                                        metric='minkowski')
    cv4 = 5
    scoring4 = 'r2'
    param_grid4 = [{'n_neighbors': [4, 5, 6, 7, 8], "leaf_size": [10, 20, 30]}]
    dict_method.update({"KNR-set": [me4, cv4, scoring4, param_grid4]})

    """1 SVR"""
    me1 = SVR(kernel='rbf', gamma='auto', degree=3, tol=1e-3, epsilon=0.1,
              shrinking=True, max_iter=2000)
    cv1 = 5
    scoring1 = 'r2'
    param_grid1 = [{'C': [10, 1, 0.1, 0.01, 0.001], 'kernel': ker}]
    dict_method.update({"SVR-set": [me1, cv1, scoring1, param_grid1]})

    """5 KernelRidge"""
    me5 = kernel_ridge.KernelRidge(alpha=1, gamma="scale", degree=3, coef0=-1,
                                   kernel_params=None)
    cv5 = 5
    scoring5 = 'r2'
    param_grid5 = [{'alpha': [10, 1, 0.1, 0.001], 'kernel': ker}]
    dict_method.update({'KRR-set': [me5, cv5, scoring5, param_grid5]})

    """6 GPR"""
    me6 = gaussian_process.GaussianProcessRegressor(
        kernel=kernel, alpha=1e-10, optimizer='fmin_l_bfgs_b',
        n_restarts_optimizer=0, normalize_y=False, copy_X_train=True,
        random_state=0)
    cv6 = 5
    scoring6 = 'r2'
    param_grid6 = [{'kernel': ker}]
    dict_method.update({"GPR-set": [me6, cv6, scoring6, param_grid6]})

    # 2nd part
    """7 RFR"""
    me7 = RandomForestRegressor(
        n_estimators=100, max_depth=None, min_samples_split=2,
        min_samples_leaf=1, min_weight_fraction_leaf=0.0, max_leaf_nodes=None,
        min_impurity_decrease=0.0, min_impurity_split=None, bootstrap=True,
        oob_score=False, random_state=None, verbose=0, warm_start=False)
    cv7 = 5
    scoring7 = 'r2'
    param_grid7 = [{'max_depth': [3, 4, 5]}]
    dict_method.update({"RFR-em": [me7, cv7, scoring7, param_grid7]})

    """8 GBR"""
    me8 = GradientBoostingRegressor(
        loss='ls', learning_rate=0.1, n_estimators=100, subsample=1.0,
        criterion='friedman_mse', min_samples_split=2, min_samples_leaf=1,
        min_weight_fraction_leaf=0., max_depth=3, min_impurity_decrease=0.,
        min_impurity_split=None, init=None, random_state=None,
        max_features=None, alpha=0.9, verbose=0, max_leaf_nodes=None,
        warm_start=False)
    cv8 = 5
    scoring8 = 'r2'
    param_grid8 = [{'max_depth': [3, 4, 5, 6]}]
    dict_method.update({'GBR-em': [me8, cv8, scoring8, param_grid8]})

    """9 AdaBR"""
    dt = DecisionTreeRegressor(criterion="mse", splitter="best",
                               max_features=None, max_depth=5,
                               min_samples_split=4)
    me9 = AdaBoostRegressor(dt, n_estimators=200, learning_rate=0.05,
                            loss='linear', random_state=0)
    cv9 = 5
    scoring9 = 'explained_variance'
    param_grid9 = [{'n_estimators': [100, 200]}]
    dict_method.update({"AdaBR-em": [me9, cv9, scoring9, param_grid9]})

    """10 DTR"""
    me10 = DecisionTreeRegressor(
        criterion="mse", splitter="best", max_depth=None, min_samples_split=2,
        min_samples_leaf=1, min_weight_fraction_leaf=0., max_features=None,
        random_state=0, max_leaf_nodes=None, min_impurity_decrease=0.,
        min_impurity_split=None)
    cv10 = 5
    scoring10 = 'r2'
    param_grid10 = [{'max_depth': [2, 3, 4, 5, 6, 7],
                     "min_samples_split": [2, 3, 4],
                     "min_samples_leaf": [1, 2]}]
    dict_method.update({'DTR-em': [me10, cv10, scoring10, param_grid10]})

    """11 ElasticNet"""
    me11 = ElasticNet(alpha=1.0, l1_ratio=0.7, fit_intercept=True,
                      normalize=False, precompute=False, max_iter=1000,
                      copy_X=True, tol=0.0001, warm_start=False,
                      positive=False, random_state=None)
    cv11 = 5
    scoring11 = 'r2'
    param_grid11 = [{'alpha': [0.0001, 0.001, 0.01, 0.1, 1, 10],
                     'l1_ratio': [0.3, 0.5, 0.8]}]
    dict_method.update({"EN-L1": [me11, cv11, scoring11, param_grid11]})

    """12 Lasso"""
    me12 = Lasso(alpha=1.0, fit_intercept=True, normalize=False,
                 precompute=False, copy_X=True, max_iter=1000, tol=0.001,
                 warm_start=False, positive=False, random_state=None)
    cv12 = 5
    scoring12 = 'r2'
    param_grid12 = [{'alpha': [0.0001, 0.0005, 0.001, 0.005, 0.01, 0.05,
                               0.1, 0.5, 1, 10, 100, 1000]}]
    dict_method.update({"LASSO-L1": [me12, cv12, scoring12, param_grid12]})

    """2 BayesianRidge"""
    me2 = BayesianRidge(alpha_1=1e-06, alpha_2=1e-06, compute_score=False,
                        copy_X=True, fit_intercept=True, lambda_1=1e-06,
                        lambda_2=1e-06, n_iter=300, normalize=False,
                        tol=0.01, verbose=False)
    cv2 = 5
    scoring2 = 'r2'
    param_grid2 = [{'alpha_1': [1e-07, 1e-06, 1e-05],
                    'alpha_2': [1e-07, 1e-06, 1e-05]}]
    dict_method.update({'BRR-L1': [me2, cv2, scoring2, param_grid2]})

    """3 SGDR (L2)"""
    me3 = SGDRegressor(alpha=0.0001, average=False, epsilon=0.1, eta0=0.01,
                       fit_intercept=True, l1_ratio=0.15,
                       learning_rate='invscaling', loss='squared_loss',
                       max_iter=1000, penalty='l2', power_t=0.25,
                       random_state=0, shuffle=True, tol=0.01, verbose=0,
                       warm_start=False)
    cv3 = 5
    scoring3 = 'r2'
    param_grid3 = [{'alpha': [100, 10, 1, 0.1, 0.01, 0.001, 0.0001, 1e-05],
                    'loss': ['squared_loss', "huber"],
                    "penalty": ["l1", "l2"]}]
    dict_method.update({'SGDR-L1': [me3, cv3, scoring3, param_grid3]})

    """14 PassiveAggressiveRegressor"""
    me14 = PassiveAggressiveRegressor(
        C=1.0, fit_intercept=True, max_iter=1000, tol=0.001,
        early_stopping=False, validation_fraction=0.1, n_iter_no_change=5,
        shuffle=True, verbose=0, loss='epsilon_insensitive', epsilon=0.1,
        random_state=None, warm_start=False, average=False)
    cv14 = 5
    scoring14 = 'r2'
    param_grid14 = [{'C': [1.0e8, 1.0e6, 10000, 100, 50, 10, 5, 2.5,
                           1, 0.5, 0.1, 0.01]}]
    dict_method.update({'PAR-L1': [me14, cv14, scoring14, param_grid14]})

    return dict_method
import numpy as np
import pandas as pd
from sklearn.ensemble import RandomForestRegressor as rf
from sklearn.preprocessing import MinMaxScaler
from sklearn import neighbors

if __name__ == '__main__':
    # Load data
    df = pd.read_csv('dataset oversampling.csv')
    X = df.drop(['Mn', 'MWD'], axis=1)
    Y1 = df['Mn']
    Y2 = df['MWD']

    # Train the dataset with optimized models
    knn = neighbors.KNeighborsRegressor(n_neighbors=1, weights='uniform')
    rfr = rf(max_depth=6, max_features='sqrt', min_samples_split=2,
             n_estimators=200)
    min_max_scaler = MinMaxScaler()
    X_nor = min_max_scaler.fit_transform(X)
    knn.fit(X_nor, Y1)
    rfr.fit(X, Y2)

    # Generate the combinatorial condition pool
    conditions = [[M, M_CTA_1 * M_CTA_2, PC_M_1 * PC_M_2,
                   1, 0, 0, 0, 0, 0, 0, time]
                  for M in np.arange(0.1, 8, step=0.1)
                  for M_CTA_1 in np.arange(1, 9, step=1)
pc_job = []
K80_node = 'c2180'
V100_node = 'd1024'
host_node = 'c0168'
testcase = args.tc  # also, change the .h5 file folder in jobs
INTERVAL = 30  # make a decision every 30 s

######################### do a regression fit ########################

with open('x_data.json') as f:
    x_train = json.load(f)
with open('y_data.json') as f:
    y_train = json.load(f)

model = neighbors.KNeighborsRegressor(n_neighbors=1, weights='distance')
model.fit(x_train, y_train)

####################################################################

def send_signal(node, cmd):
    # Create a TCP/IP socket
    sock = socket.socket(socket.AF_INET, socket.SOCK_STREAM)
    port = 10000 if node == K80_node else 10001
    # Connect the socket to the port where the server is listening
    server_address = (node, int(port))
    print('connecting to {} port {}'.format(*server_address))
    sock.connect(server_address)
def plot():
    map_path = file_path + "/resources/sf_block_groups/sf_block_groups_nowater.geojson"
    coc_path = file_path + "/resources/sf_block_groups/coc"
    plot_path = file_path + "/resources/sf_data/sf_overspace_plot_data.json"
    fig_path = file_path + "/results/sf_change_overspace.pdf"

    # Read data.
    with open(plot_path, "r") as plot_file:
        data = json.loads(plot_file.read().strip("\n"))
    coc = gpd.read_file(coc_path)
    coc = coc[coc["GEOID"].astype("int") - coc["GEOID"].astype("int") % 1000000 == 6075000000]
    coc = coc[coc["GEOID"].astype("int") != 6075017902]
    coc = coc[coc["COCFLAG__1"] == 1]
    coc = coc.to_crs({"init": "epsg:4326"})

    map = gpd.read_file(map_path)
    map["geoid"] = map["stfid"].astype("int")
    map = map[["geoid", "geometry"]]
    map["bg_lng"] = map.centroid.apply(lambda p: p.x)
    map["bg_lat"] = map.centroid.apply(lambda p: p.y)
    map = map[map["geoid"] != 60750179021]

    # Get supply curve data
    sup = pd.DataFrame.from_dict(data["sup"])
    sup["geoid"] = data["index"]
    for geoid in (60750601001, 60750604001, 60750332011,
                  60750610002, 60750264022, 60750258002):
        sup = sup[sup["geoid"] != geoid]
    sup[sup["geoid"] == 60750610001] = 1
    sup = map.merge(sup, on="geoid", how="left")

    # Get price curve data
    pri = pd.DataFrame.from_dict(data["pri"])
    pri["geoid"] = data["index"]
    pri = map.merge(pri, on="geoid", how="left")

    # Plot parameters and settings.
    font = FontProperties()
    font.set_weight("bold")
    font.set_size(10)
    matplotlib.rcParams.update({"font.size": 6})
    alpha = 0.5
    alpha2 = 0.3
    k = 2
    bar_cons = 0.66
    bar_mv = 0.27

    # `ax` (a row of five axes) is assumed to be created elsewhere,
    # e.g. fig, ax = plt.subplots(1, 5).
    for i in [0, 1, 2, 3, 4]:
        ax[i].set_xlim([-122.513, -122.355])
        ax[i].set_ylim([37.707, 37.833])
        ax[i].set_axis_off()
        ax[i].xaxis.set_major_locator(plt.NullLocator())
        ax[i].yaxis.set_major_locator(plt.NullLocator())
        coc.plot(ax=ax[i], linewidth=0.5, alpha=0)

    app_list = ["uber", "lyft", "taxi"]
    cmap = "RdYlGn"
    for i in [0, 1, 2]:
        sup["plot"] = sup[app_list[i]]  # / sup["area"] * 581
        # Fill empty areas by KNN interpolation over block-group centroids.
        knn = neighbors.KNeighborsRegressor(k, weights="distance")
        train_x = sup[["plot", "bg_lat", "bg_lng"]].dropna()[["bg_lat", "bg_lng"]].values
        train_y = sup["plot"].dropna().values
        predict_x = sup[["bg_lat", "bg_lng"]].values
        sup["plot"] = knn.fit(train_x, train_y).predict(predict_x)
        vmin = sup["plot"].min()
        vmax = sup["plot"].quantile(0.95)
        # Plot
        sup.plot(ax=ax[i], linewidth=0, column="plot", cmap=cmap, alpha=alpha,
                 k=10, vmin=vmin, vmax=vmax)
        ax[i].set_title(upperfirst(app_list[i]) + " Supply", fontproperties=font)
        fig = ax[i].get_figure()
        cax = fig.add_axes([0.128 + 0.16 * i, 0.07, 0.12, 0.02])
        sm = plt.cm.ScalarMappable(cmap=cmap,
                                   norm=plt.Normalize(vmin=vmin, vmax=vmax))
        sm._A = []
        fig.colorbar(sm, cax=cax, alpha=alpha2, extend="both",
                     orientation="horizontal")

    cmap = "RdYlGn_r"
    for i in [3, 4]:
        pri["plot"] = (pri[app_list[i - 3]] - 1) * 100
        # Fill empty areas by KNN interpolation over block-group centroids.
        knn = neighbors.KNeighborsRegressor(k, weights="distance")
        train_x = pri[["plot", "bg_lat", "bg_lng"]].dropna()[["bg_lat", "bg_lng"]].values
        train_y = pri["plot"].dropna().values
        predict_x = pri[["bg_lat", "bg_lng"]].values
        pri["plot"] = knn.fit(train_x, train_y).predict(predict_x)
        vmin = 0
        vmax = 12
        # Plot
        pri.plot(ax=ax[i], linewidth=0, column="plot", cmap=cmap, alpha=alpha,
                 k=10, vmin=vmin, vmax=vmax)
        ax[i].set_title(upperfirst(app_list[i - 3]) + " Price", fontproperties=font)
        fig = ax[i].get_figure()
        cax = fig.add_axes([0.128 + 0.16 * i, 0.07, 0.12, 0.02])
        sm = plt.cm.ScalarMappable(cmap=cmap,
                                   norm=plt.Normalize(vmin=vmin, vmax=vmax))
        sm._A = []
        fig.colorbar(sm, cax=cax, alpha=alpha2, extend="both",
                     orientation="horizontal")

    map_path = file_path + "/resources/nyc_block_groups/nyc_bg_with_data_acs15.geojson"
    plot_path = file_path + "/resources/nyc_data/nyc_overspace_plot_data.json"
    fig_path = file_path + "/results/nyc_change_overspace.pdf"

    # Read data.
    with open(plot_path, "r") as plot_file:
        data = json.loads(plot_file.read().strip("\n"))
    map = gpd.read_file(map_path)
    coc = map.sort_values("income")[:80]
    map = map[map["population"].astype("float") > 10.0]
    map["geoid"] = map["geo_id"].astype("int")
    map = map[["geoid", "geometry"]]
    map["bg_lng"] = map.centroid.apply(lambda p: p.x)
    map["bg_lat"] = map.centroid.apply(lambda p: p.y)

    # Get supply curve data
    sup = pd.DataFrame.from_dict(data["sup"])
    sup["geoid"] = data["index"]
    sup = map.merge(sup, on="geoid", how="left")

    # Get price curve data
    pri = pd.DataFrame.from_dict(data["pri"])
    pri["geoid"] = data["index"]
    pri = pri[pri["uber"] > 1.0]
    pri = pri[pri["lyft"] > 1.0]
    pri = map.merge(pri, on="geoid", how="left")
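# The fill-in step above is a general trick: treat each region's missing value
# as a regression target over centroid coordinates. A stripped-down,
# self-contained sketch of the same idea (all data here is synthetic):
import numpy as np
from sklearn import neighbors

rng = np.random.default_rng(0)
lat = rng.uniform(37.7, 37.8, 50)
lng = rng.uniform(-122.5, -122.4, 50)
values = np.where(rng.random(50) < 0.8, lat * 10 + lng, np.nan)  # ~20% missing

known = ~np.isnan(values)
knn = neighbors.KNeighborsRegressor(n_neighbors=2, weights="distance")
knn.fit(np.c_[lat[known], lng[known]], values[known])
filled = values.copy()
filled[~known] = knn.predict(np.c_[lat[~known], lng[~known]])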
'''Nearest-neighbor regression

The KNN algorithm can be used not only for classification but also for
regression: find a sample's k nearest neighbors and assign the average of
those neighbors' attribute(s) to the sample to estimate its value.
'''
import numpy as np
import matplotlib.pyplot as plt
from sklearn import neighbors

np.random.seed(0)
X = np.sort(5 * np.random.rand(40, 1), axis=0)
T = np.linspace(0, 5, 500)[:, np.newaxis]
y = np.sin(X).ravel()
y[::5] += 1 * (0.5 - np.random.rand(8))  # add noise to every 5th target

n_neighbors = 5
for i, weights in enumerate(['uniform', 'distance']):
    knn = neighbors.KNeighborsRegressor(n_neighbors, weights=weights)
    y_ = knn.fit(X, y).predict(T)
    plt.subplot(2, 1, i + 1)
    plt.scatter(X, y, color='darkorange', label='data')
    plt.plot(T, y_, color='navy', label='prediction')
    plt.axis('tight')
    plt.legend()
    plt.title("KNeighborsRegressor (k = %i, weights = '%s')"
              % (n_neighbors, weights))
plt.tight_layout()
plt.show()

'''
Neighborhood Components Analysis (NCA) aims to improve the accuracy of
nearest-neighbor classification compared with the standard Euclidean
distance. The algorithm directly maximizes a stochastic variant of the
k-nearest-neighbor (KNN) score on the training set, and it can also learn a
low-dimensional linear projection of the data. It handles multi-class
problems naturally without growing the model size, and it introduces no
extra parameters that the user needs to tune.
'''
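# The note above is descriptive only; a minimal sketch of NCA feeding a KNN
# classifier (requires scikit-learn >= 0.21; the iris data is just a stand-in):
from sklearn.datasets import load_iris
from sklearn.model_selection import train_test_split
from sklearn.neighbors import KNeighborsClassifier, NeighborhoodComponentsAnalysis
from sklearn.pipeline import Pipeline

Xi, yi = load_iris(return_X_y=True)
Xi_train, Xi_test, yi_train, yi_test = train_test_split(Xi, yi, random_state=42)

# Learn a 2-D linear projection that maximizes the stochastic KNN score,
# then classify in the projected space.
nca_knn = Pipeline([
    ("nca", NeighborhoodComponentsAnalysis(n_components=2, random_state=42)),
    ("knn", KNeighborsClassifier(n_neighbors=3)),
])
nca_knn.fit(Xi_train, yi_train)
print("test accuracy: %.3f" % nca_knn.score(Xi_test, yi_test))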
def build_surrogate(self):
    """ Build a surrogate. Multiple options for models are available including:
        -Gaussian Processes
        -KNN
        -SVR

        Assumptions:
        None

        Source:
        N/A

        Inputs:
        state [state()]

        Outputs:
        self.sfc_surrogate    [fun()]
        self.thrust_surrogate [fun()]

        Properties Used:
        Defaulted values
    """
    # unpack
    pycycle_problem = self.model
    pycycle_problem.set_solver_print(level=-1)
    pycycle_problem.set_solver_print(level=2, depth=0)

    # Extract the data: create lists that will turn into arrays
    Altitudes = []
    Machs = []
    PCs = []
    Thrust = []
    TSFC = []

    # if we added fc.dTS this would handle the deltaISA
    throttles = self.evaluation_throttles * 1.

    for MN, alt in self.evaluation_mach_alt:
        print('***' * 10)
        print(f'* MN: {MN}, alt: {alt}')
        print('***' * 10)
        pycycle_problem['OD_full_pwr.fc.MN'] = MN
        pycycle_problem['OD_full_pwr.fc.alt'] = alt
        pycycle_problem['OD_part_pwr.fc.MN'] = MN
        pycycle_problem['OD_part_pwr.fc.alt'] = alt
        for PC in throttles:
            print(f'## PC = {PC}')
            pycycle_problem['OD_part_pwr.PC'] = PC
            pycycle_problem.run_model()
            # Save to our list for SUAVE
            Altitudes.append(alt)
            Machs.append(MN)
            PCs.append(PC)
            TSFC.append(pycycle_problem['OD_part_pwr.perf.TSFC'][0])
            Thrust.append(pycycle_problem['OD_part_pwr.perf.Fn'][0])
        throttles = np.flip(throttles)

    # Now set up into vectors
    Altitudes = np.atleast_2d(np.array(Altitudes)).T * Units.feet
    Mach = np.atleast_2d(np.array(Machs)).T
    Throttle = np.atleast_2d(np.array(PCs)).T
    thr = np.atleast_2d(np.array(Thrust)).T * Units.lbf
    sfc = np.atleast_2d(np.array(TSFC)).T * Units['lbm/hr/lbf']  # lbm/hr/lbf converted to kg/N/s

    # Once we have the data, the model must be deleted because pycycle
    # models can't be deepcopied.
    self.pop('model')

    # Concatenate all together; this starts to look like the propulsor surrogate
    my_data = np.concatenate([Altitudes, Mach, Throttle, thr, sfc], axis=1)

    if self.save_deck:
        # Write an engine deck
        np.savetxt("pyCycle_deck.csv", my_data, delimiter=",")
        print(my_data)

    # Clean up to remove redundant lines
    b = np.ascontiguousarray(my_data).view(
        np.dtype((np.void, my_data.dtype.itemsize * my_data.shape[1])))
    _, idx = np.unique(b, return_index=True)
    my_data = my_data[idx]

    xy = my_data[:, :3]                               # Altitude, Mach, Throttle
    thr = np.transpose(np.atleast_2d(my_data[:, 3]))  # Thrust
    sfc = np.transpose(np.atleast_2d(my_data[:, 4]))  # SFC

    self.altitude_input_scale = np.max(xy[:, 0])
    self.thrust_input_scale = np.max(thr)
    self.sfc_input_scale = np.max(sfc)

    # normalize for better surrogate performance
    xy[:, 0] /= self.altitude_input_scale
    thr /= self.thrust_input_scale
    sfc /= self.sfc_input_scale

    # Pick the type of process
    if self.surrogate_type == 'gaussian':
        gp_kernel = Matern()
        regr_sfc = gaussian_process.GaussianProcessRegressor(kernel=gp_kernel)
        regr_thr = gaussian_process.GaussianProcessRegressor(kernel=gp_kernel)
        thr_surrogate = regr_thr.fit(xy, thr)
        sfc_surrogate = regr_sfc.fit(xy, sfc)
    elif self.surrogate_type == 'knn':
        regr_sfc = neighbors.KNeighborsRegressor(n_neighbors=1, weights='distance')
        regr_thr = neighbors.KNeighborsRegressor(n_neighbors=1, weights='distance')
        sfc_surrogate = regr_sfc.fit(xy, sfc)
        thr_surrogate = regr_thr.fit(xy, thr)
    elif self.surrogate_type == 'svr':
        regr_thr = svm.SVR(C=500.)
        regr_sfc = svm.SVR(C=500.)
        sfc_surrogate = regr_sfc.fit(xy, sfc)
        thr_surrogate = regr_thr.fit(xy, thr)
    elif self.surrogate_type == 'linear':
        regr_thr = linear_model.LinearRegression()
        regr_sfc = linear_model.LinearRegression()
        sfc_surrogate = regr_sfc.fit(xy, sfc)
        thr_surrogate = regr_thr.fit(xy, thr)
    else:
        raise NotImplementedError('Selected surrogate method has not been implemented')

    if self.thrust_anchor is not None:
        cons = deepcopy(self.thrust_anchor_conditions)
        cons[0, 0] /= self.altitude_input_scale
        base_thrust_at_anchor = thr_surrogate.predict(cons)
        self.thrust_anchor_scale = self.thrust_anchor / \
            (base_thrust_at_anchor * self.thrust_input_scale)

    if self.sfc_anchor is not None:
        cons = deepcopy(self.sfc_anchor_conditions)
        cons[0, 0] /= self.altitude_input_scale
        base_sfc_at_anchor = sfc_surrogate.predict(cons)
        self.sfc_anchor_scale = self.sfc_anchor / \
            (base_sfc_at_anchor * self.sfc_input_scale)

    # Save the output
    self.sfc_surrogate = sfc_surrogate
    self.thrust_surrogate = thr_surrogate