def _check_coords_for_distance_weighting(self, coords, check_radius,
                                         check_weights, X, y_mean):
    """ Checks that coords won't break the distance weighting function """
    valid_inds = []
    for coord in range(len(coords)):
        temp = RadiusNeighborsRegressor(radius=check_radius,
                                        weights=check_weights)
        temp.fit(X, y_mean)
        try:
            temp.predict([coords[coord]])
            valid_inds.append(coord)
        except ZeroDivisionError:
            continue
    return valid_inds
def get_author_list_with_pruning_method(feature_list, author_list, qp, radius):
    """
    feature_list - the feature list holding the stylometric features
    author_list - the author list indicating which author wrote each paragraph
    qp - the query point, usually representing a document
    radius - the pruning radius around the query point

    This function returns the neighbors (distances and indices) within the given
    radius of the query point, from which a shortened author list can be built.
    This can greatly reduce the size of the training set by removing data points
    too far from the query point. Since calculating the Hausdorff distance is
    expensive, reducing the size of the training set speeds up the process.

    Please refer to the following link for more information:
    http://scikit-learn.org/stable/modules/generated/sklearn.neighbors.RadiusNeighborsRegressor.html#sklearn.neighbors.RadiusNeighborsRegressor
    """
    neigh = RadiusNeighborsRegressor(radius=radius, algorithm='brute', p=2)
    neigh.fit(feature_list, author_list)
    return neigh.radius_neighbors(qp, return_distance=True)
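# A minimal usage sketch (not from the original source) showing what the
# function above returns: radius_neighbors yields per-query arrays of
# distances and training-set indices, which can then be used to build the
# shortened author list. The toy feature_list, author_list and query point
# below are illustrative assumptions.
import numpy as np

feature_list = np.array([[0.10, 0.20], [0.15, 0.25], [5.00, 5.00]])
author_list = np.array([0.0, 0.0, 1.0])
qp = np.array([[0.12, 0.22]])  # a single query document

distances, indices = get_author_list_with_pruning_method(
    feature_list, author_list, qp, radius=1.0)
pruned_author_list = author_list[indices[0]]  # authors of paragraphs within the radius
print(distances[0], indices[0], pruned_author_list)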
def predict(self):
    """
    Trains the scikit-learn RadiusNeighborsRegressor (https://scikit-learn.org),
    passes the trained model the test feature set to obtain predicted y_test
    values, compares those predictions with the y_test values passed in, and
    returns the accuracy.
    """
    algorithm = RadiusNeighborsRegressor(radius=get_ohe_config().rnr_radius)
    algorithm.fit(self.X_train, self.y_train)
    y_pred = list(algorithm.predict(self.X_test))
    self.acc = OneHotPredictor.get_accuracy(y_pred, self.y_test)
    return self.acc
def compare_multiple_stacks(folder):
    subfolders = os.listdir(folder)
    all_data = []
    for subfolder in tqdm.tqdm(subfolders):
        all_data.append(load_images(os.path.join(folder, subfolder)))
    all_data = np.array(all_data)
    print(all_data.shape)
    for channel in range(3):
        for subfolder_index in range(all_data.shape[0]):
            channel_stack = all_data[subfolder_index][:, :, :, channel]
            img_mean = np.mean(channel_stack, axis=0)
            img_sigma_clip = np.mean(astropy.stats.sigma_clip(channel_stack, sigma=2, axis=0), axis=0)
            img_sigma_ratio = (img_mean / img_sigma_clip - 1) * 1E3
            skip = 1
            flat_ratios = img_sigma_ratio.flatten()[::skip]
            mean_values = img_mean.flatten()[::skip]
            # plt.scatter(mean_values, flat_ratios, alpha=0.1, color='black', s=1)
            rnr = RadiusNeighborsRegressor(radius=50, weights='uniform')
            rnr.fit(np.expand_dims(mean_values, axis=1), flat_ratios.flatten())
            x = np.arange(np.min(mean_values) + 200, np.max(mean_values) + 1 - 200, 10)
            line_y = rnr.predict(np.expand_dims(x, axis=1))
            plt.plot(x, line_y, label=str(subfolder_index))
        plt.legend()
        plt.grid(True)
        plt.show()
def compare_error_vs_brightness(folder):
    data = load_images(folder)
    for channel in range(data.shape[3]):
        channel_stack = data[:, :, :, channel]
        img_mean = np.mean(channel_stack, axis=0)
        img_sigma_clip = np.mean(astropy.stats.sigma_clip(channel_stack, sigma=2, axis=0), axis=0)
        img_sigma_ratio = (img_mean / img_sigma_clip - 1) * 1E3
        x = np.arange(np.min(img_mean), np.max(img_mean) + 1)
        bit_flip_change = 128 if channel == 1 else 256
        y_top = ((channel_stack.shape[0] * x) / (channel_stack.shape[0] * x - bit_flip_change) - 1) * 1E3
        y_bottom = ((channel_stack.shape[0] * x) / (channel_stack.shape[0] * x + bit_flip_change) - 1) * 1E3
        plt.plot(x, y_top, 'r')
        plt.plot(x, y_bottom, 'r')
        plt.scatter(img_mean.flatten(), img_sigma_ratio.flatten(), alpha=0.1, color='black', s=1)
        rnr = RadiusNeighborsRegressor(radius=50, weights='distance')
        rnr.fit(np.expand_dims(img_mean.flatten(), axis=1), img_sigma_ratio.flatten())
        x = np.arange(np.min(img_mean), np.max(img_mean) + 1)
        line_y = rnr.predict(np.expand_dims(x, axis=1))
        plt.plot(x, line_y, 'g')
        plt.grid(True)
        plt.show()
def grid_points_2d(mesh, cell_size=10):
    grid = vtk_Voxel.from_mesh(mesh, cell_size, 2)
    cells = grid.cell_centers().points
    radius = cell_size * 0.5
    tmat = np.full(cells.shape[0], np.nan)
    print("sample min", np.min(mesh.points[:, 2]), "max", np.max(mesh.points[:, 2]))
    while np.any(np.isnan(tmat)):
        # keep increasing the radius until all cells have values
        radius *= 1.5
        print("RadiusNeighborsRegressor =", radius, "m")
        neigh = RadiusNeighborsRegressor(radius=radius, weights='distance')
        neigh.fit(mesh.points[:, :2], mesh.points[:, 2])
        rmat = neigh.predict(cells[:, :2])
        np.putmask(tmat, np.isnan(tmat), rmat)
    print("regression min", np.min(tmat), "max", np.max(tmat))
    grid.cell_arrays['Elevation'] = tmat
    surf = grid.extract_surface()
    surf = surf.ctp()
    surf.points[:, 2] = surf.point_arrays['Elevation']
    return surf
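# A self-contained sketch (toy data, not part of the function above) of the
# same idea: grow the radius until no query point is left without neighbors.
# In recent scikit-learn versions, queries with no neighbors inside the radius
# come back as NaN together with a warning, which is what the loop above
# relies on to decide whether to keep growing the radius.
import numpy as np
from sklearn.neighbors import RadiusNeighborsRegressor

gen = np.random.default_rng(0)
pts = gen.uniform(0, 100, size=(200, 2))            # scattered sample locations
z = np.sin(pts[:, 0] / 10.0) + pts[:, 1] / 50.0     # values to interpolate
queries = gen.uniform(0, 100, size=(50, 2))         # query locations

radius = 1.0
pred = np.full(len(queries), np.nan)
while np.any(np.isnan(pred)):
    radius *= 1.5
    neigh = RadiusNeighborsRegressor(radius=radius, weights='distance')
    neigh.fit(pts, z)
    new_pred = neigh.predict(queries)
    np.putmask(pred, np.isnan(pred), new_pred)
print("final radius:", radius)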
def powerproduction():
    if fl.request.method == "POST":
        speed = float(fl.request.form['speed'])
        # speed = requests.get(data['input_s'])
        # import csv data and convert to pandas dataframe
        df = pd.read_csv("powerproduction.csv")
        # remove all zeros
        df = df[df.power != 0]
        # put rows in order of speed
        df = df.sort_values('speed')
        # set each column to a numpy array for processing
        S = df['speed'].to_numpy()
        p = df['power'].to_numpy()
        neigh_radius = RadiusNeighborsRegressor(radius=1.7, weights='distance', p=2)
        neigh_radius.fit(S.reshape(-1, 1), p)
        p_pred = neigh_radius.predict([[speed]])
        # cast to a plain float so the dict can be serialised as JSON
        return {'value': float(p_pred[0])}
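# Hedged client-side sketch (not part of the original app) showing how the
# handler above could be exercised from Python. The URL path and port are
# assumptions (the route decorator is not shown above); the form field name
# 'speed' and the {'value': ...} JSON response mirror the handler.
import requests

resp = requests.post("http://127.0.0.1:5000/powerproduction", data={"speed": 12.5})
print(resp.json()["value"])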
def process_data(input_true, input_reco, segment_label, group_label):
    """
    arguments are Nx5 from processing data
    input_true: energy depositions
    input_reco: charge depositions
    segment_label: fivetypes label
    group_label: particle instance

    purpose is to find M non-ghost reco voxels and set target energies for them
    based on blurring

    returns tuple of neural network inputs and other useful stuff (it's messy, sorry)
    element 0: [size Mx12] corresponding to input_reco (5) + one-hot encoded fivetypes+ghost (6) + blurred energy target (1)
    element 1: [size M] group label of voxel
    element 2: [size M] indices in input_true of voxels that have been reconstructed
    element 3: [size Mx5] input_true intersection with reco, where the last element in each row is blurred energy
    """
    chosen_indices = []
    chosen_reco_indices = []
    current_batch = 0
    current_batch_selection = np.where(input_true[:, -2] == current_batch)[0]
    current_input_true = input_true[current_batch_selection]
    for r in range(len(input_reco)):
        row = input_reco[r]
        b = row[-2]
        if b != current_batch:
            current_batch = b
            current_batch_selection = np.where(
                input_true[:, -2] == current_batch)[0]
            # keep the per-batch view of input_true in sync with the selection
            current_input_true = input_true[current_batch_selection]
        pos = row[:3]
        region_selection = np.where((current_input_true[:, 0] == pos[0]) &
                                    (current_input_true[:, 1] == pos[1]))[0]
        input_true_region = current_input_true[region_selection]
        for i in range(len(input_true_region)):
            row2 = input_true_region[i]
            pos2 = row2[:3]
            if np.array_equal(pos, pos2):
                chosen_indices.append(
                    current_batch_selection[region_selection[i]])
                chosen_reco_indices.append(r)
                break
    if len(chosen_indices) == 0:
        return None
    chosen_indices = np.array(chosen_indices)
    chosen_reco_indices = np.array(chosen_reco_indices)
    lost_data = np.delete(input_true, chosen_indices, axis=0)
    found_data = input_true[chosen_indices]

    # find where the chosen indices are in the group data
    lost_group_data = -np.ones((len(lost_data), len(lost_data[0])))
    ungrouped_data = -np.ones((len(lost_data), len(lost_data[0])))
    found_group_data = -np.ones((len(found_data), len(found_data[0])))
    for i in range(len(lost_data)):
        row = lost_data[i]
        filter0 = group_label[np.where(group_label[:, -2] == row[-2])]
        filter1 = filter0[np.where(filter0[:, 0] == row[0])]
        filter2 = filter1[np.where(filter1[:, 1] == row[1])]
        filter3 = filter2[np.where(filter2[:, 2] == row[2])]
        if len(filter3) == 0:
            ungrouped_data[i] = row
        else:
            g = filter3[0]
            lost_group_data[i] = g
    for i in range(len(found_data)):
        row = found_data[i]
        filter0 = group_label[np.where(group_label[:, -2] == row[-2])]
        filter1 = filter0[np.where(filter0[:, 0] == row[0])]
        filter2 = filter1[np.where(filter1[:, 1] == row[1])]
        filter3 = filter2[np.where(filter2[:, 2] == row[2])]
        g = filter3[0]
        found_group_data[i] = g

    if ADD_MISSING_ENERGY:
        batches = np.unique(input_true[:, 3])
        for b in batches:
            # nearest neighbor assignment within group
            found_groups = np.unique(
                found_group_data[np.where(found_group_data[:, 3] == b)][:, -1])
            lost_batch_mask = lost_group_data[:, 3] == b
            found_batch_mask = found_group_data[:, 3] == b
            for g in found_groups:
                lost_selection = np.where(lost_batch_mask &
                                          (lost_group_data[:, -1] == g))[0]
                found_selection = np.where(found_batch_mask &
                                           (found_group_data[:, -1] == g))[0]
                ldata = lost_data[lost_selection]
                fdata = found_data[found_selection]
                lost_positions = ldata[:, :3]
                found_positions = fdata[:, :3]
                distances = distance_matrix(lost_positions, found_positions)
                closest_points = np.argmin(distances, axis=1)
                closest_energies = ldata[:, -1]
                for i in range(len(closest_points)):
                    found_data[found_selection[closest_points[i]]][-1] += closest_energies[i]
            # associate ungrouped voxels with nearest voxels, regardless of group
            lost_ungrouped = np.where((ungrouped_data[:, 3] == b))[0]
            if len(lost_ungrouped) > 0:
                found_selection = np.where(found_batch_mask)[0]
                ldata = lost_data[lost_ungrouped]
                fdata = found_data[found_selection]
                lost_positions = ldata[:, :3]
                found_positions = fdata[:, :3]
                distances = distance_matrix(lost_positions, found_positions)
                closest_points = np.argmin(distances, axis=1)
                closest_energies = ldata[:, -1]
                for i in range(len(closest_points)):
                    found_data[found_selection[closest_points[i]]][-1] += closest_energies[i]

    if BLUR_ENERGY:
        blur_kernel = 3
        for g in np.unique(found_group_data[:, -1]):
            inds = np.where(found_group_data[:, -1] == g)
            selection = found_data[inds]
            total_energy = np.sum(selection[:, -1])
            coords = selection[:, :3]
            energies = selection[:, -1]
            neigh = RadiusNeighborsRegressor(radius=blur_kernel)
            neigh.fit(coords, energies)
            selection[:, -1] = neigh.predict(coords)
            selection[:, -1] *= total_energy / np.sum(selection[:, -1])
            found_data[inds, -1] = selection[:, -1]

    segment_indices = segment_label[chosen_indices, -1].astype(int)
    segment_one_hot = np.zeros((len(segment_indices), 5))
    segment_one_hot[np.arange(len(segment_indices)), segment_indices] = 1
    out = np.concatenate((input_reco[chosen_reco_indices], segment_one_hot,
                          np.expand_dims(found_data[:, -1], axis=1)), axis=1)
    return np.array(out), found_group_data[:, -1], chosen_indices, found_data
    ax.set_xticks(list(ax.get_xticks()) + [best_radius])
    ax.plot(radii, mae_rnn, c='orange', linewidth=2)
    fig.savefig('rnn_param.png')
    return best_radius


knn_regressor = KNeighborsRegressor(n_neighbors=get_best_knn_n_neighbors(1, 100),
                                    weights='distance')
knn_regressor.fit(train_df[['temperatura', 'vacuo']], train_df[['energia']])

rnn_regressor = RadiusNeighborsRegressor(radius=get_best_rnn_radius(1.7, 3.0, 0.05),
                                         weights='distance')
rnn_regressor.fit(train_df[['temperatura', 'vacuo']], train_df[['energia']])

lr_regressor = LinearRegression()
lr_regressor.fit(train_df[['temperatura', 'vacuo']], train_df[['energia']])

energia_knn = knn_regressor.predict(test_df[['temperatura', 'vacuo']])
energia_rnn = rnn_regressor.predict(test_df[['temperatura', 'vacuo']])
energia_lr = lr_regressor.predict(test_df[['temperatura', 'vacuo']])

fig, ax = plt.subplots()
ax.set_title('Evaluation of regression algorithms')
ax.set_ylabel('Mean absolute error')
ax.set_ylim(0, 5)
ax.set_yticks(np.arange(0, 5, 1.5))
rects = ax.bar(x=['kNN', 'rNN', 'LR'],
print(code_optimisation())

##############################
# Building the model
# ++++++++++++++++++

filename = "onnx_to_profile.onnx"

if not os.path.exists(filename):
    print(f"Generate a graph for {filename!r}.")
    X = numpy.random.randn(1000, 10).astype(numpy.float64)
    y = X.sum(axis=1).reshape((-1, 1))

    model = RadiusNeighborsRegressor()
    model.fit(X, y)
    onx = to_onnx(model, X, options={'optim': 'cdist'})

    with open(filename, "wb") as f:
        f.write(onx.SerializeToString())

#####################################
# Functions
# +++++++++
#
# We need to generate random inputs to test the graph.


def random_input(typ, shape, batch):
    if typ == 'tensor(double)':
        dtype = numpy.float64
y_pred = np.zeros(len(y), dtype=y.dtype)  # where we'll accumulate predictions
clf = RadiusNeighborsRegressor(radius=15)

# CV loop
for train_index, test_index in kf:
    # for each iteration of the loop we do a train/test split
    X_train, X_test = X[train_index], X[test_index]
    y_train, y_test = y[train_index], y[test_index]
    t = StandardScaler()
    X_train = t.fit_transform(X_train)
    clf.fit(X_train, y_train)  # train clf on the training fold
    X_test = t.transform(X_test)
    y_pred[test_index] = clf.predict(X_test)  # predict with clf on the test fold and store in y_pred

r2_score(y, y_pred)
rmse = sqrt(mean_squared_error(y, y_pred))
print("RadiusNeighborsRegressor RMSE:", rmse)

### Prediction ###
result = clf.predict(test_feats)
result = np.asarray(result)
np.savetxt("result.csv", result, delimiter=",")
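# Hedged sketch (not shown in the original snippet) of how the `kf` iterable
# used above might be built with the current scikit-learn API: KFold lives in
# sklearn.model_selection and yields (train_index, test_index) pairs from
# .split(). X is the same feature matrix used in the loop; n_splits=5 and the
# shuffle settings are assumptions.
from sklearn.model_selection import KFold

kf = KFold(n_splits=5, shuffle=True, random_state=0).split(X)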
def process_data(data):
    """
    data: dict with input_true, input_reco, segment_label, group_label
    returns: (input, output)
        input: intersection between reco and true, labeled with reco charge depositions
        output: intersection between reco and true, labeled with adjusted energy depositions
    """
    input_true = data['input_true']
    input_reco = data['input_reco']
    segment_label = data['segment_label']
    group_label = data['group_label']
    chosen_indices = []
    chosen_reco_indices = []
    current_batch = 0
    current_batch_selection = np.where(input_true[:, -2] == current_batch)[0]
    current_input_true = input_true[current_batch_selection]
    for r in range(len(input_reco)):
        row = input_reco[r]
        b = row[-2]
        if b != current_batch:
            current_batch = b
            current_batch_selection = np.where(
                input_true[:, -2] == current_batch)[0]
            # keep the per-batch view of input_true in sync with the selection
            current_input_true = input_true[current_batch_selection]
        pos = row[:3]
        region_selection = np.where((current_input_true[:, 0] == pos[0]) &
                                    (current_input_true[:, 1] == pos[1]))[0]
        input_true_region = current_input_true[region_selection]
        for i in range(len(input_true_region)):
            row2 = input_true_region[i]
            pos2 = row2[:3]
            if np.array_equal(pos, pos2):
                chosen_indices.append(
                    current_batch_selection[region_selection[i]])
                chosen_reco_indices.append(r)
                break
    if len(chosen_indices) == 0:
        return None
    chosen_indices = np.array(chosen_indices)
    chosen_reco_indices = np.array(chosen_reco_indices)
    lost_data = np.delete(input_true, chosen_indices, axis=0)
    found_data = input_true[chosen_indices]

    # find where the chosen indices are in the group data
    lost_group_data = -np.ones((len(lost_data), len(lost_data[0])))
    ungrouped_data = -np.ones((len(lost_data), len(lost_data[0])))
    found_group_data = -np.ones((len(found_data), len(found_data[0])))
    for i in range(len(lost_data)):
        row = lost_data[i]
        filter0 = group_label[np.where(group_label[:, -2] == row[-2])]
        filter1 = filter0[np.where(filter0[:, 0] == row[0])]
        filter2 = filter1[np.where(filter1[:, 1] == row[1])]
        filter3 = filter2[np.where(filter2[:, 2] == row[2])]
        if len(filter3) == 0:
            ungrouped_data[i] = row
        else:
            g = filter3[0]
            lost_group_data[i] = g
    for i in range(len(found_data)):
        row = found_data[i]
        filter0 = group_label[np.where(group_label[:, -2] == row[-2])]
        filter1 = filter0[np.where(filter0[:, 0] == row[0])]
        filter2 = filter1[np.where(filter1[:, 1] == row[1])]
        filter3 = filter2[np.where(filter2[:, 2] == row[2])]
        g = filter3[0]
        found_group_data[i] = g
    # lost_group_data = np.delete(group_label, chosen_indices, axis=0)
    # found_group_data = group_label[chosen_indices]

    if ADD_MISSING_ENERGY:
        batches = np.unique(input_true[:, 3])
        for b in batches:
            # nearest neighbor assignment within group
            found_groups = np.unique(
                found_group_data[np.where(found_group_data[:, 3] == b)][:, -1])
            lost_batch_mask = lost_group_data[:, 3] == b
            found_batch_mask = found_group_data[:, 3] == b
            for g in found_groups:
                lost_selection = np.where(lost_batch_mask &
                                          (lost_group_data[:, -1] == g))[0]
                found_selection = np.where(found_batch_mask &
                                           (found_group_data[:, -1] == g))[0]
                ldata = lost_data[lost_selection]
                fdata = found_data[found_selection]
                lost_positions = ldata[:, :3]
                found_positions = fdata[:, :3]
                distances = distance_matrix(lost_positions, found_positions)
                closest_points = np.argmin(distances, axis=1)
                closest_energies = ldata[:, -1]
                for i in range(len(closest_points)):
                    found_data[found_selection[closest_points[i]]][-1] += closest_energies[i]
            # associate ungrouped voxels with nearest voxels, regardless of group
            lost_ungrouped = np.where((ungrouped_data[:, 3] == b))[0]
            if len(lost_ungrouped) > 0:
                found_selection = np.where(found_batch_mask)[0]
                ldata = lost_data[lost_ungrouped]
                fdata = found_data[found_selection]
                lost_positions = ldata[:, :3]
                found_positions = fdata[:, :3]
                distances = distance_matrix(lost_positions, found_positions)
                closest_points = np.argmin(distances, axis=1)
                closest_energies = ldata[:, -1]
                for i in range(len(closest_points)):
                    found_data[found_selection[closest_points[i]]][-1] += closest_energies[i]

    if BLUR_ENERGY:
        blur_kernel = 3
        for g in np.unique(found_group_data[:, -1]):
            inds = np.where(found_group_data[:, -1] == g)
            selection = found_data[inds]
            total_energy = np.sum(selection[:, -1])
            coords = selection[:, :3]
            energies = selection[:, -1]
            neigh = RadiusNeighborsRegressor(radius=blur_kernel)
            neigh.fit(coords, energies)
            selection[:, -1] = neigh.predict(coords)
            selection[:, -1] *= total_energy / np.sum(selection[:, -1])
            found_data[inds, -1] = selection[:, -1]

    segment_indices = segment_label[chosen_indices, -1].astype(int)
    segment_one_hot = np.zeros((len(segment_indices), 5))
    segment_one_hot[np.arange(len(segment_indices)), segment_indices] = 1
    out = np.concatenate((input_reco[chosen_reco_indices], segment_one_hot,
                          np.expand_dims(found_data[:, -1], axis=1)), axis=1)
    return np.array(out), found_group_data[:, -1]
from sklearn.neighbors import KNeighborsRegressor

KNN_reg = KNeighborsRegressor(n_neighbors=6, weights='uniform')
KNN_reg.fit(x_train, y_train)
y_predict_knn = KNN_reg.predict(x_test)
y_predict_knn[0:10]

from sklearn.neighbors import RadiusNeighborsRegressor

RNN_reg = RadiusNeighborsRegressor(radius=x_train.std())
RNN_reg.fit(x_train, y_train)
y_predict_rnn = RNN_reg.predict(x_test)
y_predict_rnn[0:10]

RNN_reg = RadiusNeighborsRegressor()
RNN_reg.fit(x_train, y_train)
RNN_reg.predict(x_test)

from sklearn.metrics import mean_absolute_error, mean_squared_error

mean_absolute_error(y_test, y_predict_knn)
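# For a like-for-like comparison with the kNN error above, the same metrics
# can also be computed for the radius-based predictions (a small addition,
# not part of the original notebook):
mean_absolute_error(y_test, y_predict_rnn)
mean_squared_error(y_test, y_predict_rnn)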
# reg = GradientBoostingRegressor()
# reg = HistGradientBoostingRegressor()
# kernel = DotProduct() + WhiteKernel()
# reg = GaussianProcessRegressor(kernel=kernel, random_state=0)  # awful
# reg = LogisticRegression()
# reg = Ridge(alpha=1.0)  # not good
# reg = BayesianRidge()  # not good
# reg = PoissonRegressor()  # not good
# reg = TweedieRegressor()  # not good
# reg = GammaRegressor()  # not good
# reg = MLPRegressor(random_state=0, max_iter=500)  # not good
# reg = DecisionTreeRegressor()  # not too great
# reg = KNeighborsRegressor(n_neighbors=5, algorithm="auto", weights="uniform", leaf_size=30)
reg = RadiusNeighborsRegressor(radius=4.3)
# reg = SVR(C=60, gamma='auto')

# print(cross_val_score(reg, X, y, cv=10))
reg.fit(X_train, y_train)
# accuracy = reg.score(X_test, y_test)
# print(accuracy)

predictions = reg.predict(X)
df['Prediction'] = predictions
df = df.loc["2015-03-13"]
df['Total_Feeder'].plot()
df['Prediction'].plot()
plt.show()
print "Intercept: ", lin3.intercept_ for k, v in enumerate(lin3.coef_[0]): print threeYrXcol[k], ": ", v # KNeighborsRegressor kn3 = KNReg(weights='uniform') #kn3.fit(df_3avg[threeYrXcol].values, df_3avg[threeYrycol].values) kn3.fit(X_train, y_train) print "Train: ", kn3.score(X_train, y_train) print "Test: ", kn3.score(X_test, y_test) # print kn3.score(df_3avg[threeYrXcol].values, df_3avg[threeYrycol].values) # RadiusNeighborsRegressor rn3 = RNReg(radius=7.0) #rn3.fit(df_3avg[threeYrXcol].values, df_3avg[threeYrycol].values) rn3.fit(X_train, y_train) print "Train: ", rn3.score(X_train, y_train) print "Test: ", rn3.score(X_test, y_test) print rn3.score(df_3avg[threeYrXcol].values, df_3avg[threeYrycol].values) # Test 2010/11/12 stats and 2013 projections against 2013 actuals y=2013 y3 = [y-1,y-2,y-3] tms_include = np.intersect1d(df[df.Year == y3[0]].Team.values, df[df.Year == y3[2]].Team.values) df2012 = pd.merge(df[(df.Year.isin(y3)) & (df.Team.isin(tms_include))].groupby('Team')[Xvar].mean(), df[(df.Year == y3[0]) & (df.Team.isin(tms_include))].groupby('Team')[Xvar].mean(), how='left',left_index=True, right_index=True, suffixes=['_3yr_avg','_yr3']) df2012['f2013'] = lin3.predict(df2012.values) df2012.sort('f_yr3', ascending=False, inplace=True) df2012['rnk_2012'] = range(1,df2012.shape[0]+1) df2012.sort('f2013', ascending=False, inplace=True) df2012['rnk_2013'] = range(1,df2012.shape[0]+1) #df2012.to_csv('f2013_projection_3yrs.csv', headers=True,index=True)
import sys

import pandas as pd
import numpy as np
from sklearn.neighbors import RadiusNeighborsRegressor

# Read the training and test data
df = pd.read_hdf(sys.argv[1])
tdf = pd.read_hdf(sys.argv[2])

# Convert to the numpy arrays used by scikit-learn
X_train = df[['lat', 'lon']].to_numpy()
y_train = df.length.to_numpy() * 15
X_test = tdf[['lat', 'lon']].to_numpy()
y_test = tdf.length.to_numpy() * 15
id_test = tdf.index.to_numpy()

# Initialize the model
model = RadiusNeighborsRegressor(radius=0.0005, weights='distance')

# Training
model.fit(X_train, y_train)

# Prediction
y_try = model.predict(X_test)

# Write the results
resdf = pd.DataFrame({'idx': id_test,
                      'predict': y_try,
                      'actual': y_test}).set_index('idx')
resdf.to_csv(sys.argv[3])
class Model:
    def __init__(self):
        pass

    def NN_Build(self, train_x, train_y):
        """Build the neural network model."""
        self.NN = MLPRegressor(hidden_layer_sizes=(50,), activation='relu',
                               solver='adam', alpha=0.0001, batch_size='auto',
                               learning_rate='adaptive', learning_rate_init=0.001)
        self.NN.fit(train_x, train_y)

    def NN_Predict(self, test):
        """Predict the result with the neural network on the input data."""
        pre_result = self.NN.predict(test)
        return pre_result

    def DT_Build(self, train_x, train_y):
        """Build the decision tree model."""
        self.DT = tree.DecisionTreeRegressor()
        self.DT.fit(train_x, train_y)

    def DT_Predict(self, test):
        """Predict the result with the decision tree on the input data."""
        pre_result = self.DT.predict(test)
        return pre_result

    def SVM_Build(self, train_x, train_y):
        """SVM_Build"""
        self.clf = svm.SVR()
        self.clf.fit(train_x, train_y)

    def SVM_Predict(self, test):
        """SVM_Predict"""
        pre_result = self.clf.predict(test)
        return pre_result

    def KNN_Build(self, train_x, train_y):
        """KNN_Build"""
        self.kneigh = KNeighborsRegressor(n_neighbors=2)
        self.kneigh.fit(train_x, train_y)

    def KNN_Predict(self, test):
        """KNN_Predict"""
        pre_result = self.kneigh.predict(test)
        return pre_result

    def RNN_Build(self, train_x, train_y):
        """RNN_Build"""
        self.rneigh = RadiusNeighborsRegressor(radius=1.0)
        self.rneigh.fit(train_x, train_y)

    def RNN_Predict(self, test):
        """RNN_Predict"""
        pre_result = self.rneigh.predict(test)
        return pre_result

    def pre_plot(self, train_y, pre_result, start, end, ti):
        """Compare the predicted result with the true label."""
        train_y = pd.DataFrame(train_y[start:end], columns=['train_y'])
        pre_result = pd.DataFrame(pre_result[start:end], columns=['pre_result'])
        result = pd.concat([train_y, pre_result], axis=1)
        # print(result)
        result.plot(title=ti)
        plt.show()

    def Get_MSE(self, test_y, pre_result):
        """
        Compute the MSE between the labels and the predictions:
        MSE = sum(pow(test_y - pre_result, 2)) / len(test_y)
        """
        diff = []
        for i in range(len(test_y)):
            diff.append(pow(test_y[i] - pre_result[i], 2))
        result = np.sum(diff) / len(test_y)
        print('MSE: ', result)

    def Algorithm_compare(self, train_x, train_y, test_x, test_y):
        """Plot the results predicted by the models."""
        self.NN_Build(train_x, train_y)
        self.DT_Build(train_x, train_y)
        self.SVM_Build(train_x, train_y)
        self.KNN_Build(train_x, train_y)
        self.RNN_Build(train_x, train_y)
        self.NN_result = self.NN_Predict(test_x)
        self.DT_result = self.DT_Predict(test_x)
        self.SVM_result = self.SVM_Predict(test_x)
        self.KNN_result = self.KNN_Predict(test_x)
        self.RNN_result = self.RNN_Predict(test_x)
        self.pre_plot(test_y, self.NN_result, 0, len(self.NN_result), 'NN_result')
        self.Evaluate(test_y, self.NN_result)
        self.pre_plot(test_y, self.DT_result, 0, len(self.DT_result), 'DT_result')
        self.Evaluate(test_y, self.DT_result)
        self.pre_plot(test_y, self.SVM_result, 0, len(self.SVM_result), 'SVM_result')
        self.Evaluate(test_y, self.SVM_result)
        self.pre_plot(test_y, self.KNN_result, 0, len(self.KNN_result), 'KNN_result')
        self.Evaluate(test_y, self.KNN_result)
        self.pre_plot(test_y, self.RNN_result, 0, len(self.RNN_result), 'RNN_result')
        self.Evaluate(test_y, self.RNN_result)

    def Evaluate(self, test_y, pre_result):
        MSE = mean_squared_error(test_y, pre_result)
        MAE = mean_absolute_error(test_y, pre_result)
        EVS = explained_variance_score(test_y, pre_result)
        print('MSE: ', MSE)
        print('MAE: ', MAE)
        print('EVS: ', EVS)
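# A small usage sketch for the Model class above (not from the original
# source): toy data, a default radius of 1.0, and the sklearn.metrics imports
# used by Evaluate are all assumptions.
import numpy as np

if __name__ == '__main__':
    gen = np.random.RandomState(0)
    train_x = gen.rand(200, 3)
    train_y = train_x.sum(axis=1) + 0.1 * gen.randn(200)
    test_x = gen.rand(50, 3)
    test_y = test_x.sum(axis=1)

    m = Model()
    m.RNN_Build(train_x, train_y)
    rnn_pred = m.RNN_Predict(test_x)
    m.Evaluate(test_y, rnn_pred)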
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from sklearn.neighbors import RadiusNeighborsRegressor
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_log_error
from functions import errors

data = pd.read_csv("forestfires.csv")
data = data.drop(labels=['month', 'day'], axis=1)
y = data.area
x = data.drop(labels=['area'], axis=1)
x_train, x_test, y_train, y_test = train_test_split(x, y, random_state=10)

reg = RadiusNeighborsRegressor()
reg.fit(x_train, y_train)
y_predict = reg.predict(x_test)
for i in range(len(y_predict)):
    print("pred %s act %s" % (y_predict[i], y_test.ravel()[i]))
errors(y_test, y_predict)
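# With the default radius of 1.0 on unscaled forest-fire features, test points
# can end up with no neighbors inside the radius, which in recent scikit-learn
# versions yields NaN predictions (with a warning). A hedged variant, not part
# of the original script: scale the features and use a larger radius; the
# radius value here is an illustrative assumption, not a tuned choice.
from sklearn.pipeline import make_pipeline
from sklearn.preprocessing import StandardScaler

scaled_reg = make_pipeline(StandardScaler(),
                           RadiusNeighborsRegressor(radius=5.0, weights='distance'))
scaled_reg.fit(x_train, y_train)
y_predict_scaled = scaled_reg.predict(x_test)
errors(y_test, y_predict_scaled)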
def mydist(x, y):
    distance_assignement = (0. if x[0] == y[0] else 1.)
    distance_time = (0. if x[2] == y[2] else 1.)
    distance_day = (0. if x[1] == y[1] else 1.)
    # distance_week_day = (1 if x[0]==y[0] else 0)
    # distance_time = abs(x[3] - y[3]) % 1440
    distance = distance_assignement + distance_time + distance_day
    return distance


# dist = neighbors.DistanceMetric.get_metric('pyfunc', func=distance)

preprocessing = fp.feature_preprocessing()
preprocessing.full_preprocess(used_columns=['ASS_ID', 'WEEK_DAY', 'TIME', 'CSPL_RECEIVED_CALLS'])
data = preprocessing.data[:1000]
Y = data['CSPL_RECEIVED_CALLS']
X = data.drop(['CSPL_RECEIVED_CALLS'], axis=1)
X_train, X_test, y_train, y_test = cross_validation.train_test_split(X, Y, test_size=0.1, random_state=0)

neigh = RadiusNeighborsRegressor(radius=0.5, metric='pyfunc', func=mydist, algorithm='auto')
print('fitting...')
neigh.fit(X_train, y_train)
print('fitted')
# error = neigh.score(X_test, y_test)
# print(error)
y_pred = neigh.predict(X_test)
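# Aside (not from the original source): the metric='pyfunc' / func=... form and
# the sklearn.cross_validation module above are from older scikit-learn
# releases. In current releases a user-defined metric is passed directly as a
# callable, and train_test_split lives in sklearn.model_selection. A hedged
# equivalent of the fit above, reusing X, Y and mydist:
from sklearn.model_selection import train_test_split
from sklearn.neighbors import RadiusNeighborsRegressor

X_train, X_test, y_train, y_test = train_test_split(X, Y, test_size=0.1, random_state=0)
neigh = RadiusNeighborsRegressor(radius=0.5, metric=mydist, algorithm='brute')
neigh.fit(X_train, y_train)
y_pred = neigh.predict(X_test)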
from sklearn.neighbors import KNeighborsRegressor
from sklearn.metrics import mean_squared_error
from math import sqrt

print("KNN ...{}".format(""))
knnreg = KNeighborsRegressor(n_neighbors=1)
knnreg.fit(data, Y)
y_KNNpred = knnreg.predict(data)
trainrms = sqrt(mean_squared_error(Y, y_KNNpred))
print("KNN PCA : trainrms {}".format(trainrms))

print("Rad ...{}".format(""))
from sklearn.neighbors import RadiusNeighborsRegressor
radreg = RadiusNeighborsRegressor(weights='distance', radius=10.3)
radreg.fit(data, Y)
y_radpred = radreg.predict(data)
trainrms = sqrt(mean_squared_error(Y, y_radpred))
print("Rad PCA : trainrms {}".format(trainrms))

# =============================================================================
# end Feature selection
# =============================================================================

# Instantiate a Gaussian Process model
# kernel = C(1.0, (1e-3, 1e3)) * RBF(10, (1e-2, 1e2))
print("RF ...{}".format(""))
from sklearn.ensemble import RandomForestRegressor
RFregr = RandomForestRegressor(n_estimators=301, random_state=0, oob_score=True)
forest = RandomForestRegressor(n_estimators=100, n_jobs=2, oob_score=True)
adaboost = AdaBoostRegressor()
nb = GaussianNB()
rd = RidgeClassifierCV()

kf = KFold(n_splits=5)
for train_index, test_index in kf.split(variables):
    # print("TRAIN:", train_index, "TEST:", test_index)
    X_train, X_test = variables.iloc[train_index], variables.iloc[test_index]
    y_train = report['survey_participant'].iloc[train_index]
    y_test = report['survey_participant'].iloc[test_index]
    forest.fit(X_train, y_train)
    adaboost.fit(X_train, y_train)
    gdc.fit(X_train, y_train)
    rd.fit(X_train, y_train)
    rgr.fit(X_train, y_train)
    nb.fit(X_train, y_train)
    lr.fit(X_train, y_train)
    et.fit(X_train, y_train)
    # print(forest.feature_importances_)
    y_hat = list(gdc.predict(X_test))
    print('GDC', sum((y_hat - y_test) ** 2) / float(len(y_test)))
    y_hat = list(rd.predict(X_test))
    print('RD', sum((y_hat - y_test) ** 2) / float(len(y_test)))
    y_hat = list(et.predict(X_test))
    print('ET', sum((y_hat - y_test) ** 2) / float(len(y_test)))
    y_hat = list(lr.predict(X_test))
    print('LR', sum((y_hat - y_test) ** 2) / float(len(y_test)))
    y_hat = list(forest.predict(X_test))
    print('RFRegressor', sum((y_hat - y_test) ** 2) / float(len(y_test)))
# Read training dataset
df = pd.read_csv(TRAINING_DATASET, header=None)  # read from the first line
columns = len(df.columns)
rows = len(df.index)
print('Training dataset:', "{:,}".format(len(df.index)), 'x', "{:,}".format(len(df.columns)))

df_y = df.iloc[:, columns - 1]   # last column is the target
df_x = df.iloc[:, :columns - 1]  # remaining columns are the features
X = np.array(df_x)
Y = np.array(df_y)

neigh = RadiusNeighborsRegressor(radius=KNN_RADIUS)
neigh.fit(X, Y)

# Read test dataset
testFiles = [file for file in os.listdir(TEST_DATASET_DIRECTORY) if str(file).find('test') >= 0]
print('Number of test files:', len(testFiles))

TEST_Y_ALL = np.array([])
TEST_Y_ALL_PREDICTED = np.array([])
for file in testFiles:
    df = pd.read_csv(TEST_DATASET_DIRECTORY + '/' + file, header=None)  # read from the first line
    df_y = df.iloc[:, columns - 1]
    df_x = df.iloc[:, :columns - 1]
    X = np.array(df_x)
    Y = np.array(df_y)
from data.transformed_data import *
from data.raw_data import data_dir
from sklearn.neighbors import RadiusNeighborsRegressor

regressor = RadiusNeighborsRegressor()
regressor.fit(train_x, train_y)
print('RadiusNeighborsRegressor rmse:{}'.format(
    RMSLE(validation_y, regressor.predict(validation_x))))

predict = regressor.predict(test[col])
test['visitors'] = np.expm1(predict)
test['visitors'] = test['visitors'].clip(lower=0.)
test[['id', 'visitors']].to_csv(data_dir + 'submission_radius_neighbors_regressor.csv',
                                index=False)
del globals()['profilesDF']
del globals()['profiles']
del globals()['profilesLSo']
del globals()['profilesLS']
del globals()['row']
del globals()['tmpLS']
del globals()['tmpAGE']
del globals()['profsTOlikes']
del globals()['i']
del globals()['tmpIND']

seed = 7
myRand = np.random.seed(seed)

X_train, X_test, y_train, y_test = train_test_split(likesMAT, consARR, test_size=1500)

myRAD = float(sys.argv[1])
radNN = RadiusNeighborsRegressor(radius=myRAD)
# radNN.fit(likesMAT, consARR)
radNN.fit(X_train, y_train)
y_pred = radNN.predict(X_test)

import math
myRMSE = math.sqrt(metrics.mean_squared_error(y_test, y_pred))
print("cons, Radius neighbors: ", str(myRAD), " ", myRMSE)

# joblib.dump(radNN, "/Users/jamster/radNN-A-cons.xz", compress=9)
# impRadNN = joblib.load("/Users/jamster/radNN-A-cons.xz")
# Add noise
y[::5] += 3 * (0.5 - np.random.rand(8))

# Fit the models
svr_rbf = SVR(kernel='rbf', C=1e3, gamma=0.1)
svr_lin = SVR(kernel='linear', C=1e3)
svr_poly = SVR(kernel='poly', C=1e3, degree=2)
knng = KNeighborsRegressor(n_neighbors=6, weights='uniform')
rng = RadiusNeighborsRegressor(radius=1.0, weights='uniform')
dtr = DecisionTreeRegressor(criterion='squared_error')
abr = AdaBoostRegressor(n_estimators=50)
rfr = RandomForestRegressor(n_estimators=50)
svr_rbf.fit(X, y), svr_lin.fit(X, y), svr_poly.fit(X, y)
knng.fit(X, y), rng.fit(X, y), dtr.fit(X, y)
abr.fit(X, y), rfr.fit(X, y)

# Support vector regression
y_rbf = svr_rbf.predict(X)
y_lin = svr_lin.predict(X)
y_poly = svr_poly.predict(X)

# KNN regression
y_knng = knng.predict(X)
y_rng = rng.predict(X)

# Decision tree regression
y_dtr = dtr.predict(X)

# ensemble