def createRadiusNeighborsRegressor(params=None):
    info("Creating Radius Neighbors Regressor", ind=4)
    # NOTE: this estimator is currently disabled; everything below the early
    # return is unreachable until the two lines below are removed.
    error("This doesn't work")
    return {"estimator": None, "params": None}

    ## Params
    params = mergeParams(RadiusNeighborsRegressor(), params)
    tuneParams = getRadiusNeighborsRegressorParams()
    grid = tuneParams['grid']

    info("With Parameters", ind=4)
    algorithm = setParam('algorithm', params, grid, force=False)
    info("Param: algorithm = {0}".format(algorithm), ind=6)
    leaf_size = setParam('leaf_size', params, grid, force=False)
    info("Param: leaf_size = {0}".format(leaf_size), ind=6)
    metric = setParam('metric', params, grid, force=False)
    info("Param: metric = {0}".format(metric), ind=6)
    radius = setParam('radius', params, grid, force=False)
    info("Param: radius = {0}".format(radius), ind=6)
    weights = setParam('weights', params, grid, force=False)
    info("Param: weights = {0}".format(weights), ind=6)

    ## Estimator
    reg = RadiusNeighborsRegressor(algorithm=algorithm, leaf_size=leaf_size,
                                   metric=metric, radius=radius,
                                   weights=weights)
    return {"estimator": reg, "params": tuneParams}
def sampling_fix(df, name, start, stop, radius, medianFilter, plot):
    # Filter dataset based on depth range
    df = df[(df['Measured Depth m'] > start) & (df['Measured Depth m'] < stop)]
    # Remove NaNs from dataset
    df = df[np.isfinite(df[name])]
    X = df['Measured Depth m']
    # Reshape the depth to match regressor requirements
    X = X.values.reshape(X.shape[0], 1)
    from sklearn.neighbors import RadiusNeighborsRegressor
    # Define regressor with provided radius
    reg = RadiusNeighborsRegressor(radius=radius, weights='uniform')
    # Apply median filter with back-filling (to remove NaNs at the
    # beginning of the dataset)
    y = df[name].rolling(medianFilter).median().bfill()
    # Fit regressor
    reg.fit(X, y)
    # Either return the fitted model or plot it
    if plot == 0:
        return reg
    else:
        import matplotlib.pyplot as plt
        # Plot the original data together with the regression curve
        plt.scatter(X, y, label=name)
        plt.plot(X, reg.predict(X), c='r', label="prediction")
        plt.legend()
        plt.show()
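# Usage sketch for sampling_fix above. Everything here is illustrative:
# the DataFrame, the 'ROP m/h' log name and the parameter values are
# synthetic stand-ins, not values from the original project.
import numpy as np
import pandas as pd

depth = np.linspace(100.0, 200.0, 500)
demo_df = pd.DataFrame({
    'Measured Depth m': depth,
    'ROP m/h': np.sin(depth / 10.0) + np.random.normal(0, 0.1, depth.size),
})
# plot=0 returns the fitted regressor instead of drawing the chart
rop_model = sampling_fix(demo_df, 'ROP m/h', start=110, stop=190,
                         radius=2.0, medianFilter=5, plot=0)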
def get_best_rnn_radius(low, high, step):
    """
    Return the best radius value in step range [low, high] to be used
    in the RNN (radius neighbors) algorithm.
    """
    radii = []
    mae_rnn = []
    for r in np.arange(low, high + step, step):
        rnn_regressor = RadiusNeighborsRegressor(radius=r, weights='distance')
        rnn_regressor.fit(train_df[['temperatura', 'vacuo']],
                          train_df[['energia']])
        energia_rnn = rnn_regressor.predict(test_df[['temperatura', 'vacuo']])
        radii.append(r)
        mae_rnn.append(
            metrics.mean_absolute_error(test_df['energia'], energia_rnn))
    best_radius = radii[np.argmin(mae_rnn)]
    fig, ax = plt.subplots()
    ax.set_title('Parameter evaluation for RNN')
    ax.set_xlabel('Radius')
    ax.set_ylabel('Mean absolute error')
    ax.set_xlim(low, high)
    ax.set_xticks(list(ax.get_xticks()) + [best_radius])
    ax.plot(radii, mae_rnn, c='orange', linewidth=2)
    fig.savefig('rnn_param.png')
    return best_radius
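# Usage sketch for get_best_rnn_radius above. The function reads the
# module-level train_df / test_df frames and the metrics / plt imports,
# so this sketch builds synthetic stand-ins with the expected columns;
# all values are illustrative.
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from sklearn import metrics
from sklearn.neighbors import RadiusNeighborsRegressor

rng = np.random.default_rng(0)
train_df = pd.DataFrame(rng.normal(size=(500, 3)),
                        columns=['temperatura', 'vacuo', 'energia'])
test_df = pd.DataFrame(rng.uniform(-1, 1, size=(50, 3)),
                       columns=['temperatura', 'vacuo', 'energia'])
best_r = get_best_rnn_radius(0.5, 2.0, 0.5)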
def rNeighbours2dPlot(X, y, r=0.5, res=100, dist_scale='normalize',
                      im_kws=None, reg_kws=None, ax=None):
    # Avoid the mutable-default-argument pitfall: fresh dicts per call
    im_kws = {} if im_kws is None else dict(im_kws)
    reg_kws = {} if reg_kws is None else dict(reg_kws)
    if isinstance(X, pd.core.frame.DataFrame):
        X = X.values
    # Fill in sensible imshow defaults unless the caller overrides them.
    # (The original tested reg_kws for 'origin'; im_kws is the dict
    # actually being set, so test that instead.)
    if 'origin' not in im_kws:
        im_kws['origin'] = 'lower'
    if 'extent' not in im_kws:
        im_kws['extent'] = (X[:, 0].min(), X[:, 0].max(),
                            X[:, 1].min(), X[:, 1].max())
    if 'aspect' not in im_kws:
        im_kws['aspect'] = ((X[:, 0].max() - X[:, 0].min())
                            / (X[:, 1].max() - X[:, 1].min()))
    if dist_scale is not None:
        if dist_scale == 'normalize':
            X = X / (X.max(axis=0) - X.min(axis=0))
        else:
            X = X / dist_scale
    kneighbours = RadiusNeighborsRegressor(radius=r, **reg_kws)
    kneighbours.fit(X, y)
    xx, yy = np.meshgrid(np.linspace(X[:, 0].min(), X[:, 0].max(), res),
                         np.linspace(X[:, 1].min(), X[:, 1].max(), res))
    X_grid = np.vstack([xx.ravel(), yy.ravel()]).T
    y_hat = kneighbours.predict(X_grid)
    Y_hat = y_hat.reshape((res, res))
    if ax is None:
        return plt.imshow(Y_hat, **im_kws)
    else:
        return ax.imshow(Y_hat, **im_kws)
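# Usage sketch for rNeighbours2dPlot above, on a synthetic 2-D surface
# (the feature ranges and the radius value are illustrative).
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt

X_demo = np.random.rand(300, 2)
y_demo = np.sin(6 * X_demo[:, 0]) + X_demo[:, 1]
rNeighbours2dPlot(X_demo, y_demo, r=0.2)
plt.show()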
def __init__(self, args, env_params):
    # Save args
    self.args, self.env_params = args, env_params
    # Create the KNN model
    self.knn_model = RadiusNeighborsRegressor(radius=args.neighbor_radius,
                                              weights='uniform')
    # Flag
    self.is_fit = False
def __init__(self, in_dim, radius, out_dim):
    # Save args
    self.in_dim = in_dim
    self.radius = radius
    self.out_dim = out_dim
    # Create the KNN model
    self.knn_model = RadiusNeighborsRegressor(radius=radius,
                                              weights='uniform',
                                              metric='manhattan')
    # Flag
    self.is_fit = False
def plot_std_dev(folder):
    data = load_images(folder)
    for channel in range(data.shape[3]):
        channel_stack = data[:, :, :, channel]
        std_dev_img = np.std(channel_stack, axis=0)
        mean_img = np.mean(channel_stack, axis=0)
        if 1:
            plt.subplot(2, 2, 1)
            display_image(mean_img, z=1)
            plt.title('mean')
            plt.subplot(2, 2, 2)
            display_image(std_dev_img, z=1)
            plt.title('std')
            plt.subplot(2, 2, 3)
            display_image(mean_img / std_dev_img, z=1)
            plt.title('mean / std')
            plt.subplot(2, 2, 4)
            bins = np.arange(np.min(channel_stack), np.max(channel_stack) + 1)
            plt.hist(channel_stack.flatten(), bins=bins)
            plt.grid(True)
            plt.show()
        # Smooth the std-vs-mean relation with a radius-based regression
        rnr = RadiusNeighborsRegressor(radius=10, weights='distance')
        rnr.fit(np.expand_dims(mean_img.flatten(), axis=1),
                std_dev_img.flatten())
        line_x = np.arange(np.min(mean_img), np.max(mean_img) + 1)
        line_y = rnr.predict(np.expand_dims(line_x, axis=1))
        # Compare against a simple linear fit
        fit = np.polyfit(mean_img.flatten(), std_dev_img.flatten(), deg=1)
        linear_y = np.polyval(fit, line_x)
        plt.scatter(mean_img.flatten(), std_dev_img.flatten(),
                    alpha=0.1, color='black', s=1)
        plt.plot(line_x, line_y, 'r')
        plt.plot(line_x, linear_y, 'orange')
        plt.grid(True)
        plt.show()
class KNNDynamicsResidual:
    def __init__(self, args, env_params):
        # Save args
        self.args, self.env_params = args, env_params
        # Create the KNN model
        self.knn_model = RadiusNeighborsRegressor(
            radius=args.neighbor_radius, weights='uniform')
        # Flag
        self.is_fit = False

    def fit(self, X, y):
        '''
        X should be the data matrix N x d, where each row is a 4D vector
        consisting of object pos and gripper pos.
        y should be the target matrix N x d, where each row is a 4D vector
        consisting of next object pos and next gripper pos.
        '''
        self.knn_model.fit(X, y)
        self.is_fit = True
        return self.loss(X, y)

    def predict(self, X):
        '''
        X should be the data matrix N x d, where each row is a 4D vector
        consisting of object pos and gripper pos.
        '''
        ypred = np.zeros(X.shape)
        if not self.is_fit:  # KNN model is not fit
            return ypred
        # Get neighbors of X
        neighbors = self.knn_model.radius_neighbors(X)
        # Check which rows of X have at least one neighbor
        neighbor_mask = [x.shape[0] != 0 for x in neighbors[1]]
        # If none of X has any neighbors
        if X[neighbor_mask].shape[0] == 0:
            return ypred
        # Else, for the X that have neighbors use the KNN prediction
        ypred[neighbor_mask] = self.knn_model.predict(X[neighbor_mask])
        return ypred

    def get_num_neighbors(self, X):
        if not self.is_fit:
            return np.zeros(X.shape[0])
        neighbors = self.knn_model.radius_neighbors(X)
        num_neighbors = np.array([x.shape[0] for x in neighbors[1]])
        return num_neighbors

    def loss(self, X, y):
        ypred = self.predict(X)
        # Loss is just the mean distance between predictions and true targets
        loss = np.linalg.norm(ypred - y, axis=1).mean()
        return loss
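# Usage sketch for KNNDynamicsResidual above. 'args' is stood in by a
# SimpleNamespace carrying neighbor_radius; env_params is not touched by
# the methods shown, so None suffices. The 4-D transition data is synthetic.
from types import SimpleNamespace
import numpy as np
from sklearn.neighbors import RadiusNeighborsRegressor

demo_args = SimpleNamespace(neighbor_radius=0.5)
dyn = KNNDynamicsResidual(demo_args, env_params=None)
X = np.random.uniform(size=(100, 4))     # object pos + gripper pos
y = X + 0.01 * np.random.randn(100, 4)   # next-step positions
train_loss = dyn.fit(X, y)
counts = dyn.get_num_neighbors(X)        # neighbours found per query row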
def estimate_ns_act(self, term, coords=None, **kwargs):
    """
    Uses KNN to estimate Neurosynth term activation (tf-idf) at specified
    coordinates. If no coordinates are passed, ABA sampled locations in the
    corresponding NsabaBase are used.

    Parameters
    ----------
    term : str
        NS term whose activation is to be estimated.
    coords : np.array [int], optional
        Coordinates where NS term activation is to be estimated.
    kwargs : dict, optional
        'rnn_args' : dict
            Optional arguments forwarded to sklearn's
            RadiusNeighborsRegressor(); see
            http://scikit-learn.org/stable/modules/generated/sklearn.neighbors.RadiusNeighborsRegressor.html
            for the defaults.
    """
    if not self.is_term(term):
        raise ValueError("'%s' is not a registered term." % term)

    self.term[term] = {}
    if coords is None:
        coords = self._aba['mni_coords'].data
        self.term[term]['coord_type'] = 'ABA MNI'
    else:
        self.term[term]['coords'] = coords
        if 'coord_type' in kwargs:
            self.term[term]['coord_type'] = kwargs['coord_type']

    ns_coord_tree, ns_coord_act_df = self._term_to_coords(term, 0)

    if 'rnn_args' in kwargs:
        if 'radius' not in kwargs['rnn_args']:
            kwargs['rnn_args']['radius'] = 5
        self.term[term]['classifier'] = RadiusNeighborsRegressor(
            **kwargs['rnn_args'])
    else:
        self.term[term]['classifier'] = RadiusNeighborsRegressor(radius=5)

    X = ns_coord_tree.data
    y = ns_coord_act_df[term].as_matrix()
    self.term[term]['classifier'].fit(X, y)
    with warnings.catch_warnings():
        warnings.simplefilter("ignore")
        self.term[term]['act'] = self.term[term]['classifier'].predict(coords)
def test_onnx_simple_text_plot_knnr(self):
    x = numpy.random.randn(10, 3)
    y = numpy.random.randn(10)
    model = RadiusNeighborsRegressor(3)
    model.fit(x, y)
    onx = to_onnx(model, x.astype(numpy.float32), target_opset=15)
    text = onnx_simple_text_plot(onx, verbose=False)
    expected = " Neg(arange_y0) -> arange_Y0"
    self.assertIn(expected, text)
    self.assertIn(", to=7)", text)
    self.assertIn(", keepdims=0)", text)
    self.assertIn(", perm=[1,0])", text)
def __init__(self, radius=1.0, weights='uniform', algorithm='auto',
             leaf_size=30, p=2, metric='minkowski', metric_params=None,
             **kwargs):
    # Keyword arguments keep this compatible with recent scikit-learn,
    # where the estimator parameters are keyword-only.
    _RadiusNeighborsRegressor.__init__(
        self, radius=radius, weights=weights, algorithm=algorithm,
        leaf_size=leaf_size, p=p, metric=metric,
        metric_params=metric_params, **kwargs)
    BaseWrapperReg.__init__(self)
class _RadiusNeighborsRegressorImpl:
    def __init__(self, **hyperparams):
        self._hyperparams = hyperparams
        self._wrapped_model = Op(**self._hyperparams)

    def fit(self, X, y=None):
        if y is not None:
            self._wrapped_model.fit(X, y)
        else:
            self._wrapped_model.fit(X)
        return self

    def predict(self, X):
        return self._wrapped_model.predict(X)
def GridSearchCVRadiusNeighborsRegressor(X_train, y_train):
    param_grid = {
        'weights': ['uniform', 'distance'],
        'radius': range(1, 100),
        'algorithm': ['auto', 'ball_tree', 'kd_tree', 'brute'],
    }
    model = RadiusNeighborsRegressor()
    scores = ['r2']
    # Note: the 'iid' argument was deprecated in scikit-learn 0.22 and
    # removed in 0.24, so it is no longer passed here. The unused list of
    # candidate values that preceded param_grid was also dropped.
    reg = GridSearchCV(model, cv=3, param_grid=param_grid, verbose=0,
                       n_jobs=-1, scoring=scores, refit='r2')
    reg.fit(X_train, y_train)
    return reg.best_estimator_, reg.best_params_, reg.best_score_
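# Usage sketch for GridSearchCVRadiusNeighborsRegressor above. The data is
# synthetic and kept inside the unit cube so every radius in the grid (>= 1)
# finds neighbours; the target weights and noise level are illustrative.
import numpy as np

rng = np.random.default_rng(0)
X_demo = rng.random((120, 4))
y_demo = X_demo @ np.array([1.0, -2.0, 0.5, 3.0]) + rng.normal(0, 0.1, 120)
best_model, best_params, best_score = GridSearchCVRadiusNeighborsRegressor(
    X_demo, y_demo)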
def test_model_knn_regressor_radius(self):
    model, X = self._fit_model(RadiusNeighborsRegressor())
    model_onnx = convert_sklearn(
        model, "KNN regressor", [("input", FloatTensorType([None, 4]))],
        target_opset=TARGET_OPSET, options={id(model): {'optim': 'cdist'}})
    sess = InferenceSession(model_onnx.SerializeToString())
    got = sess.run(None, {'input': X.astype(numpy.float32)})[0]
    exp = model.predict(X.astype(numpy.float32))
    if any(numpy.isnan(got.ravel())):
        # The model unexpectedly produces nan values,
        # though not on all platforms.
        rows = ['--EXP--', str(exp), '--GOT--', str(got),
                '--EVERY-OUTPUT--']
        for out in enumerate_model_node_outputs(model_onnx, add_node=False):
            onx = select_model_inputs_outputs(model_onnx, out)
            sess = InferenceSession(onx.SerializeToString())
            res = sess.run(None, {'input': X.astype(numpy.float32)})
            rows.append('--{}--'.format(out))
            rows.append(str(res))
        if (StrictVersion(onnxruntime.__version__) <
                StrictVersion("1.4.0")):
            return
        raise AssertionError('\n'.join(rows))
    assert_almost_equal(exp.ravel(), got.ravel(), decimal=3)
def __init__(self, regression=True, radius=1.0, weights='distance',
             algorithm='auto', leaf_size=30, p=2, metric='minkowski',
             outlier_label=None, metric_params=None):
    self._regression = regression
    self._radius = radius
    self._weights = weights
    self._algorithm = algorithm
    self._leaf_size = leaf_size
    self._p = p
    self._metric = metric
    self._metric_params = metric_params
    self._outlier_label = outlier_label
    if regression:
        self._model = RadiusNeighborsRegressor(
            radius=radius, weights=weights, algorithm=algorithm,
            leaf_size=leaf_size, p=p, metric=metric,
            metric_params=metric_params)
    else:
        # Keyword arguments avoid a subtle bug in the original: passed
        # positionally, the seventh classifier argument is outlier_label,
        # not metric_params.
        self._model = RadiusNeighborsClassifier(
            radius=radius, weights=weights, algorithm=algorithm,
            leaf_size=leaf_size, p=p, metric=metric,
            outlier_label=outlier_label, metric_params=metric_params)
    super().__init__()
def fit(self, featureMatrix, labels):
    # locally weighted regression
    if self.method.lower() == 'lwr':
        kernel = self.params['kernel']
        fit_intercept = self.params['fit_intercept']
        alpha = self.params['alpha']
        self.model = local_regression.LWRegressor(
            kernel=kernel, alpha=alpha, fit_intercept=fit_intercept)
    # radius neighbors regression
    elif self.method.lower() == 'rnn':
        radius = self.params['radius']
        weights = self.params['weights']
        leaf_size = self.params['leaf_size']
        self.model = RadiusNeighborsRegressor(
            radius=radius, weights=weights, leaf_size=leaf_size)
    # k-nearest neighbors regression
    elif self.method.lower() == 'knn':
        n_neighbors = self.params['n_neighbors']
        weights = self.params['weights']
        leaf_size = self.params['leaf_size']
        p = self.params['p']
        self.model = KNeighborsRegressor(
            n_neighbors=n_neighbors, weights=weights,
            leaf_size=leaf_size, p=p)
    # linear regression
    else:
        self.model = linear_model.LinearRegression(fit_intercept=False)
    # fit model to data
    self.model.fit(featureMatrix, labels)
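# Sketch of the parameter dictionaries each branch of fit() above reads.
# The key names come straight from the code; the values are illustrative.
lwr_params = {'kernel': 'gaussian', 'alpha': 1.0, 'fit_intercept': True}
rnn_params = {'radius': 1.0, 'weights': 'distance', 'leaf_size': 30}
knn_params = {'n_neighbors': 5, 'weights': 'uniform', 'leaf_size': 30, 'p': 2}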
def _check_coords_for_distance_weighting(self, coords, check_radius,
                                         check_weights, X, y_mean):
    """ Checks that coords won't break the distance weighting function """
    valid_inds = []
    for ind, coord in enumerate(coords):
        temp = RadiusNeighborsRegressor(radius=check_radius,
                                        weights=check_weights)
        temp.fit(X, y_mean)
        try:
            temp.predict([coord])
            valid_inds.append(ind)
        except ZeroDivisionError:
            continue
    return valid_inds
def get_hyperparameters_model():
    param_dist = {}
    clf = RadiusNeighborsRegressor()
    model = {'radius_neighbors_regressor': {
        'model': clf,
        'param_distributions': param_dist}}
    return model
def get_author_list_with_pruning_method(feature_list, author_list, qp, radius):
    """
    feature_list - the feature list holding the stylometric features
    author_list - the author list indicating which author wrote each paragraph
    qp - the query point, usually representing a document

    Returns a shortened author list, which can greatly reduce the size of the
    training set by removing data points too far from the query point. Since
    computing the Hausdorff distance takes time, reducing the size of the
    candidate set speeds up the process.

    See the following link for more information:
    http://scikit-learn.org/stable/modules/generated/sklearn.neighbors.RadiusNeighborsRegressor.html#sklearn.neighbors.RadiusNeighborsRegressor
    """
    neigh = RadiusNeighborsRegressor(radius=radius, algorithm='brute', p=2)
    neigh.fit(feature_list, author_list)
    return neigh.radius_neighbors(qp, return_distance=True)
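# Usage sketch for get_author_list_with_pruning_method above, with random
# stand-in stylometric features. Note that radius_neighbors expects a 2-D
# query, hence the (1, n_features) shape for qp.
import numpy as np

features = np.random.rand(50, 8)   # one row of style features per paragraph
authors = np.arange(50) % 5        # author id for each paragraph
qp = np.random.rand(1, 8)          # query document
distances, indices = get_author_list_with_pruning_method(
    features, authors, qp, radius=1.0)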
def build_model(args, C, seed):
    if args.dc_tree:
        model = DecisionTreeRegressor(random_state=seed)
    elif args.nn_radius:
        model = RadiusNeighborsRegressor(radius=1.0)
    else:
        # Use the C passed into this function; the original referenced an
        # undefined complexities[comp] here.
        model = svm.LinearSVR(C=C, random_state=seed)
    return model
def predict(self):
    """
    Trains a scikit-learn (https://scikit-learn.org)
    RadiusNeighborsRegressor on the training features, predicts y values
    for the test feature set, compares the predictions against the y_test
    values passed in, and returns the accuracy.
    """
    algorithm = RadiusNeighborsRegressor(radius=get_ohe_config().rnr_radius)
    algorithm.fit(self.X_train, self.y_train)
    y_pred = list(algorithm.predict(self.X_test))
    self.acc = OneHotPredictor.get_accuracy(y_pred, self.y_test)
    return self.acc
def test_model_knn_regressor_yint_radius(self):
    model, X = self._fit_model(RadiusNeighborsRegressor(), label_int=True)
    model_onnx = convert_sklearn(
        model, "KNN regressor", [("input", FloatTensorType([None, 4]))],
        target_opset=TARGET_OPSET)
    self.assertIsNotNone(model_onnx)
    dump_data_and_model(
        X.astype(numpy.float32)[:7], model, model_onnx,
        basename="SklearnRadiusNeighborsRegressorYInt")
def compare_multiple_stacks(folder):
    subfolders = os.listdir(folder)
    all_data = []
    for subfolder in tqdm.tqdm(subfolders):
        all_data.append(load_images(os.path.join(folder, subfolder)))
    all_data = np.array(all_data)
    print(all_data.shape)
    for channel in range(3):
        for subfolder_index in range(all_data.shape[0]):
            channel_stack = all_data[subfolder_index][:, :, :, channel]
            img_mean = np.mean(channel_stack, axis=0)
            img_sigma_clip = np.mean(
                astropy.stats.sigma_clip(channel_stack, sigma=2, axis=0),
                axis=0)
            img_sigma_ratio = (img_mean / img_sigma_clip - 1) * 1E3
            skip = 1
            flat_ratios = img_sigma_ratio.flatten()[::skip]
            mean_values = img_mean.flatten()[::skip]
            # Smooth the ratio-vs-brightness relation with a
            # radius-based regression
            rnr = RadiusNeighborsRegressor(radius=50, weights='uniform')
            rnr.fit(np.expand_dims(mean_values, axis=1),
                    flat_ratios.flatten())
            x = np.arange(np.min(mean_values) + 200,
                          np.max(mean_values) + 1 - 200, 10)
            line_y = rnr.predict(np.expand_dims(x, axis=1))
            plt.plot(x, line_y, label=str(subfolder_index))
        plt.legend()
        plt.grid(True)
        plt.show()
def compare_error_vs_brightness(folder):
    data = load_images(folder)
    for channel in range(data.shape[3]):
        channel_stack = data[:, :, :, channel]
        img_mean = np.mean(channel_stack, axis=0)
        img_sigma_clip = np.mean(
            astropy.stats.sigma_clip(channel_stack, sigma=2, axis=0), axis=0)
        img_sigma_ratio = (img_mean / img_sigma_clip - 1) * 1E3
        x = np.arange(np.min(img_mean), np.max(img_mean) + 1)
        bit_flip_change = 128 if channel == 1 else 256
        y_top = ((channel_stack.shape[0] * x)
                 / (channel_stack.shape[0] * x - bit_flip_change) - 1) * 1E3
        y_bottom = ((channel_stack.shape[0] * x)
                    / (channel_stack.shape[0] * x + bit_flip_change) - 1) * 1E3
        plt.plot(x, y_top, 'r')
        plt.plot(x, y_bottom, 'r')
        plt.scatter(img_mean.flatten(), img_sigma_ratio.flatten(),
                    alpha=0.1, color='black', s=1)
        rnr = RadiusNeighborsRegressor(radius=50, weights='distance')
        rnr.fit(np.expand_dims(img_mean.flatten(), axis=1),
                img_sigma_ratio.flatten())
        x = np.arange(np.min(img_mean), np.max(img_mean) + 1)
        line_y = rnr.predict(np.expand_dims(x, axis=1))
        plt.plot(x, line_y, 'g')
        plt.grid(True)
        plt.show()
# The original defined a factory with the same name as sklearn's estimator,
# which shadowed the class and made the call below recurse into itself.
# Importing the sklearn class under an alias fixes the infinite recursion
# while keeping the factory's public name.
from sklearn.neighbors import RadiusNeighborsRegressor as _SKRadiusNeighborsRegressor


def RadiusNeighborsRegressor(radius=1.0, weights='distance',
                             algorithm='auto', p=2):
    model = _SKRadiusNeighborsRegressor(radius=radius, weights=weights,
                                        algorithm=algorithm, leaf_size=30,
                                        p=p, metric='minkowski',
                                        metric_params=None)
    return model
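# Usage sketch for the factory above, on synthetic 1-D data
# (data shapes and the radius value are illustrative).
import numpy as np

X_demo = np.linspace(0, 10, 200).reshape(-1, 1)
y_demo = np.sin(X_demo).ravel()
reg = RadiusNeighborsRegressor(radius=0.5)
reg.fit(X_demo, y_demo)
y_hat = reg.predict(np.linspace(0.05, 9.95, 50).reshape(-1, 1))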
def grid_points_2d(mesh, cell_size=10):
    grid = vtk_Voxel.from_mesh(mesh, cell_size, 2)
    cells = grid.cell_centers().points
    radius = cell_size * 0.5
    tmat = np.full(cells.shape[0], np.nan)
    print("sample min", np.min(mesh.points[:, 2]),
          "max", np.max(mesh.points[:, 2]))
    while np.any(np.isnan(tmat)):
        # keep increasing radius until all cells have values
        radius *= 1.5
        print("RadiusNeighborsRegressor =", radius, "m")
        # Keyword arguments: 'weights' is keyword-only in recent sklearn
        neigh = RadiusNeighborsRegressor(radius=radius, weights='distance')
        neigh.fit(mesh.points[:, :2], mesh.points[:, 2])
        rmat = neigh.predict(cells[:, :2])
        np.putmask(tmat, np.isnan(tmat), rmat)
    print("regression min", np.min(tmat), "max", np.max(tmat))
    grid.cell_arrays['Elevation'] = tmat
    surf = grid.extract_surface()
    surf = surf.ctp()
    surf.points[:, 2] = surf.point_arrays['Elevation']
    return surf
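# Minimal standalone sketch of the growing-radius fill used above: widen the
# neighbourhood until every query cell receives a prediction. The scattered
# points and the 5 m starting radius are synthetic stand-ins; the 1.5x growth
# factor matches grid_points_2d.
import numpy as np
from sklearn.neighbors import RadiusNeighborsRegressor

pts = np.random.rand(200, 2) * 100.0      # scattered x, y sample locations
z = np.sin(pts[:, 0] / 10.0)              # elevation-like value per point
queries = np.random.rand(400, 2) * 100.0  # cell centres to fill
filled = np.full(queries.shape[0], np.nan)
radius = 5.0
while np.any(np.isnan(filled)):
    radius *= 1.5
    neigh = RadiusNeighborsRegressor(radius=radius, weights='distance')
    neigh.fit(pts, z)
    pred = neigh.predict(queries)          # nan where a cell has no neighbours
    np.putmask(filled, np.isnan(filled), pred)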
def powerproduction():
    if fl.request.method == "POST":
        speed = float(fl.request.form['speed'])
        # import csv data and convert to pandas dataframe
        df = pd.read_csv("powerproduction.csv")
        # remove all zeros
        df = df[df.power != 0]
        # put rows in order of speed
        df = df.sort_values('speed')
        # set each column to a numpy array for processing
        S = df['speed'].to_numpy()
        p = df['power'].to_numpy()
        neigh_radius = RadiusNeighborsRegressor(radius=1.7,
                                                weights='distance', p=2)
        neigh_radius.fit(S.reshape(-1, 1), p)
        p_pred = neigh_radius.predict([[speed]])
        return {'value': p_pred[0]}
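# Example request against the handler above. Assumptions: the app serves it
# at /powerproduction on localhost:5000; both the route path and the port
# are guesses based on the function name and Flask defaults.
import requests

resp = requests.post('http://localhost:5000/powerproduction',
                     data={'speed': 12.5})
print(resp.json())  # {'value': <predicted power>}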
def test_model_knn_regressor2_1_radius(self):
    model, X = self._fit_model_simple(
        RadiusNeighborsRegressor(algorithm="brute"), n_targets=2)
    X = X[:-1]
    model_onnx = convert_sklearn(
        model, "KNN regressor",
        [("input", FloatTensorType([None, X.shape[1]]))],
        target_opset=TARGET_OPSET)
    self.assertIsNotNone(model_onnx)
    sess = InferenceSession(model_onnx.SerializeToString())
    got = sess.run(None, {'input': X.astype(numpy.float32)})[0]
    exp = model.predict(X.astype(numpy.float32))
    if any(numpy.isnan(got.ravel())):
        # The model unexpectedly produces nan values,
        # though not on all platforms.
        # It happens when two matrices are multiplied,
        # one is (2, 20, 20), the second is (20, 20)
        # and contains only 0 or 1 values.
        # The output contains nan values on the first row
        # but not on the second one.
        rows = ['--EXP--', str(exp), '--GOT--', str(got),
                '--EVERY-OUTPUT--']
        for out in enumerate_model_node_outputs(model_onnx, add_node=False):
            onx = select_model_inputs_outputs(model_onnx, out)
            sess = InferenceSession(onx.SerializeToString())
            res = sess.run(None, {'input': X.astype(numpy.float32)})
            rows.append('--{}--'.format(out))
            rows.append(str(res))
        # Build the report once; the original only assigned 'whole' inside
        # the first version branch, leaving it undefined in the second.
        whole = '\n'.join(rows)
        if (onnxruntime.__version__.startswith('1.4.') or
                onnxruntime.__version__.startswith('1.5.')):
            # TODO: investigate the regression in onnxruntime 1.4
            # One broadcasted multiplication unexpectedly produces nan.
            if "[ nan" in whole:
                warnings.warn(whole)
                return
            raise AssertionError(whole)
        if (onnxruntime.__version__.startswith('1.3.') and
                sys.platform == 'win32'):
            # Same error but a different line number, for further
            # investigation.
            raise AssertionError(whole)
        raise AssertionError(whole)
    assert_almost_equal(exp, got, decimal=5)
def test_model_knn_regressor_double_radius(self):
    model, X = self._fit_model(RadiusNeighborsRegressor())
    model_onnx = convert_sklearn(
        model, "KNN regressor", [("input", DoubleTensorType([None, 4]))],
        target_opset=TARGET_OPSET, options={id(model): {'optim': 'cdist'}})
    self.assertIsNotNone(model_onnx)
    dump_data_and_model(
        X.astype(numpy.float64)[:7], model, model_onnx,
        basename="SklearnRadiusNeighborsRegressor64")
    dump_data_and_model(
        (X + 0.1).astype(numpy.float64)[:7], model, model_onnx,
        basename="SklearnRadiusNeighborsRegressor64")
def test_model_knn_regressor_weights_distance_11_radius(self):
    model, X = self._fit_model_simple(
        RadiusNeighborsRegressor(weights="distance", algorithm="brute",
                                 radius=100))
    for op in sorted(set([TARGET_OPSET, 12, 11])):
        if op > TARGET_OPSET:
            continue
        with self.subTest(opset=op):
            model_onnx = convert_sklearn(
                model, "KNN regressor",
                [("input", FloatTensorType([None, X.shape[1]]))],
                target_opset=op)
            self.assertIsNotNone(model_onnx)
            sess = InferenceSession(model_onnx.SerializeToString())
            got = sess.run(None, {'input': X.astype(numpy.float32)})[0]
            exp = model.predict(X.astype(numpy.float32))
            assert_almost_equal(exp, got.ravel(), decimal=3)
def initializeModel(name, param_1=5, neighbors=5, radius=1.0,
                    weights='uniform'):
    if name == 'knn':
        model = KNeighborsClassifier(n_neighbors=param_1)
    elif name == 'tree':
        model = tree.DecisionTreeClassifier()
    elif name == 'forest':
        model = RandomForestClassifier()
    elif name == 'knnr':
        model = KNeighborsRegressor(n_neighbors=neighbors)
    elif name == 'rnr':
        model = RadiusNeighborsRegressor(radius=radius, weights=weights,
                                         n_jobs=-1)
    else:
        # Fail fast instead of returning an unbound name
        raise ValueError("Unknown model name: %s" % name)
    return model
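# Usage sketch for initializeModel above; the model names mirror the
# branches in the function, and the parameter values are illustrative.
rnr_model = initializeModel('rnr', radius=2.0, weights='distance')
knn_model = initializeModel('knn', param_1=7)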
def mydist(x, y):
    # Count mismatches on assignment, day and time as unit distances
    distance_assignement = (0. if x[0] == y[0] else 1.)
    distance_time = (0. if x[2] == y[2] else 1.)
    distance_day = (0. if x[1] == y[1] else 1.)
    return distance_assignement + distance_time + distance_day


preprocessing = fp.feature_preprocessing()
preprocessing.full_preprocess(
    used_columns=['ASS_ID', 'WEEK_DAY', 'TIME', 'CSPL_RECEIVED_CALLS'])
data = preprocessing.data[:1000]
Y = data['CSPL_RECEIVED_CALLS']
X = data.drop(['CSPL_RECEIVED_CALLS'], axis=1)
X_train, X_test, y_train, y_test = cross_validation.train_test_split(
    X, Y, test_size=0.1, random_state=0)

# A callable can be passed directly as the metric; the old
# metric='pyfunc', func=... form is no longer supported by scikit-learn.
neigh = RadiusNeighborsRegressor(radius=0.5, metric=mydist, algorithm='auto')
print('fitting...')
neigh.fit(X_train, y_train)
print('fitted')
y_pred = neigh.predict(X_test)
from sklearn.naive_bayes import MultinomialNB
from sklearn import svm
from sklearn.neighbors import KNeighborsRegressor
from sklearn.neighbors import RadiusNeighborsRegressor
from sklearn.ensemble import AdaBoostRegressor
from sklearn.naive_bayes import GaussianNB
from sklearn.linear_model import LogisticRegression
from sklearn.linear_model import RidgeClassifierCV
from sklearn.ensemble import GradientBoostingClassifier

gdc = GradientBoostingClassifier()
lr = LogisticRegression()
clf = svm.SVR()
et = ExtraTreesClassifier()
rgr = RadiusNeighborsRegressor()
forest = RandomForestRegressor(n_estimators=100, n_jobs=2, oob_score=True)
adaboost = AdaBoostRegressor()
nb = GaussianNB()
rd = RidgeClassifierCV()

kf = KFold(report.shape[0], n_folds=5)
for train_index, test_index in kf:
    X_train, X_test = (variables.ix[list(train_index),],
                       variables.ix[list(test_index),])
    y_train = report['survey_participant'].ix[list(train_index),]
    y_test = report['survey_participant'].ix[list(test_index),]
    forest.fit(X_train, y_train)
    adaboost.fit(X_train, y_train)
    gdc.fit(X_train, y_train)
    rd.fit(X_train, y_train)
import sys

import pandas as pd
import numpy as np
from sklearn.neighbors import RadiusNeighborsRegressor

# Read the training and test data
df = pd.read_hdf(sys.argv[1])
tdf = pd.read_hdf(sys.argv[2])

# Convert to the numpy arrays scikit-learn works with
X_train = df.as_matrix(['lat', 'lon'])
y_train = (df.length.as_matrix()) * 15
X_test = tdf.as_matrix(['lat', 'lon'])
y_test = (tdf.length.as_matrix()) * 15
id_test = tdf.index.to_series().as_matrix()

# Initialize the model
model = RadiusNeighborsRegressor(radius=0.0005, weights='distance')

# Training
model.fit(X_train, y_train)

# Prediction
y_try = model.predict(X_test)

# Write out the results
resdf = pd.DataFrame({'idx': id_test, 'predict': (y_try),
                      'actual': (y_test)}).set_index('idx')
resdf.to_csv(sys.argv[3])
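# Example invocation for the script above (the file names are illustrative;
# both inputs must be pandas HDF5 stores with 'lat', 'lon' and 'length'
# columns, as the script expects):
#
#   python predict_length.py train.h5 test.h5 predictions.csv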
print "Train: ", lin3.score(X_train, y_train) print "Test: ", lin3.score(X_test, y_test) print "Intercept: ", lin3.intercept_ for k, v in enumerate(lin3.coef_[0]): print threeYrXcol[k], ": ", v # KNeighborsRegressor kn3 = KNReg(weights='uniform') #kn3.fit(df_3avg[threeYrXcol].values, df_3avg[threeYrycol].values) kn3.fit(X_train, y_train) print "Train: ", kn3.score(X_train, y_train) print "Test: ", kn3.score(X_test, y_test) # print kn3.score(df_3avg[threeYrXcol].values, df_3avg[threeYrycol].values) # RadiusNeighborsRegressor rn3 = RNReg(radius=7.0) #rn3.fit(df_3avg[threeYrXcol].values, df_3avg[threeYrycol].values) rn3.fit(X_train, y_train) print "Train: ", rn3.score(X_train, y_train) print "Test: ", rn3.score(X_test, y_test) print rn3.score(df_3avg[threeYrXcol].values, df_3avg[threeYrycol].values) # Test 2010/11/12 stats and 2013 projections against 2013 actuals y=2013 y3 = [y-1,y-2,y-3] tms_include = np.intersect1d(df[df.Year == y3[0]].Team.values, df[df.Year == y3[2]].Team.values) df2012 = pd.merge(df[(df.Year.isin(y3)) & (df.Team.isin(tms_include))].groupby('Team')[Xvar].mean(), df[(df.Year == y3[0]) & (df.Team.isin(tms_include))].groupby('Team')[Xvar].mean(), how='left',left_index=True, right_index=True, suffixes=['_3yr_avg','_yr3']) df2012['f2013'] = lin3.predict(df2012.values) df2012.sort('f_yr3', ascending=False, inplace=True) df2012['rnk_2012'] = range(1,df2012.shape[0]+1) df2012.sort('f2013', ascending=False, inplace=True)