def pwlf_test(x, y, breaks):
    """Fit a ``pwlf`` piecewise-linear model to (x, y), plot it and print its MSE.

    Args:
        x: Sample inputs; used for fitting and for the in-sample prediction.
        y: Sample targets.
        breaks: Break count; ``breaks + 1`` is passed to ``fit``.
    """
    fitter = pwlf.PiecewiseLinFit(x, y)
    fitter.fit(breaks + 1)
    predictions = fitter.predict(x)
    vis(x, y, predictions, '-', title='pwlf')
    # In-sample error: the model is scored on the data it was fitted to.
    error = mean_squared_error(y, predictions)
    print("pwlf mean square error is {}".format(error))
def piecewise_test(x, y, breaks):
    """Fit a ``PiecewiseRegression`` to (x, y), plot it and print its MSE.

    Args:
        x: Sample inputs; used for fitting and for the in-sample prediction.
        y: Sample targets.
        breaks: Value handed to the ``PiecewiseRegression`` constructor.
    """
    regressor = PiecewiseRegression(breaks)
    regressor.fit(x, y)
    predictions = regressor.predict(x)
    vis(x, y, predictions, '-', title='pr')
    # In-sample error: the model is scored on the data it was fitted to.
    error = mean_squared_error(y, predictions)
    print("mypr mean square error is {}".format(error))
def train(self, x_train, y_train, x_test, y_test):
    """Fit one degree-1 regression per segment located by the pretrained BFModel.

    The inputs are normalised with ``self._normalize``; a BFModel loaded from
    ``./pretrained/bfnet.model`` predicts per-sample values that
    ``self.find_locations`` turns into segment boundaries (``self.indices``).
    One ``PolynomialRegression(1)`` is fitted per pair of consecutive
    boundaries and stored in ``self.lrs``.

    Args:
        x_train: Training keys; only column 0 is used.
        y_train: Training targets; only column 0 is used.
        x_test: Test keys, predicted one at a time via ``self.predict``.
        y_test: Test targets.

    Returns:
        tuple: ``(mse, seconds)`` where ``mse`` is the test-set mean squared
        error and ``seconds`` covers boundary prediction plus segment fitting.
    """
    x_train, y_train, x_test, y_test = self._normalize(
        x_train, y_train, x_test, y_test)

    # Two-column training matrix: column 0 holds the key, column 1 the target.
    data = np.zeros((len(x_train), 2))
    data[:, 0] = x_train[:, 0]
    data[:, 1] = y_train[:, 0]

    model = BFModel(8)
    model.load('./pretrained/bfnet.model')

    start_time = timer()
    predicted_betas = model.predict(data)
    predicted_betas = predicted_betas.reshape(y_train.shape[0],)
    indices = self.find_locations(predicted_betas)
    self.indices = indices
    self.lrs = []
    keys = data[:, 0]
    for lower, upper in zip(indices, indices[1:]):
        # Both bounds must hold at once. Previously the lower-bound filter
        # was applied to the full data set and overwrote the upper-bound
        # filter, so every segment also contained all later segments.
        in_segment = (keys > lower) & (keys < upper)
        lr_train_data = data[in_segment]
        lr = PolynomialRegression(1)
        lr.fit(lr_train_data[:, 0], lr_train_data[:, 1])
        print(lr)
        self.lrs.append(lr)
    end_time = timer()

    yhat = np.array([self.predict(each) for each in x_test])
    mse = metrics.mean_squared_error(y_test, yhat)
    return mse, end_time - start_time
def evaluate_range_query(self, test_range_query):
    """Run one range query against every model and score the returned values.

    The query bounds are the first and last rows of ``test_range_query``
    (all columns but the last); the last column holds the expected values.

    Args:
        test_range_query: Frame-like object supporting ``.shape`` and
            ``.iloc``; its rows are the expected result of the query.

    Returns:
        tuple: ``(mses, build_times)``, one entry per model in
        ``self.models``. Models named ``'Scipy KD-Tree'`` are skipped and
        report ``0`` for both. A model returning the wrong number of entries
        gets an mse of ``-1``.
    """
    data_size = test_range_query.shape[0]
    build_times = []
    mses = []
    for idx, model in enumerate(self.models):
        if model.name == 'Scipy KD-Tree':
            mses.append(0)
            build_times.append(0)
            continue

        start_time = timer()
        y_pred = np.array(
            self.predict_range_query(idx,
                                     test_range_query.iloc[0, :-1],
                                     test_range_query.iloc[-1, :-1]))
        end_time = timer()

        if y_pred.shape[0] != data_size:
            # The values must be %-formatted into the message; passing them
            # as extra print() arguments left the raw "%d" in the output.
            print('Num of predicted entries in range query %d versus '
                  'expected entries %d' % (y_pred.shape[0], data_size))
            mse = -1
        else:
            ytrue = np.array(test_range_query.iloc[:, -1:])
            # axis=None sorts the flattened values. The default (last axis)
            # is a no-op on the (n, 1) expected column, which left ytrue
            # unsorted while y_pred was sorted.
            mse = metrics.mean_squared_error(np.sort(y_pred, axis=None),
                                             np.sort(ytrue, axis=None))
        mses.append(mse)

        if self.debug_print:
            print(
                "{} model tested in {:.4f} seconds with mse {:.4f}".format(
                    model.name, end_time - start_time, mse))
        build_times.append(end_time - start_time)
    return mses, build_times
def evaluate_point(self, test_data):
    """Run a point query for every row of ``test_data`` against every model.

    Each prediction is floor-divided by the model's ``page_size`` and, when
    ``self.sample_ratio`` is truthy, divided by that ratio before scoring.

    Args:
        test_data: Frame-like object supporting ``.shape`` and ``.iloc``;
            the last column holds the expected values, the rest the key.

    Returns:
        tuple: ``(mses, build_times)``, one entry per model in
        ``self.models``.
    """
    data_size = test_data.shape[0]
    if self.debug_print:
        print("[Point Query] Evaluating {} datapoints".format(data_size))

    mses, build_times = [], []
    for idx, model in enumerate(self.models):
        predicted = []
        start_time = timer()
        for row in range(data_size):
            raw = self.predict(idx, test_data.iloc[row, :-1])
            page = int(raw // model.page_size)
            predicted.append(
                page / self.sample_ratio if self.sample_ratio else page)
        end_time = timer()
        elapsed = end_time - start_time

        yhat = np.array(predicted).reshape(-1, 1)
        ytrue = np.array(test_data.iloc[:, -1:])
        mse = metrics.mean_squared_error(yhat, ytrue)
        mses.append(mse)
        if self.debug_print:
            print(
                "{} model tested in {:.4f} seconds with mse {:.4f}".format(
                    model.name, elapsed, mse))
        build_times.append(elapsed)
    return mses, build_times
def train(self, x_train, y_train, x_test, y_test):
    """Fit ``self.model`` on the training set and score it on the test set.

    Returns:
        tuple: ``(mse, seconds)`` where ``seconds`` is the duration of the
        ``fit`` call only.
    """
    fit_started = timer()
    self.model.fit(x_train, y_train)
    fit_finished = timer()

    predictions = self.model.predict(x_test)
    return (metrics.mean_squared_error(y_test, predictions),
            fit_finished - fit_started)
def train(self, x_train, y_train, x_test, y_test):
    """Train the staged models stage by stage and score them on the test set.

    Stage 0 is trained on the full training set. After each non-final stage,
    every training key is routed to a next-stage model based on the current
    stage's prediction, and each next-stage model is trained only on the keys
    routed to it. Models that receive no keys are stored as ``None``.

    Args:
        x_train: Training keys (iterated one key at a time).
        y_train: Training targets, indexable in step with ``x_train``.
        x_test: Test keys, predicted one at a time via ``self.predict``.
        y_test: Test targets.

    Returns:
        tuple: ``(mse, seconds)`` where ``seconds`` covers building all
        stages.
    """
    self.max_pos = np.max(y_train)
    # train_datas[stage][model_id] is the (xs, ys) pair routed to that model,
    # or (None, None) when no key was routed to it.
    train_datas = [[(x_train, y_train)]]
    start_time = timer()
    for stage in range(self.num_of_stages):
        number_unused_model = 0
        self.models.append([])
        for model_id in range(self.num_of_models[stage]):
            dataset = train_datas[stage][model_id]
            if dataset[0] is not None:
                model = self._build_single_model(
                    self.model_types[stage], dataset)
                self.models[stage].append(model)
            else:
                self.models[stage].append(None)

        if stage != self.num_of_stages - 1:
            # Not the last stage: distribute the training keys over the
            # models of the next stage.
            next_count = self.num_of_models[stage + 1]
            next_xs = [[] for _ in range(next_count)]
            next_ys = [[] for _ in range(next_count)]
            for index, key in enumerate(x_train):
                model_id = self.get_staged_output(key, stage)
                output = self.models[stage][model_id].predict(key)
                selected_model_id = int(
                    output * next_count / self.max_pos)
                # In case selected_model_id is not in range.
                selected_model_id = self.acceptable_next_model(
                    selected_model_id, stage)
                next_xs[selected_model_id].append(key)
                next_ys[selected_model_id].append(y_train[index])

            # Exactly one bucket list per stage. This used to be appended
            # once per next-stage model, leaving stray empty lists that
            # later stages only happened to reuse.
            train_datas.append([])
            for next_model_id in range(next_count):
                if next_xs[next_model_id]:
                    train_datas[stage + 1].append(
                        (next_xs[next_model_id], next_ys[next_model_id]))
                else:
                    # No key was routed here: mark the slot so that no
                    # model is built for it in the next stage.
                    number_unused_model += 1
                    train_datas[stage + 1].append((None, None))
            print("unused model at stage {}: {}".format(
                stage + 1, number_unused_model))
    end_time = timer()

    y_pred = [self.predict(each) for each in x_test]
    mse = metrics.mean_squared_error(y_test, y_pred)
    return mse, end_time - start_time
def train(self, x_train, y_train, x_test, y_test):
    """Normalise the data, fit ``self.net`` and score it on the test set.

    The network is fitted for ``self.epochs`` epochs with a batch size of 10.

    Returns:
        tuple: ``(mse, seconds)`` where ``seconds`` is the duration of the
        ``fit`` call only; ``mse`` is computed on the normalised test data.
    """
    normalised = self._normalize(x_train, y_train, x_test, y_test)
    x_train, y_train, x_test, y_test = normalised

    fit_started = timer()
    self.net.fit(x_train, y_train, epochs=self.epochs, batch_size=10)
    fit_finished = timer()

    predictions = self.net.predict(x_test)
    return (metrics.mean_squared_error(y_test, predictions),
            fit_finished - fit_started)
def train(self, x_train, y_train, x_test, y_test, dim=2):
    """Build the KD-tree from the training set and score nearest-neighbour lookups.

    Each training row is the key columns followed by the target columns.
    For every test key, the prediction is the last element of item ``[1]`` of
    the ``self.get_nearest`` result.

    Args:
        x_train: Training keys (2-D, stacked column-wise with ``y_train``).
        y_train: Training targets (2-D).
        x_test: Test keys.
        y_test: Test targets.
        dim: Dimensionality forwarded to ``self.get_nearest``. It used to be
            ignored in favour of a hard-coded 2; the default keeps existing
            callers unchanged.

    Returns:
        tuple: ``(mse, build_time)`` where ``build_time`` is whatever
        ``self.build_kd_tree`` returns.
    """
    # Build the tree from plain Python rows of [key..., target...].
    data_train = np.hstack((x_train, y_train)).tolist()
    build_time = self.build_kd_tree(data_train)

    # Look up every test key and keep the stored target of its neighbour.
    y_predict_test = np.array(
        [self.get_nearest(key, dim=dim)[1][-1] for key in x_test])
    mse = mean_squared_error(y_test, y_predict_test)
    return mse, build_time
def evaluate_knn_query(self, query, ytrue, k):
    """Run one k-nearest-neighbour query against every model and score it.

    Args:
        query: Query point; its first two elements are printed in debug mode.
        ytrue: Expected values for the query. Singleton dimensions are
            removed before comparison.
        k: Number of neighbours requested.

    Returns:
        tuple: ``(mses, build_times)``, one entry per model in
        ``self.models``. Models named ``'Scipy KD-Tree'`` or
        ``'Lisa Baseline'`` are skipped and report ``0`` for both. A model
        returning the wrong number of entries gets an mse of ``-1``.
    """
    if self.debug_print:
        print("[Point Query %d %d] Evaluating %d neighbours" %
              (query[0], query[1], k))

    # Normalise once, outside the loop. np.squeeze alone collapses a single
    # expected value to a 0-d array, on which .shape[0] raises IndexError;
    # atleast_1d keeps it as a length-1 vector.
    ytrue = np.atleast_1d(np.squeeze(ytrue))

    build_times = []
    mses = []
    for idx, model in enumerate(self.models):
        if model.name in ('Scipy KD-Tree', 'Lisa Baseline'):
            mses.append(0)
            build_times.append(0)
            continue

        start_time = timer()
        y_pred = np.array(self.predict_knn_query(idx, query, k))
        end_time = timer()

        if y_pred.shape[0] != ytrue.shape[0]:
            # Report the same counts that were just compared.
            print(
                f'Num of predicted entries in knn query {y_pred.shape[0]} versus expected {ytrue.shape[0]} entries'
            )
            mse = -1
        else:
            yhat = y_pred.reshape(-1, 1)
            mse = metrics.mean_squared_error(yhat, ytrue)
            if mse != 0:
                # Dump both vectors and every differing pair for debugging.
                print(yhat)
                print('\n\n\n\n')
                print(ytrue)
                for predicted, expected in zip(yhat.ravel(), ytrue):
                    if predicted != expected:
                        print(' Predicted y %d Expected y %d' %
                              (predicted, expected))
        mses.append(mse)

        if self.debug_print:
            print(
                "{} model tested in {:.4f} seconds with mse {:.4f}".format(
                    model.name, end_time - start_time, mse))
        build_times.append(end_time - start_time)
    return mses, build_times
def train(self, x_train, y_train, x_test, y_test):
    """Insert the sorted training pairs into the B-Tree and score lookups.

    Training pairs are sorted before insertion. For each test key, the
    prediction is the ``value`` of element ``[2]`` of the search result,
    floor-divided by ``self.page_size``. Progress is printed on a single
    line using carriage returns.

    Returns:
        tuple: ``(mse, seconds)`` where ``seconds`` covers the insertions
        (including their progress printing).
    """
    self.total_data_size = x_train.shape[0]
    ordered_pairs = sorted(zip(x_train, y_train))
    keys, values = (list(column) for column in zip(*ordered_pairs))

    insert_started = timer()
    for position in range(self.total_data_size):
        self.btree.insert(Item(keys[position], values[position]))
        print('{}/{} inserted into B-Tree'.format(position,
                                                  self.total_data_size),
              end='\r')
    insert_finished = timer()

    test_data_size = x_test.shape[0]
    predictions = []
    for position in range(test_data_size):
        found = self.btree.search(x_test[position])
        predictions.append(found[2].value // self.page_size)
        print('{}/{} tested B-Tree'.format(position, test_data_size),
              end='\r')

    mse = metrics.mean_squared_error(y_test, np.array(predictions))
    return mse, insert_finished - insert_started
def train(self, x_train, y_train, x_test, y_test):
    """Build the index from the training keys and score it on the test set.

    The training arrays are kept on the instance as ``self.x_train`` and
    ``self.y_train`` after the build.

    Returns:
        tuple: ``(mse, build_time)`` where ``build_time`` is the duration of
        the ``self.build`` call.
    """
    build_started = timer()
    self.build(x_train)
    build_finished = timer()
    build_time = build_finished - build_started

    self.y_train = y_train
    self.x_train = x_train

    predictions = [self.predict(key) for key in x_test]
    return metrics.mean_squared_error(y_test, predictions), build_time
def train(self, x_train, y_train, x_test, y_test):
    """Build the baseline's sorted dense array and score it on the test set.

    ``self.train_array`` is assembled as the key columns, the target column
    and one extra zero column, cast to float64, passed through
    ``self.mapping_function`` and sorted by column 3.
    NOTE(review): sorting on column 3 presumes two key columns, so that the
    extra column is the mapped value -- confirm against ``mapping_function``.

    Side effect: sets NumPy's global print threshold to 1000.

    Returns:
        tuple: ``(mse, seconds)`` where ``seconds`` covers the build. When
        ``self.init_dense_array`` returns ``-1`` the result is
        ``(-1, seconds_so_far)`` and no evaluation is done.
    """
    print(x_train.shape)
    print(x_test.shape)
    print(y_train.shape)
    print(y_test.shape)
    np.set_printoptions(threshold=1000)

    start_time = timer()
    self.train_array = np.hstack(
        (x_train,
         y_train.reshape(-1, 1),
         np.zeros((x_train.shape[0], 1), dtype=x_train.dtype)))
    self.train_array = self.train_array.astype('float64')
    # Apply the mapping function to the key values.
    self.mapping_function()
    # Sort the rows by the mapped value.
    self.train_array = self.train_array[self.train_array[:, 3].argsort()]
    # Initialise the dense array from the sorted mapped values.
    if self.init_dense_array() == -1:
        return -1, timer() - start_time
    end_time = timer()
    # '\n' was previously misspelled as '/n', which printed a literal "/n".
    print('\n build time %f' % (end_time - start_time))

    test_data_size = x_test.shape[0]
    print('\n In Lisabaseline.build evaluation %d data points' %
          (test_data_size))
    pred_y = np.array(
        [self.predict(x_test[i]) for i in range(test_data_size)])
    mse = metrics.mean_squared_error(y_test, pred_y)
    return mse, end_time - start_time
def train(self, x_train, y_train, x_test, y_test):
    """Fit a ``pwlf`` model whose breaks are located by the pretrained BFModel.

    The inputs are normalised with ``self._normalize``; a BFModel loaded from
    ``./pretrained/bfnet.model`` predicts per-sample values that
    ``self.find_locations`` turns into break locations (``self.indices``),
    which are then passed to ``fit_with_breaks``.

    Returns:
        tuple: ``(mse, seconds)`` where ``seconds`` covers break prediction
        plus the piecewise fit.
    """
    x_train, y_train, x_test, y_test = self._normalize(
        x_train, y_train, x_test, y_test)

    # Two-column training matrix: column 0 holds the key, column 1 the target.
    samples = np.zeros((len(x_train), 2))
    samples[:, 0] = x_train[:, 0]
    samples[:, 1] = y_train[:, 0]

    break_finder = BFModel(8)
    break_finder.load('./pretrained/bfnet.model')

    started = timer()
    betas = break_finder.predict(samples).reshape(y_train.shape[0], )
    self.indices = self.find_locations(betas)
    self.lrs = []
    self.model = pwlf.PiecewiseLinFit(samples[:, 0], samples[:, 1])
    self.model.fit_with_breaks(self.indices)
    finished = timer()

    predictions = np.array([self.predict(each) for each in x_test])
    return (metrics.mean_squared_error(y_test, predictions),
            finished - started)
# Collect the 5 nearest neighbours of every test point.
result.extend(
    tuple(get_knn(kd_tree, t, 5, dim, dist_sq_dim)) for t in test)
dis_grnd_truth = sklearn_kdtree(points, dim)
print(result, "result")
print(len(result))
# First element of each neighbour entry returned for the first test point.
list_result = [neighbour[0] for neighbour in result[0]]
print(list_result, 'list_result')
# print(t_end-t_start, "Time taken")
print(dis_grnd_truth[0], "dis_grnd_truth")
# squared=False is passed, so the printed value is the root of the MSE.
mse_error = mean_squared_error(dis_grnd_truth[0],
                               list_result,
                               squared=False)
print(mse_error, 'mse_error')
# def bench1():
#     kd_tree = make_kd_tree(points, dim)
#     for point in additional_points:
#         add_point(kd_tree, point, dim)
#     # result1.append(tuple(get_knn(kd_tree, [0] * dim, 1, dim, dist_sq_dim)))
#     for t in test:
#         result1.append(tuple(get_knn(kd_tree, t, 1, dim, dist_sq_dim)))
# def bench2():
#     all_points = points + additional_points
#     result2.append(tuple(get_knn_naive(all_points, [0] * dim, 8, dist_sq_dim)))
#     for t in test:
def calculate_error(self, X, y, alphas, betas):
    """Return the mean squared error between ``A @ alphas`` and ``y``.

    ``A`` is the second item returned by
    ``self._calculate_alphas(betas, X, y)``; the first item is discarded.
    """
    _unused, matrix = self._calculate_alphas(betas, X, y)
    predictions = matrix @ alphas
    return mean_squared_error(predictions, y)