def pwlf_test(x, y, breaks):
    my_pwlf = pwlf.PiecewiseLinFit(x, y)
    my_pwlf.fit(breaks + 1)
    yhat = my_pwlf.predict(x)
    vis(x, y, yhat, '-', title='pwlf')
    mse = mean_squared_error(y, yhat)
    print("pwlf mean square error is {}".format(mse))
def piecewise_test(x, y, breaks):
    my_pr = PiecewiseRegression(breaks)
    my_pr.fit(x, y)
    yhat = my_pr.predict(x)
    vis(x, y, yhat, '-', title='pr')
    mse = mean_squared_error(y, yhat)
    print("mypr mean square error is {}".format(mse))
 def train(self, x_train, y_train, x_test, y_test):
     x_train, y_train, x_test, y_test = self._normalize(
         x_train, y_train, x_test, y_test)
     # generating X
     data = np.zeros((len(x_train), 2))
     data[:, 0] = x_train[:, 0]
     data[:, 1] = y_train[:, 0]
     model = BFModel(8)
     model.load('./pretrained/bfnet.model')
     start_time = timer()
     predicted_betas = model.predict(data)
     predicted_betas = predicted_betas.reshape(y_train.shape[0],)
     indices = self.find_locations(predicted_betas)
     self.indices = indices
     self.lrs = []
     for i, xpos in enumerate(indices[1:]):
         # keep only the points between consecutive breakpoints indices[i] and xpos
         lr_train_data = data[(data[:, 0] > indices[i]) & (data[:, 0] < xpos)]
         lr = PolynomialRegression(1)
         lr.fit(lr_train_data[:, 0], lr_train_data[:, 1])
         print(lr)
         self.lrs.append(lr)
     end_time = timer()
     yhat = []
     for each in x_test:
         yhat.append(self.predict(each))
     yhat = np.array(yhat)
     mse = metrics.mean_squared_error(y_test, yhat)
     return mse, end_time - start_time
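
 # A plausible predict sketch for the per-segment regressors fitted above (assumption:
 # the project's actual predict may differ). It finds the breakpoint interval that
 # contains the key and delegates to that segment's PolynomialRegression.
 def predict_sketch(self, key):
     for i, upper in enumerate(self.indices[1:]):
         if key < upper:
             return self.lrs[i].predict(key)
     # keys beyond the last breakpoint fall back to the final segment
     return self.lrs[-1].predict(key)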
    def evaluate_range_query(self, test_range_query):
        data_size = test_range_query.shape[0]
        build_times = []
        mses = []
        for idx, model in enumerate(self.models):
            if (model.name == 'Scipy KD-Tree'):
                mses.append(0)
                build_times.append(0)
                continue
            start_time = timer()
            y_pred = np.array(
                self.predict_range_query(idx, test_range_query.iloc[0, :-1],
                                         test_range_query.iloc[-1, :-1]))
            end_time = timer()
            if (y_pred.shape[0] != data_size):
                print(
                    'Num of predicted entries in range query %d versus expected entries %d'
                    % (y_pred.shape[0], data_size))
                mse = -1

            else:
                ytrue = np.array(test_range_query.iloc[:, -1:])
                mse = metrics.mean_squared_error(np.sort(y_pred),
                                                 np.sort(ytrue))

            mses.append(mse)
            if (self.debug_print):
                print(
                    "{} model tested in {:.4f} seconds with mse {:.4f}".format(
                        model.name, end_time - start_time, mse))
            build_times.append(end_time - start_time)

        return mses, build_times
 def evaluate_point(self, test_data):
     data_size = test_data.shape[0]
     if self.debug_print:
         print("[Point Query] Evaluating {} datapoints".format(data_size))
     build_times = []
     mses = []
     for idx, model in enumerate(self.models):
         ys = []
         start_time = timer()
         for i in range(data_size):
             y = self.predict(idx, test_data.iloc[i, :-1])
             y = int(y // model.page_size)
             if self.sample_ratio:
                 y = y / self.sample_ratio
             ys.append(y)
             # print("Evaluating {}/{}".format(i, data_size), end='\r')
         end_time = timer()
         yhat = np.array(ys).reshape(-1, 1)
         ytrue = np.array(test_data.iloc[:, -1:])
         mse = metrics.mean_squared_error(yhat, ytrue)
         mses.append(mse)
         if self.debug_print:
             print(
                 "{} model tested in {:.4f} seconds with mse {:.4f}".format(
                     model.name, end_time - start_time, mse))
         build_times.append(end_time - start_time)
     return mses, build_times
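
 # Usage sketch (assumption: `test_data` is a pandas DataFrame whose last column holds
 # the true position and the remaining columns the key dimensions, matching the .iloc
 # slicing above; `benchmark` is a hypothetical instance of the enclosing class):
 #
 #     import pandas as pd
 #     test_df = pd.DataFrame({'key_x': xs, 'key_y': ys, 'position': positions})
 #     mses, query_times = benchmark.evaluate_point(test_df)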
 def train(self, x_train, y_train, x_test, y_test):
     start_time = timer()
     self.model.fit(x_train, y_train)
     end_time = timer()
     yhat = self.model.predict(x_test)
     mse = metrics.mean_squared_error(y_test, yhat)
     return mse, end_time - start_time
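
 # A usage sketch for the thin wrapper above (assumption: the enclosing class only
 # needs `self.model` to be a scikit-learn style regressor exposing fit/predict;
 # `index` below is a hypothetical instance of that class):
 #
 #     from sklearn.linear_model import LinearRegression
 #     index.model = LinearRegression()
 #     mse, fit_seconds = index.train(x_train, y_train, x_test, y_test)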
    def train(self, x_train, y_train, x_test, y_test):
        self.max_pos = np.max(y_train)
        train_data = (x_train, y_train)
        # a 2-d array indexed by [stage][model_id]
        train_datas = [[train_data]]
        start_time = timer()
        for stage in range(self.num_of_stages):
            number_unused_model = 0
            self.models.append([])
            for model_id in range(self.num_of_models[stage]):
                if train_datas[stage][model_id][0] is not None:
                    model = self._build_single_model(
                        self.model_types[stage], train_datas[stage][model_id])
                    self.models[stage].append(model)
                else:
                    self.models[stage].append(None)
            if stage != self.num_of_stages - 1:
                # if it is not the last stage
                # prepare dataset for the next stage
                # the next_xs and next_ys are two dimensional list
                # indexed by stage_id, model_id
                next_xs = [[] for i in range(self.num_of_models[stage + 1])]
                next_ys = [[] for i in range(self.num_of_models[stage + 1])]
                for index, key in enumerate(x_train):
                    model_id = self.get_staged_output(key, stage)
                    output = self.models[stage][model_id].predict(key)
                    selected_model_id = int(
                        output * self.num_of_models[stage + 1] / self.max_pos)
                    # in case selected_model_id is not in range
                    selected_model_id = self.acceptable_next_model(
                        selected_model_id, stage)
                    # print('selected model id: {}'.format(selected_model_id))
                    next_xs[selected_model_id].append(key)
                    next_ys[selected_model_id].append(y_train[index])

                # prepare data accordingly
                train_datas.append([])  # one list of training sets for stage + 1
                for next_model_id in range(self.num_of_models[stage + 1]):
                    if len(next_xs[next_model_id]) != 0:
                        dataset = (next_xs[next_model_id],
                                   next_ys[next_model_id])
                        train_datas[stage + 1].append(dataset)
                    else:
                        # there is no x and y allocated
                        # by default, give it all the training data
                        number_unused_model = number_unused_model + 1

                        # print("[WARN] The model {}-{} is not given any data".
                        #       format(stage + 1, next_model_id))
                        train_datas[stage + 1].append((None, None))
                print("unused model at stage {}: {}".format(
                    stage + 1, number_unused_model))
        end_time = timer()

        y_pred = []
        for each in x_test:
            y_pred.append(self.predict(each))
        mse = metrics.mean_squared_error(y_test, y_pred)
        return mse, end_time - start_time
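
    # A plausible lookup sketch matching the staged training above (assumption: the
    # project's real predict / acceptable_next_model may differ). Each stage's output,
    # scaled by max_pos, picks the model consulted at the next stage; a production
    # version also needs a fallback for models that received no training data (None).
    def predict_sketch(self, key):
        model_id = 0
        for stage in range(self.num_of_stages):
            output = self.models[stage][model_id].predict(key)
            if stage == self.num_of_stages - 1:
                return output  # the last stage predicts the position directly
            model_id = int(output * self.num_of_models[stage + 1] / self.max_pos)
            # clamp into a valid model index (the project uses acceptable_next_model)
            model_id = min(max(model_id, 0), self.num_of_models[stage + 1] - 1)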
    def train(self, x_train, y_train, x_test, y_test):

        x_train, y_train, x_test, y_test = self._normalize(
            x_train, y_train, x_test, y_test)
        start_time = timer()
        self.net.fit(x_train, y_train, epochs=self.epochs, batch_size=10)
        end_time = timer()

        y_hat = self.net.predict(x_test)
        mse = metrics.mean_squared_error(y_test, y_hat)
        return mse, end_time - start_time
    def train(self, x_train, y_train, x_test, y_test, dim=2):

        #Build kd tree with train data
        data_train = np.hstack((x_train, y_train))
        data_train = data_train.tolist()
        build_time = self.build_kd_tree(data_train)

        # search points kd tree with test data
        y_predict_test = []
        for key in x_test:
            nearest = self.get_nearest(key, dim=dim)
            y_predict_test.append(nearest[1][-1])

        y_predict_test = np.array(y_predict_test)
        mse = mean_squared_error(y_test, y_predict_test)

        return mse, build_time
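
# For comparison, a minimal sketch of the same nearest-neighbour lookup with SciPy's
# cKDTree (assumption: x_train/x_test are 2-D arrays of keys and y_train/y_test hold
# the target positions, as in the method above).
import numpy as np
from scipy.spatial import cKDTree
from sklearn.metrics import mean_squared_error

def kdtree_mse_sketch(x_train, y_train, x_test, y_test):
    tree = cKDTree(x_train)
    _, nearest_idx = tree.query(x_test, k=1)  # index of the closest training key
    y_pred = np.asarray(y_train).reshape(-1)[nearest_idx]
    return mean_squared_error(np.asarray(y_test).reshape(-1), y_pred)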
    def evaluate_knn_query(self, query, ytrue, k):

        if self.debug_print:
            print("[Point Query %d %d]  Evaluating %d neighbours" %
                  (query[0], query[1], k))
        build_times = []
        mses = []
        for idx, model in enumerate(self.models):
            if model.name in ('Scipy KD-Tree', 'Lisa Baseline'):
                mses.append(0)
                build_times.append(0)
                continue
            start_time = timer()
            y_pred = np.array(self.predict_knn_query(idx, query, k))
            end_time = timer()
            ytrue = np.squeeze(ytrue)
            if y_pred.shape[0] != ytrue.shape[0]:
                print(
                    f'Num of predicted entries in knn query {np.squeeze(y_pred).shape[0]} versus expected {ytrue.shape[0]} entries'
                )
                mse = -1

            else:
                yhat = np.array(y_pred).reshape(-1, 1)
                mse = metrics.mean_squared_error(yhat, ytrue)

                if (mse != 0):
                    print(yhat)
                    print('\n\n\n\n')
                    print(ytrue)
                    for i in range(ytrue.shape[0]):
                        if (yhat[i] != ytrue[i]):
                            print(' Predicted y %d Expected y %d' %
                                  (yhat[i], ytrue[i]))

            mses.append(mse)
            if self.debug_print:
                print(
                    "{} model tested in {:.4f} seconds with mse {:.4f}".format(
                        model.name, end_time - start_time, mse))
            build_times.append(end_time - start_time)

        return mses, build_times
    def train(self, x_train, y_train, x_test, y_test):
        self.total_data_size = x_train.shape[0]
        x, y = (list(t) for t in zip(*sorted(zip(x_train, y_train))))
        start_time = timer()
        for i in range(self.total_data_size):
            self.btree.insert(Item(x[i], y[i]))
            print('{}/{} inserted into B-Tree'.format(i, self.total_data_size),
                  end='\r')
        end_time = timer()
        test_data_size = x_test.shape[0]
        pred_y = []
        for i in range(test_data_size):
            pred_y.append(
                self.btree.search(x_test[i])[2].value // self.page_size)
            print('{}/{} tested B-Tree'.format(i, test_data_size), end='\r')

        pred_y = np.array(pred_y)
        mse = metrics.mean_squared_error(y_test, pred_y)
        return mse, end_time - start_time
    def train(self, x_train, y_train, x_test, y_test):

        #Build kd tree with train data
        start_time = timer()
        self.build(x_train)
        end_time = timer()
        build_time = end_time - start_time
        self.y_train = y_train
        self.x_train = x_train

        mse = 0.0
        y_predict_test = []
        # data_test=np.hstack((x_test, y_test))
        for key in x_test:
            pred = self.predict(key)
            y_predict_test.append(pred)
        mse = metrics.mean_squared_error(y_test, y_predict_test)

        return mse, build_time
    def train(self, x_train, y_train, x_test, y_test):

        print(x_train.shape)
        print(x_test.shape)
        print(y_train.shape)
        print(y_test.shape)

        np.set_printoptions(threshold=1000)
        start_time = timer()
        self.train_array = np.hstack((x_train, y_train.reshape(-1, 1),
                                      np.zeros((x_train.shape[0], 1),
                                               dtype=x_train.dtype)))
        self.train_array = self.train_array.astype('float64')
        # Apply the mapping function to the 2-dimensional key values
        self.mapping_function()

        # Sort the input data array with mapped values
        self.train_array = self.train_array[self.train_array[:, 3].argsort()]
        #self.plot_function(in_data_arr)

        # Initialize the dense array with sorted mapped values (store first and last key per page)
        if (self.init_dense_array() == -1):
            return -1, timer() - start_time

        end_time = timer()
        print('\n build time %f' % (end_time - start_time))
        test_data_size = x_test.shape[0]
        pred_y = []
        #for i in range(20):
        print('\n In Lisabaseline.build: evaluating %d data points' % test_data_size)
        for i in range(test_data_size):
            pred_y.append(self.predict(x_test[i]))

        pred_y = np.array(pred_y)
        mse = metrics.mean_squared_error(y_test, pred_y)
        return mse, end_time - start_time
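
    # An illustrative mapping sketch only (assumption: the project's real
    # mapping_function is more involved; this simplified stand-in just combines
    # min-max scaled coordinates into one sortable value written to column 3).
    def mapping_function_sketch(self):
        keys = self.train_array[:, :2]
        lo, hi = keys.min(axis=0), keys.max(axis=0)
        scaled = (keys - lo) / np.where(hi > lo, hi - lo, 1.0)
        # weight the first coordinate so the ordering is roughly lexicographic
        self.train_array[:, 3] = scaled[:, 0] * 1e3 + scaled[:, 1]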
 def train(self, x_train, y_train, x_test, y_test):
     x_train, y_train, x_test, y_test = self._normalize(
         x_train, y_train, x_test, y_test)
     # generating X
     data = np.zeros((len(x_train), 2))
     data[:, 0] = x_train[:, 0]
     data[:, 1] = y_train[:, 0]
     model = BFModel(8)
     model.load('./pretrained/bfnet.model')
     start_time = timer()
     predicted_betas = model.predict(data)
     predicted_betas = predicted_betas.reshape(y_train.shape[0], )
     indices = self.find_locations(predicted_betas)
     self.indices = indices
     self.lrs = []
     self.model = pwlf.PiecewiseLinFit(data[:, 0], data[:, 1])
     self.model.fit_with_breaks(indices)
     end_time = timer()
     yhat = []
     for each in x_test:
         yhat.append(self.predict(each))
     yhat = np.array(yhat)
     mse = metrics.mean_squared_error(y_test, yhat)
     return mse, end_time - start_time
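
 # Note on the call above: `fit_with_breaks(indices)` only solves the least-squares
 # problem for the given breakpoint locations, whereas `fit(n_segments)` also searches
 # for the breakpoint positions and is much slower. A standalone sketch (assumption:
 # pwlf and numpy are installed; the breakpoints must include both end points):
 #
 #     import numpy as np, pwlf
 #     x = np.linspace(0, 10, 200)
 #     y = np.where(x < 5, 2 * x, 10 + 0.5 * (x - 5))
 #     model = pwlf.PiecewiseLinFit(x, y)
 #     model.fit_with_breaks([x.min(), 5.0, x.max()])
 #     yhat = model.predict(x)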
    for t in test:
        result.append(tuple(get_knn(kd_tree, t, 5, dim, dist_sq_dim)))

    dis_grnd_truth = sklearn_kdtree(points, dim)

    print(result, "result")
    list_result = []
    print(len(result))
    for i in range(len(result[0])):
        list_result.append(result[0][i][0])

    print(list_result, 'list_result')
    # print(t_end-t_start, "Time taken")
    print(dis_grnd_truth[0], "dis_grnd_truth")
    # note: with squared=False scikit-learn returns the RMSE rather than the MSE
    mse_error = mean_squared_error(dis_grnd_truth[0],
                                   list_result,
                                   squared=False)
    print(mse_error, 'mse_error')

# def bench1():
#     kd_tree = make_kd_tree(points, dim)
#     for point in additional_points:
#         add_point(kd_tree, point, dim)
#     # result1.append(tuple(get_knn(kd_tree, [0] * dim, 1, dim, dist_sq_dim)))
#     for t in test:
#         result1.append(tuple(get_knn(kd_tree, t, 1, dim, dist_sq_dim)))

# def bench2():
#     all_points = points + additional_points
#     result2.append(tuple(get_knn_naive(all_points, [0] * dim, 8, dist_sq_dim)))
#     for t in test:
 def calculate_error(self, X, y, alphas, betas):
     _, A = self._calculate_alphas(betas, X, y)
     yhat = A @ alphas
     mse = mean_squared_error(y, yhat)
     return mse