Exemplo n.º 1
0
    def setUp(self):
        """Build the randomized fixtures used by the histogram tests.

        Creates 1000 sparse instances with 10 feature slots each, a matching
        list of 1000 (gradient, hessian) pairs, and per-feature split points,
        and pushes the instance and gradient lists through
        ``eggroll.parallelize``.
        """
        self.feature_histogram = FeatureHistogram()
        eggroll.init("test_feature_histogram")

        samples = []
        for _ in range(1000):
            # One draw per feature slot; a draw of 0 means the slot is
            # left out of the sparse vector.
            draws = [random.randint(0, 5) for _ in range(10)]
            kept = [(slot, value) for slot, value in enumerate(draws) if value != 0]
            indices = [slot for slot, _ in kept]
            data = [value for _, value in kept]

            instance = Instance(features=SparseVector(indices, data, shape=10))
            # Second tuple element is (1, key); key is one of node_map's keys.
            node_key = random.randint(0, 3)
            samples.append((instance, (1, node_key)))

        self.node_map = {key: key for key in range(4)}
        self.data_insts = samples
        self.data_bin = eggroll.parallelize(samples, include_key=False)

        grad_hess_pairs = []
        for _ in range(1000):
            grad = random.random()
            hess = random.random()
            grad_hess_pairs.append((grad, hess))
        self.grad_and_hess_list = grad_hess_pairs
        self.grad_and_hess = eggroll.parallelize(self.grad_and_hess_list, include_key=False)

        # Every one of the 10 features gets the same split points 0..4.
        self.bin_split_points = np.array([np.array(list(range(5))) for _ in range(10)])
        self.bin_sparse = [0] * 10
Exemplo n.º 2
0
 def get_histograms(self, node_map=None):
     """Compute the node histograms and return their accumulated form.

     The instance's dispatched binned data, gradients/hessians, split
     points, sparse points and valid features are handed to
     ``FeatureHistogram.calculate_histogram`` together with ``node_map``;
     the result is then passed through
     ``FeatureHistogram.accumulate_histogram``.

     Args:
         node_map: Mapping forwarded unchanged to ``calculate_histogram``
             (presumably node id -> histogram index; confirm against
             FeatureHistogram). Defaults to a new empty dict.

     Returns:
         The value returned by ``FeatureHistogram.accumulate_histogram``.
     """
     # A ``{}`` default is built once at definition time and shared by every
     # call; create a fresh dict per call instead.
     if node_map is None:
         node_map = {}

     LOGGER.info("start to get node histograms")
     histograms = FeatureHistogram.calculate_histogram(
         self.data_bin_with_node_dispatch, self.grad_and_hess,
         self.bin_split_points, self.bin_sparse_points, self.valid_features,
         node_map)
     acc_histograms = FeatureHistogram.accumulate_histogram(histograms)
     return acc_histograms
Exemplo n.º 3
0
 def get_histograms(self, node_map=None):
     """Compute the node histograms and return their accumulated form.

     The instance's positioned binned data, gradients/hessians, split
     points, sparse points, valid features and missing-value flags are
     handed to ``FeatureHistogram.calculate_histogram`` together with
     ``node_map``; the result is then passed through
     ``FeatureHistogram.accumulate_histogram``.

     Args:
         node_map: Mapping forwarded unchanged to ``calculate_histogram``
             (presumably node id -> histogram index; confirm against
             FeatureHistogram). Defaults to a new empty dict.

     Returns:
         The value returned by ``FeatureHistogram.accumulate_histogram``.
     """
     # A ``{}`` default is built once at definition time and shared by every
     # call; create a fresh dict per call instead.
     if node_map is None:
         node_map = {}

     LOGGER.info("start to get node histograms")
     histograms = FeatureHistogram.calculate_histogram(
         self.data_bin_with_position, self.grad_and_hess,
         self.bin_split_points, self.bin_sparse_points, self.valid_features,
         node_map, self.use_missing, self.zero_as_missing)
     LOGGER.info("begin to accumulate histograms")
     acc_histograms = FeatureHistogram.accumulate_histogram(histograms)
     LOGGER.info("acc histogram shape is {}".format(len(acc_histograms)))
     return acc_histograms
Exemplo n.º 4
0
 def get_histograms(self, node_map=None):
     """Compute the node histograms and return their accumulated form.

     The instance's positioned binned data, gradients/hessians, split
     points, sparse points and valid features are handed to
     ``FeatureHistogram.calculate_histogram`` together with ``node_map``;
     the result is then passed through
     ``FeatureHistogram.accumulate_histogram``.

     Args:
         node_map: Mapping forwarded unchanged to ``calculate_histogram``
             (presumably node id -> histogram index; confirm against
             FeatureHistogram). Defaults to a new empty dict.

     Returns:
         The value returned by ``FeatureHistogram.accumulate_histogram``.
     """
     # A ``{}`` default is built once at definition time and shared by every
     # call; create a fresh dict per call instead.
     if node_map is None:
         node_map = {}

     LOGGER.info("start to get node histograms")
     histograms = FeatureHistogram.calculate_histogram(
         self.data_bin_with_position, self.grad_and_hess,
         self.bin_split_points, self.bin_sparse_points, self.valid_features,
         node_map)
     LOGGER.info("begin to accumulate histograms")
     acc_histograms = FeatureHistogram.accumulate_histogram(histograms)
     LOGGER.info("acc histogram shape is {}".format(len(acc_histograms)))
     return acc_histograms
    def get_left_node_local_histogram(self, cur_nodes: List[Node], tree: List[Node], g_h, table_with_assign,
                            split_points, sparse_point, valid_feature):
        """Build one HistogramBag per histogram computed for the left nodes.

        A node map restricted to left nodes is obtained from
        ``self.get_node_map`` and used to compute histograms; each result is
        wrapped in a ``HistogramBag`` and tagged with the id and parent id of
        the corresponding node. Nodes are selected from ``cur_nodes`` when
        ``is_left_node`` is truthy or their id is 0, and are paired with the
        bags positionally.

        Note: ``tree`` is accepted but not used in this method.

        Returns:
            The list of ``HistogramBag`` objects.
        """
        node_map = self.get_node_map(cur_nodes, left_node_only=True)

        LOGGER.info("start to get node histograms")
        raw_histograms = FeatureHistogram.calculate_histogram(
            table_with_assign,
            g_h,
            split_points,
            sparse_point,
            valid_feature,
            node_map,
            self.use_missing,
            self.zero_as_missing,
        )

        hist_bags = [HistogramBag(entry) for entry in raw_histograms]
        left_nodes = [
            candidate
            for candidate in cur_nodes
            if candidate.is_left_node or candidate.id == 0
        ]

        # Tag each bag with its own histogram id and its parent's id.
        for left_node, bag in zip(left_nodes, hist_bags):
            bag.hid = left_node.id
            bag.p_hid = left_node.parent_nodeid

        return hist_bags
    def get_histograms(self, node_map=None):
        """Compute the node histograms, requesting the ``"tb"`` return form.

        The instance's positioned binned data, gradients/hessians, split
        points, sparse points, valid features and missing-value flags are
        handed to ``FeatureHistogram.calculate_histogram`` together with
        ``node_map`` and ``ret="tb"``.

        Args:
            node_map: Mapping forwarded unchanged to ``calculate_histogram``
                (presumably node id -> histogram index; confirm against
                FeatureHistogram). Defaults to a new empty dict.

        Returns:
            The value returned by ``FeatureHistogram.calculate_histogram``.
        """
        # A ``{}`` default is built once at definition time and shared by
        # every call; create a fresh dict per call instead.
        if node_map is None:
            node_map = {}

        LOGGER.info("start to get node histograms")
        acc_histograms = FeatureHistogram.calculate_histogram(
            self.data_bin_with_position,
            self.grad_and_hess,
            self.bin_split_points,
            self.bin_sparse_points,
            self.valid_features,
            node_map,
            self.use_missing,
            self.zero_as_missing,
            ret="tb")

        return acc_histograms
    def get_local_histogram(self, cur_to_split: List[Node], g_h, table_with_assign,
                            split_points, sparse_point, valid_feature):
        """Compute histograms for the nodes to split and wrap each in a HistogramBag.

        The node map comes from ``self.get_node_map(nodes=cur_to_split)`` and
        is passed, with the other arguments and the instance's missing-value
        flags, to ``FeatureHistogram.calculate_histogram``.

        Returns:
            A list with one ``HistogramBag`` per element of the computed
            histograms, in the same order.
        """
        LOGGER.info("start to get node histograms")
        node_map = self.get_node_map(nodes=cur_to_split)
        raw_histograms = FeatureHistogram.calculate_histogram(
            table_with_assign,
            g_h,
            split_points,
            sparse_point,
            valid_feature,
            node_map,
            self.use_missing,
            self.zero_as_missing,
        )

        return [HistogramBag(entry) for entry in raw_histograms]
Exemplo n.º 8
0
class TestFeatureHistogram(unittest.TestCase):
    """Unit tests for FeatureHistogram's calculate / accumulate / aggregate routines."""

    def setUp(self):
        """Build the randomized fixtures shared by the tests.

        Creates 1000 sparse instances with 10 feature slots each, a matching
        list of 1000 (gradient, hessian) pairs, and per-feature split points,
        and pushes the instance and gradient lists through
        ``eggroll.parallelize``.
        """
        self.feature_histogram = FeatureHistogram()
        eggroll.init("test_feature_histogram")
        data_insts = []
        for _ in range(1000):
            indices = []
            data = []
            for j in range(10):
                # A draw of 0 means the slot is left out of the sparse vector.
                x = random.randint(0, 5)
                if x != 0:
                    data.append(x)
                    indices.append(j)
            sparse_vec = SparseVector(indices, data, shape=10)
            # Second tuple element is (1, key); key is looked up in node_map.
            data_insts.append((Instance(features=sparse_vec), (1, random.randint(0, 3))))
        self.node_map = {0: 0, 1: 1, 2: 2, 3: 3}
        self.data_insts = data_insts
        self.data_bin = eggroll.parallelize(data_insts, include_key=False)

        self.grad_and_hess_list = [(random.random(), random.random()) for _ in range(1000)]
        self.grad_and_hess = eggroll.parallelize(self.grad_and_hess_list, include_key=False)

        # Every one of the 10 features gets the same split points 0..4.
        self.bin_split_points = np.array([np.array(list(range(5))) for _ in range(10)])
        self.bin_sparse = [0] * 10

    def test_accumulate_histogram(self):
        """accumulate_histogram must equal a running sum along the third axis."""
        data = [[[[random.randint(0, 10) for i in range(2)]
                  for j in range(3)]
                 for k in range(4)]
                for r in range(5)]
        # Deep copy so the reference prefix sums below start from untouched data.
        histograms = self.feature_histogram.accumulate_histogram(copy.deepcopy(data))
        for i, outer in enumerate(data):
            for j, series in enumerate(outer):
                for k in range(1, len(series)):
                    for r in range(len(series[k])):
                        series[k][r] += series[k - 1][r]
                        self.assertEqual(series[k][r], histograms[i][j][k][r])

    def test_calculate_histogram(self):
        """calculate_histogram must match a hand-built per-node/feature/bin tally."""
        histograms = self.feature_histogram.calculate_histogram(
            self.data_bin, self.grad_and_hess,
            self.bin_split_points, self.bin_sparse,
            node_map=self.node_map)

        # Expected tally indexed [node][feature][bin] -> [sum grad, sum hess, count].
        his2 = [[[[0 for i in range(3)]
                  for j in range(6)]
                 for k in range(10)]
                for r in range(4)]
        for i in range(1000):
            grad, hess = self.grad_and_hess_list[i]
            instance, (_, node_key) = self.data_insts[i]
            node_idx = self.node_map[node_key]
            for fid, bid in instance.features.get_all_data():
                cell = his2[node_idx][fid][bid]
                cell[0] += grad
                cell[1] += hess
                cell[2] += 1

        for i, per_node in enumerate(his2):
            for j, per_feature in enumerate(per_node):
                for k, cell in enumerate(per_feature):
                    for r, expected in enumerate(cell):
                        self.assertLess(
                            np.fabs(expected - histograms[i][j][k][r]),
                            consts.FLOAT_ZERO)

    def test_aggregate_histogram(self):
        """aggregate_histogram must equal the element-wise sum of its inputs."""
        data1 = [[[[random.randint(0, 10) for i in range(2)]
                   for j in range(3)]
                  for k in range(4)]
                 for r in range(5)]

        data2 = [[[[random.randint(0, 10) for i in range(2)]
                   for j in range(3)]
                  for k in range(4)]
                 for r in range(5)]

        agg_histograms = self.feature_histogram.aggregate_histogram(data1, data2)
        for i in range(len(data1)):
            for j in range(len(data1[i])):
                for k in range(len(data1[i][j])):
                    for r in range(len(data1[i][j][k])):
                        data1[i][j][k][r] += data2[i][j][k][r]
                        self.assertEqual(data1[i][j][k][r], agg_histograms[i][j][k][r])