示例#1
0
def tree_learning(args):
    """Re-assign items to tree nodes, advancing ``args.gap`` layers per round.

    The tree is loaded from ``args.tree_name`` / ``args.tree_path``. A
    manager-backed shared dict is seeded with ``get_pi_relation`` of every
    leaf id against layer 0, then ``mp_run`` is invoked once per round with
    the codes of the current source layer. The final mapping is printed.

    Args:
        args: Object exposing ``tree_name``, ``tree_path`` and ``gap``; it is
            also forwarded unchanged to ``mp_run``.
    """
    tree = TreeIndex(args.tree_name, args.tree_path)
    step = args.gap

    deepest_layer = tree.height() - 1
    target_layer = step

    # target_layer == step here, so the initial relation is taken at layer 0.
    leaf_ids = [leaf.id() for leaf in tree.get_all_leafs()]
    initial_relation = tree.get_pi_relation(leaf_ids, target_layer - step)

    # Shared mapping handed to mp_run; presumably updated by the workers
    # running `process` -- confirm against mp_run/process.
    shared_relation = mp.Manager().dict()
    shared_relation.update(initial_relation)

    # Release the local copies once the shared dict holds the data.
    del leaf_ids
    del initial_relation

    while step > 0:
        source_layer = target_layer - step
        print("begin to re-assign {} layer by {} layer.".format(
            target_layer, source_layer))
        layer_codes = tree.get_layer_codes(source_layer)
        mp_run(layer_codes, 12, process, shared_relation, tree, target_layer,
               step, args)
        # Shrink the last step so target_layer never passes the deepest
        # layer; the step reaches 0 (ending the loop) once it is reached.
        step = min(step, deepest_layer - target_layer)
        target_layer += step
    print(shared_relation)
示例#2
0
def load_tree_info(name, path, topk=200):
    """Load a tree index and build lookup tables between node ids and codes.

    Args:
        name: Tree name passed to ``TreeIndex``.
        path: Tree file path passed to ``TreeIndex``.
        topk: Size threshold used to pick ``first_layer``: the first layer
            (scanning from layer 0) holding more than ``topk`` codes.

    Returns:
        A tuple ``(id_code_map, code_id_map, branch, first_layer)``:
        ``id_code_map`` maps node id -> code and ``code_id_map`` maps
        code -> node id for every code of every layer, ``branch`` is
        ``tree.branch()``, and ``first_layer`` is the list of node ids of
        the selected layer.
    """
    tree = TreeIndex(name, path)
    all_codes = []
    first_layer_code = None
    for layer in range(tree.height()):
        layer_codes = tree.get_layer_codes(layer)
        # Remember only the first layer that exceeds the threshold.
        if first_layer_code is None and len(layer_codes) > topk:
            first_layer_code = layer_codes
        all_codes += layer_codes

    all_nodes = tree.get_nodes(all_codes)
    id_code_map = {}
    code_id_map = {}
    # Indexing (rather than zip) keeps the original failure mode if
    # get_nodes ever returns fewer nodes than codes.
    for index, code in enumerate(all_codes):
        node_id = all_nodes[index].id()
        id_code_map[node_id] = code
        code_id_map[code] = node_id
    print(len(all_codes), len(all_nodes), len(id_code_map), len(code_id_map))

    # NOTE(review): when no layer holds more than `topk` codes,
    # first_layer_code is still None at this point and is passed to
    # get_nodes as-is -- confirm how TreeIndex handles that.
    first_layer = [node.id() for node in tree.get_nodes(first_layer_code)]

    return id_code_map, code_id_map, tree.branch(), first_layer
    def test_tree_index(self):
        """Exercise the TreeIndex query API against the downloaded mini tree.

        Checks the tree metadata (height, branch, node count, embedding
        size) and the consistency between layer codes, leaf nodes, travel
        paths, ancestors, pi-relation and children lookups.
        """
        path = download(
            "https://paddlerec.bj.bcebos.com/tree-based/data/mini_tree.pb",
            "tree_index_unittest", "e2ba4561c2e9432b532df40546390efa")
        # Disabled alternative download (same URL, different md5):
        # path = download(
        #     "https://paddlerec.bj.bcebos.com/tree-based/data/mini_tree.pb",
        #     "tree_index_unittest", "cadec20089f5a8a44d320e117d9f9f1a")
        tree = TreeIndex("demo", path)
        height = tree.height()
        branch = tree.branch()
        # assertEqual reports both values on failure, unlike assertTrue(a == b).
        self.assertEqual(height, 5)
        self.assertEqual(branch, 2)
        self.assertEqual(tree.total_node_nums(), 25)
        self.assertEqual(tree.emb_size(), 30)

        # get_layer_codes: collect the codes and node ids of every layer.
        layer_node_ids = []
        layer_node_codes = []
        for i in range(tree.height()):
            layer_node_codes.append(tree.get_layer_codes(i))
            layer_node_ids.append(
                [node.id() for node in tree.get_nodes(layer_node_codes[-1])])

        # The leaf ids must match the ids of the last layer (compared by sum).
        all_leaf_ids = [node.id() for node in tree.get_all_leafs()]
        self.assertEqual(sum(all_leaf_ids), sum(layer_node_ids[-1]))

        # get_travel: entry i of the travel path must belong to layer
        # height - 1 - i, i.e. the path is listed from the leaf upwards.
        travel_codes = tree.get_travel_codes(all_leaf_ids[0])
        travel_ids = [node.id() for node in tree.get_nodes(travel_codes)]

        for i in range(height):
            self.assertIn(travel_ids[i], layer_node_ids[height - 1 - i])
            self.assertIn(travel_codes[i], layer_node_codes[height - 1 - i])

        # get_ancestor: the ancestor at layer height - 2 is the second entry
        # of the travel path.
        ancestor_codes = tree.get_ancestor_codes([all_leaf_ids[0]], height - 2)
        ancestor_ids = [node.id() for node in tree.get_nodes(ancestor_codes)]

        self.assertEqual(ancestor_ids[0], travel_ids[1])
        self.assertEqual(ancestor_codes[0], travel_codes[1])

        # get_pi_relation: maps the leaf id to the same ancestor code.
        pi_relation = tree.get_pi_relation([all_leaf_ids[0]], height - 2)
        self.assertEqual(pi_relation[all_leaf_ids[0]], ancestor_codes[0])

        # get_travel_path: the path between both ends of the travel, which
        # excludes the final element (re-appended below for comparison).
        travel_path_codes = tree.get_travel_path(travel_codes[0],
                                                 travel_codes[-1])
        travel_path_ids = [
            node.id() for node in tree.get_nodes(travel_path_codes)
        ]

        # assertEquals is a deprecated alias that was removed in Python 3.12.
        self.assertEqual(travel_path_ids + [travel_ids[-1]], travel_ids)
        self.assertEqual(travel_path_codes + [travel_codes[-1]], travel_codes)

        # get_children: the leaf must be among its parent's children at the
        # last layer.
        children_codes = tree.get_children_codes(travel_codes[1], height - 1)
        children_ids = [node.id() for node in tree.get_nodes(children_codes)]
        self.assertIn(all_leaf_ids[0], children_ids)
示例#4
0
    return np.array(fluid.global_scope().find_var("TDM_Tree_Emb").get_tensor())


if __name__ == '__main__':
    # Usage: <script> <config.yaml> <arg forwarded to get_emb_numpy> <output>
    # The helper module `common` lives under tools/utils/static_ps, located
    # three directory levels above the current working directory.
    utils_path = "{}/tools/utils/static_ps".format(
        os.path.dirname(os.path.dirname(os.path.dirname(os.getcwd()))))
    sys.path.append(utils_path)
    print(utils_path)
    # Imported here because it is only resolvable after sys.path is extended.
    import common

    yaml_helper = common.YamlHelper()
    config = yaml_helper.load_yaml(sys.argv[1])

    tree_name = config.get("hyper_parameters.tree_name")
    tree_path = config.get("hyper_parameters.tree_path")
    tree_node_num = config.get("hyper_parameters.sparse_feature_num")
    node_emb_size = config.get("hyper_parameters.node_emb_size")

    tensor = get_emb_numpy(tree_node_num, node_emb_size, sys.argv[2])

    tree = TreeIndex(tree_name, tree_path)
    all_leafs = tree.get_all_leafs()

    # One CSV line per leaf: "<node_id>,<emb_0>,<emb_1>,...".
    with open(sys.argv[3], 'w') as fout:
        for node in all_leafs:
            node_id = node.id()
            # Build a real list: on Python 3 map() returns an iterator, and
            # `[str(node_id)] + map(...)` raises TypeError.
            emb_vec = [str(node_id)]
            emb_vec.extend(str(value) for value in tensor[node_id].tolist())
            fout.write(",".join(emb_vec))
            fout.write("\n")
def test_layerwise_sampler():
    """Check ``TreeIndex.layerwise_sample`` on the downloaded demo tree.

    Two (features, label) samples are drawn, once with ``with_hierarchy``
    False and once with it True. For every sampled layer, walking from the
    deepest layer upwards, the result must contain ``count + 1`` rows: the
    first one labelled 1 and holding the label's path node for that layer,
    the remaining ones labelled 0 and holding a different node of the same
    layer. With hierarchy enabled the two feature columns are expected to
    hold the features' own path nodes instead of the raw feature ids.
    """
    path = download(
        "https://paddlerec.bj.bcebos.com/tree-based/data/demo_tree.pb",
        "tree_index_unittest", "cadec20089f5a8a44d320e117d9f9f1a")
    tree = TreeIndex("demo", path)

    # Node ids of every layer, indexed by layer number.
    layer_nodes = []
    for i in range(tree.height()):
        layer_codes = tree.get_layer_codes(i)
        layer_nodes.append(
            [node.id() for node in tree.get_nodes(layer_codes)])

    start_sample_layer = 1
    seed = 0
    sample_layers = tree.height() - start_sample_layer
    # Requested negative-sample counts 1, 2, 3, ... for the sampled layers.
    sample_num = range(1, 10000)[:sample_layers]
    # Pad with 1 for any layer the requested counts do not cover.
    layer_sample_counts = list(sample_num) + [1] * (sample_layers -
                                                    len(sample_num))
    # Each layer contributes its negative count plus one positive row.
    total_sample_num = sum(layer_sample_counts) + len(layer_sample_counts)
    tree.init_layerwise_sampler(sample_num, start_sample_layer, seed)

    ids = [315757, 838060, 1251533, 403522, 2473624, 3321007]
    # Node ids along each item's travel path, as returned by
    # get_travel_codes(item, start_sample_layer).
    parent_path = {}
    for item_id in ids:
        codes = tree.get_travel_codes(item_id, start_sample_layer)
        parent_path[item_id] = [node.id() for node in tree.get_nodes(codes)]

    def check_sample_rows(rows, idx, features, label, with_hierarchy):
        """Verify the rows of one sample starting at ``idx``; return next idx."""
        positive_path = parent_path[label]
        layer = tree.height() - 1
        for i in range(len(layer_sample_counts)):
            # Counts are consumed back to front while walking the layers
            # from the deepest one upwards.
            rows_in_layer = layer_sample_counts[-(i + 1)] + 1
            for j in range(rows_in_layer):
                row = rows[idx + j]
                for col, feature in enumerate(features):
                    expected = (parent_path[feature][i]
                                if with_hierarchy else feature)
                    assert row[col] == expected
                assert row[2] in layer_nodes[layer]
                if j == 0:
                    # Positive row: label 1 and the label's own path node.
                    assert row[3] == 1
                    assert row[2] == positive_path[i]
                else:
                    # Negative rows: label 0 and any other node.
                    assert row[3] == 0
                    assert row[2] != positive_path[i]
            idx += rows_in_layer
            layer -= 1
        return idx

    features = [[315757, 838060], [1251533, 403522]]
    labels = [2473624, 3321007]
    for with_hierarchy in (False, True):
        sample_res = tree.layerwise_sample(features, labels, with_hierarchy)
        idx = 0
        for n, (feature_pair, label) in enumerate(zip(features, labels), 1):
            idx = check_sample_rows(sample_res, idx, feature_pair, label,
                                    with_hierarchy)
            # Every sample occupies exactly total_sample_num rows.
            assert idx == total_sample_num * n
def test_tree_index():
    """Exercise the TreeIndex query API against the downloaded demo tree.

    Checks the tree metadata (height 14, branch 2, 15581 nodes, embedding
    size 5171136) and the consistency between leaf nodes, travel paths,
    ancestors, pi-relation and travel-path lookups.
    """
    path = download(
        "https://paddlerec.bj.bcebos.com/tree-based/data/demo_tree.pb",
        "tree_index_unittest", "cadec20089f5a8a44d320e117d9f9f1a")
    tree = TreeIndex("demo", path)
    height = tree.height()
    branch = tree.branch()
    assert height == 14
    print("height is equal 14")
    assert branch == 2
    assert tree.total_node_nums() == 15581
    assert tree.emb_size() == 5171136

    # Collect the codes and node ids of every layer.
    layer_node_ids = []
    layer_node_codes = []
    for i in range(tree.height()):
        layer_node_codes.append(tree.get_layer_codes(i))
        layer_node_ids.append(
            [node.id() for node in tree.get_nodes(layer_node_codes[-1])])

    # The leaf ids must match the ids of the last layer (compared by sum).
    all_leaf_ids = [node.id() for node in tree.get_all_leafs()]
    assert sum(all_leaf_ids) == sum(layer_node_ids[-1])
    # get_travel
    travel_codes = tree.get_travel_codes(all_leaf_ids[0])
    travel_ids = [node.id() for node in tree.get_nodes(travel_codes)]

    # NOTE(review): the per-layer check below is disabled. As written it
    # compares a single id/code against a whole layer list with `==`, which
    # cannot hold; a membership test (`in`) looks like the intended check.
#    for i in range(height):
#        assert travel_ids[i] == layer_node_ids[height - 1 - i]
#        assert travel_codes[i] == layer_node_codes[height - 1 - i]

    # get_ancestor: the ancestor at layer height - 2 is the second entry of
    # the travel path.
    ancestor_codes = tree.get_ancestor_codes([all_leaf_ids[0]], height - 2)
    ancestor_ids = [node.id() for node in tree.get_nodes(ancestor_codes)]

    assert ancestor_ids[0] == travel_ids[1]
    # Compare the two codes; `assert a, b` would only test the truthiness of
    # `a` and use `b` as the failure message.
    assert ancestor_codes[0] == travel_codes[1]

    # get_pi_relation: maps the leaf id to the same ancestor code.
    pi_relation = tree.get_pi_relation([all_leaf_ids[0]], height - 2)
    assert pi_relation[all_leaf_ids[0]] == ancestor_codes[0]

    # get_travel_path: the path between both ends of the travel, which
    # excludes the final element (re-appended below for comparison).
    travel_path_codes = tree.get_travel_path(travel_codes[0],
                                             travel_codes[-1])
    travel_path_ids = [
        node.id() for node in tree.get_nodes(travel_path_codes)
    ]

    assert travel_path_ids + [travel_ids[-1]] == travel_ids
    assert travel_path_codes + [travel_codes[-1]] == travel_codes

    # get_children
    # NOTE(review): children_ids is not asserted on within the lines visible
    # here -- confirm a membership check on all_leaf_ids[0] follows.
    children_codes = tree.get_children_codes(travel_codes[1], height - 1)
    children_ids = [node.id() for node in tree.get_nodes(children_codes)]