def tree_learning(args):
    """Re-assign tree layers step by step, ``args.gap`` layers at a time.

    Loads the ``TreeIndex`` identified by ``args.tree_name`` and
    ``args.tree_path``, seeds a multiprocessing-managed dict with the result
    of ``tree.get_pi_relation`` for every leaf id, and then repeatedly hands
    the codes of one layer to ``mp_run`` until the step size drops to zero.
    The shared dict is printed once the loop has finished.

    Args:
        args: object exposing ``tree_name``, ``tree_path`` and ``gap``; it is
            also forwarded unchanged to ``mp_run``.
    """
    tree = TreeIndex(args.tree_name, args.tree_path)
    gap = args.gap
    max_layer = tree.height() - 1
    layer = gap

    # Initial relation for all leaves, computed against layer ``layer - gap``
    # (which is 0 on entry because ``layer`` starts out equal to ``gap``).
    leaf_ids = [leaf.id() for leaf in tree.get_all_leafs()]
    initial_relation = tree.get_pi_relation(leaf_ids, layer - gap)

    shared_relation = mp.Manager().dict()
    shared_relation.update(initial_relation)

    # The plain-Python copies are no longer needed once the managed dict
    # holds the data.
    del leaf_ids
    del initial_relation

    while gap > 0:
        source_layer = layer - gap
        print("begin to re-assign {} layer by {} layer.".format(
            layer, source_layer))
        layer_codes = tree.get_layer_codes(source_layer)
        # 12 is the second argument of mp_run (presumably the number of
        # worker processes -- confirm against mp_run). Its return value is
        # not used here.
        mp_run(layer_codes, 12, process, shared_relation, tree, layer, gap,
               args)
        # Shrink the step so that ``layer`` never goes past ``max_layer``;
        # once ``layer`` reaches ``max_layer`` the step becomes 0 and the
        # loop ends.
        gap = min(gap, max_layer - layer)
        layer += gap

    print(shared_relation)
def load_tree_info(name, path, topk=200):
    """Load a TreeIndex and build id<->code lookup tables for all its nodes.

    Args:
        name: tree name passed to ``TreeIndex``.
        path: tree file path passed to ``TreeIndex``.
        topk: size threshold; the first layer (scanning from layer 0) that
            holds more than ``topk`` codes is reported as ``first_layer``.

    Returns:
        A 4-tuple ``(id_code_map, code_id_map, branch, first_layer)`` where
        ``id_code_map`` maps node id -> code, ``code_id_map`` maps code ->
        node id, ``branch`` is ``tree.branch()`` and ``first_layer`` is the
        list of node ids of the first layer with more than ``topk`` codes.

    Note:
        If no layer has more than ``topk`` codes, ``first_layer_code`` stays
        ``None`` and is passed to ``tree.get_nodes`` as is; that case is not
        handled specially here.
    """
    tree = TreeIndex(name, path)

    all_codes = []
    first_layer_code = None
    for layer in range(tree.height()):
        layer_codes = tree.get_layer_codes(layer)
        # Remember only the first layer that is wider than topk.
        if first_layer_code is None and len(layer_codes) > topk:
            first_layer_code = layer_codes
        all_codes += layer_codes

    all_ids = tree.get_nodes(all_codes)

    id_code_map = {}
    code_id_map = {}
    # Index into all_ids explicitly so that a length mismatch between codes
    # and returned nodes still surfaces as an IndexError.
    for position, code in enumerate(all_codes):
        node_id = all_ids[position].id()
        id_code_map[node_id] = code
        code_id_map[code] = node_id

    print(len(all_codes), len(all_ids), len(id_code_map), len(code_id_map))

    first_layer = [node.id() for node in tree.get_nodes(first_layer_code)]
    return id_code_map, code_id_map, tree.branch(), first_layer
def test_tree_index(self):
    """Exercise the TreeIndex query API against the downloaded mini tree.

    Checks the basic shape of the tree (height, branch, node count, embedding
    size) and then cross-checks the results of ``get_travel_codes``,
    ``get_ancestor_codes``, ``get_pi_relation``, ``get_travel_path`` and
    ``get_children_codes`` against each other for the first leaf.
    """
    path = download(
        "https://paddlerec.bj.bcebos.com/tree-based/data/mini_tree.pb",
        "tree_index_unittest", "e2ba4561c2e9432b532df40546390efa")

    tree = TreeIndex("demo", path)
    height = tree.height()
    branch = tree.branch()
    # assertEqual reports both values on failure, unlike assertTrue(a == b).
    self.assertEqual(height, 5)
    self.assertEqual(branch, 2)
    self.assertEqual(tree.total_node_nums(), 25)
    self.assertEqual(tree.emb_size(), 30)

    # get_layer_codes: collect the codes and node ids of every layer.
    layer_node_ids = []
    layer_node_codes = []
    for i in range(tree.height()):
        layer_node_codes.append(tree.get_layer_codes(i))
        layer_node_ids.append(
            [node.id() for node in tree.get_nodes(layer_node_codes[-1])])

    # The leaves must be exactly the nodes of the last layer (compared by
    # the sum of their ids).
    all_leaf_ids = [node.id() for node in tree.get_all_leafs()]
    self.assertEqual(sum(all_leaf_ids), sum(layer_node_ids[-1]))

    # get_travel: entry i of the travel path is expected in layer
    # height - 1 - i, i.e. the path starts at the leaf layer.
    travel_codes = tree.get_travel_codes(all_leaf_ids[0])
    travel_ids = [node.id() for node in tree.get_nodes(travel_codes)]
    for i in range(height):
        self.assertIn(travel_ids[i], layer_node_ids[height - 1 - i])
        self.assertIn(travel_codes[i], layer_node_codes[height - 1 - i])

    # get_ancestor: the ancestor at level height - 2 must be the second
    # entry of the travel path.
    ancestor_codes = tree.get_ancestor_codes([all_leaf_ids[0]], height - 2)
    ancestor_ids = [node.id() for node in tree.get_nodes(ancestor_codes)]
    self.assertEqual(ancestor_ids[0], travel_ids[1])
    self.assertEqual(ancestor_codes[0], travel_codes[1])

    # get_pi_relation: must map the leaf id to that same ancestor code.
    pi_relation = tree.get_pi_relation([all_leaf_ids[0]], height - 2)
    self.assertEqual(pi_relation[all_leaf_ids[0]], ancestor_codes[0])

    # get_travel_path: the path between the two ends of the travel, plus the
    # final entry, must reproduce the full travel.
    travel_path_codes = tree.get_travel_path(travel_codes[0],
                                             travel_codes[-1])
    travel_path_ids = [
        node.id() for node in tree.get_nodes(travel_path_codes)
    ]
    # assertEqual instead of the deprecated assertEquals alias, which no
    # longer exists on Python 3.12+.
    self.assertEqual(travel_path_ids + [travel_ids[-1]], travel_ids)
    self.assertEqual(travel_path_codes + [travel_codes[-1]], travel_codes)

    # get_children: the leaf must be among the children, at level
    # height - 1, of its own second travel entry.
    children_codes = tree.get_children_codes(travel_codes[1], height - 1)
    children_ids = [node.id() for node in tree.get_nodes(children_codes)]
    self.assertIn(all_leaf_ids[0], children_ids)
return np.array(fluid.global_scope().find_var("TDM_Tree_Emb").get_tensor()) if __name__ == '__main__': utils_path = "{}/tools/utils/static_ps".format( os.path.dirname(os.path.dirname(os.path.dirname(os.getcwd())))) sys.path.append(utils_path) print(utils_path) import common yaml_helper = common.YamlHelper() config = yaml_helper.load_yaml(sys.argv[1]) tree_name = config.get("hyper_parameters.tree_name") tree_path = config.get("hyper_parameters.tree_path") tree_node_num = config.get("hyper_parameters.sparse_feature_num") node_emb_size = config.get("hyper_parameters.node_emb_size") tensor = get_emb_numpy(tree_node_num, node_emb_size, sys.argv[2]) tree = TreeIndex(tree_name, tree_path) all_leafs = tree.get_all_leafs() with open(sys.argv[3], 'w') as fout: for node in all_leafs: node_id = node.id() emb_vec = map(str, tensor[node_id].tolist()) emb_vec = [str(node_id)] + emb_vec fout.write(",".join(emb_vec)) fout.write("\n")
def test_layerwise_sampler():
    """Check ``TreeIndex.layerwise_sample`` with and without hierarchy.

    Two inputs, each with two user features and one target item, are
    sampled. The result is expected to be laid out input by input; within an
    input there is one group of rows per sampled layer, starting at the last
    layer of the tree, and each group is one positive row followed by that
    layer's negative rows. Every row is read as
    ``[feature_0, feature_1, sampled_node_id, label]``.
    """
    path = download(
        "https://paddlerec.bj.bcebos.com/tree-based/data/demo_tree.pb",
        "tree_index_unittest", "cadec20089f5a8a44d320e117d9f9f1a")
    tree = TreeIndex("demo", path)

    # Node ids of every layer, indexed by layer number.
    layer_nodes = [
        [node.id() for node in tree.get_nodes(tree.get_layer_codes(depth))]
        for depth in range(tree.height())
    ]

    start_sample_layer = 1
    seed = 0
    sample_layers = tree.height() - start_sample_layer
    sample_num = range(1, 10000)[:sample_layers]
    # Pad with 1 if fewer counts than sampled layers were supplied.
    layer_sample_counts = list(sample_num) + [1] * (
        sample_layers - len(sample_num))
    # Every layer contributes its negative count plus one positive row.
    total_sample_num = sum(layer_sample_counts) + len(layer_sample_counts)
    tree.init_layerwise_sampler(sample_num, start_sample_layer, seed)

    # Travel path (as node ids) of every id used below.
    ids = [315757, 838060, 1251533, 403522, 2473624, 3321007]
    parent_path = {}
    for item_id in ids:
        codes = tree.get_travel_codes(item_id, start_sample_layer)
        parent_path[item_id] = [node.id() for node in tree.get_nodes(codes)]

    def verify_input(rows, offset, first_feature, second_feature,
                     target_path):
        """Assert the rows of one input; return the offset just past them.

        ``first_feature``, ``second_feature`` and ``target_path`` are all
        indexed by the step ``i`` counted from the last tree layer upward.
        """
        layer = tree.height() - 1
        for i, negatives in enumerate(reversed(layer_sample_counts)):
            for j in range(negatives + 1):
                pos = offset + j
                assert rows[pos][0] == first_feature[i]
                assert rows[pos][1] == second_feature[i]
                assert rows[pos][2] in layer_nodes[layer]
                if j == 0:
                    # First row of a group is the positive sample.
                    assert rows[pos][3] == 1
                    assert rows[pos][2] == target_path[i]
                else:
                    assert rows[pos][3] == 0
                    assert rows[pos][2] != target_path[i]
            offset += negatives + 1
            layer -= 1
        return offset

    steps = len(layer_sample_counts)

    # check sample res with_hierarchy = False: the user features are
    # expected unchanged in every row.
    with_hierarchy = False
    sample_res = tree.layerwise_sample(
        [[315757, 838060], [1251533, 403522]], [2473624, 3321007],
        with_hierarchy)
    idx = verify_input(sample_res, 0, [315757] * steps, [838060] * steps,
                       parent_path[2473624])
    assert idx == total_sample_num
    idx = verify_input(sample_res, idx, [1251533] * steps, [403522] * steps,
                       parent_path[3321007])
    assert idx == total_sample_num * 2

    # check sample res with_hierarchy = True: the user features are expected
    # to be replaced by the matching entry of their own travel path.
    with_hierarchy = True
    sample_res_with_hierarchy = tree.layerwise_sample(
        [[315757, 838060], [1251533, 403522]], [2473624, 3321007],
        with_hierarchy)
    idx = verify_input(sample_res_with_hierarchy, 0, parent_path[315757],
                       parent_path[838060], parent_path[2473624])
    assert idx == total_sample_num
    idx = verify_input(sample_res_with_hierarchy, idx, parent_path[1251533],
                       parent_path[403522], parent_path[3321007])
    assert idx == 2 * total_sample_num
def test_tree_index():
    """Exercise the TreeIndex query API against the downloaded demo tree.

    Checks the basic shape of the tree (height, branch, node count, embedding
    size) and then cross-checks the results of ``get_travel_codes``,
    ``get_ancestor_codes``, ``get_pi_relation`` and ``get_travel_path``
    against each other for the first leaf.
    """
    path = download(
        "https://paddlerec.bj.bcebos.com/tree-based/data/demo_tree.pb",
        "tree_index_unittest", "cadec20089f5a8a44d320e117d9f9f1a")

    tree = TreeIndex("demo", path)
    height = tree.height()
    branch = tree.branch()
    assert height == 14
    print("height is equal 14")
    assert branch == 2
    assert tree.total_node_nums() == 15581
    assert tree.emb_size() == 5171136

    # Collect the codes and node ids of every layer.
    layer_node_ids = []
    layer_node_codes = []
    for i in range(tree.height()):
        layer_node_codes.append(tree.get_layer_codes(i))
        layer_node_ids.append(
            [node.id() for node in tree.get_nodes(layer_node_codes[-1])])

    # The leaves must be exactly the nodes of the last layer (compared by
    # the sum of their ids).
    all_leaf_ids = [node.id() for node in tree.get_all_leafs()]
    assert sum(all_leaf_ids) == sum(layer_node_ids[-1])

    # get_travel
    travel_codes = tree.get_travel_codes(all_leaf_ids[0])
    travel_ids = [node.id() for node in tree.get_nodes(travel_codes)]
    # The per-layer check below is disabled in the original and stays so:
    # for i in range(height):
    #     assert travel_ids[i] == layer_node_ids[height - 1 - i]
    #     assert travel_codes[i] == layer_node_codes[height - 1 - i]

    # get_ancestor: the ancestor at level height - 2 must be the second
    # entry of the travel path.
    ancestor_codes = tree.get_ancestor_codes([all_leaf_ids[0]], height - 2)
    ancestor_ids = [node.id() for node in tree.get_nodes(ancestor_codes)]
    assert ancestor_ids[0] == travel_ids[1]
    # Compare the two codes; ``assert a, b`` would only test the truthiness
    # of ``a`` and use ``b`` as the failure message.
    assert ancestor_codes[0] == travel_codes[1]

    # get_pi_relation: must map the leaf id to that same ancestor code.
    pi_relation = tree.get_pi_relation([all_leaf_ids[0]], height - 2)
    assert pi_relation[all_leaf_ids[0]] == ancestor_codes[0]

    # get_travel_path: the path between the two ends of the travel, plus the
    # final entry, must reproduce the full travel.
    travel_path_codes = tree.get_travel_path(travel_codes[0],
                                             travel_codes[-1])
    travel_path_ids = [
        node.id() for node in tree.get_nodes(travel_path_codes)
    ]
    assert travel_path_ids + [travel_ids[-1]] == travel_ids
    assert travel_path_codes + [travel_codes[-1]] == travel_codes

    # get_children
    # NOTE(review): no assertion on children_ids is visible in this part of
    # the test.
    children_codes = tree.get_children_codes(travel_codes[1], height - 1)
    children_ids = [node.id() for node in tree.get_nodes(children_codes)]