def download_data(self, data_urls, data_md5s, folder_name, is_model=True):
    """Download a (possibly multi-part) archive into the cache and unzip it.

    Args:
        data_urls (list[str]): URLs of the archive part(s).
        data_md5s (list[str]): MD5 checksums, parallel to ``data_urls``.
        folder_name (str): sub-folder of ``self.cache_folder`` to unzip into.
        is_model (bool): when True, only the first URL is downloaded even
            when the 'full' dataset mode is enabled.

    Returns:
        str: path of the folder the archive was unzipped into.
    """
    import shutil  # local import: only needed for the multi-part concat path

    data_cache_folder = os.path.join(self.cache_folder, folder_name)
    zip_path = ''
    if os.environ.get('DATASET') == 'full':
        file_names = []
        for url, md5 in zip(data_urls, data_md5s):
            download(url, self.int8_download, md5)
            file_names.append(url.split('/')[-1])
        zip_path = os.path.join(self.cache_folder, 'full_imagenet_val.tar.gz')
        if not os.path.exists(zip_path):
            # Concatenate the downloaded parts in pure Python instead of
            # shelling out to `cat` via os.system (portable, and safe for
            # cache paths that contain spaces).
            # NOTE(review): parts are read from self.cache_folder although
            # they were downloaded under self.int8_download — presumably the
            # same directory; confirm against the class setup.
            with open(zip_path, 'wb') as outfile:
                for file_name in file_names:
                    part_path = os.path.join(self.cache_folder, file_name)
                    with open(part_path, 'rb') as part:
                        shutil.copyfileobj(part, outfile)
    if os.environ.get('DATASET') != 'full' or is_model:
        # Single-archive path: only the first URL/MD5 pair is used.
        download(data_urls[0], self.int8_download, data_md5s[0])
        file_name = data_urls[0].split('/')[-1]
        zip_path = os.path.join(self.cache_folder, file_name)
    print('Data is downloaded at {0}'.format(zip_path))
    self.cache_unzipping(data_cache_folder, zip_path)
    return data_cache_folder
def download_model(self, data_url, data_md5, folder_name):
    """Fetch a model archive into the cache folder and unzip it.

    Args:
        data_url (str): URL of the archive to download.
        data_md5 (str): expected MD5 checksum of the archive.
        folder_name (str): sub-folder of ``self.cache_folder`` to unzip into.

    Returns:
        str: path of the folder the archive was unzipped into.
    """
    download(data_url, self.download_path, data_md5)
    archive_name = data_url.split('/')[-1]
    archive_path = os.path.join(self.cache_folder, archive_name)
    print('Data is downloaded at {0}'.format(archive_path))
    target_folder = os.path.join(self.cache_folder, folder_name)
    self.cache_unzipping(target_folder, archive_path)
    return target_folder
def download_model(self, model_name, data_url, data_md5):
    """Download a named model archive and extract it under the cache folder.

    Args:
        model_name (str): name used both for logging and as the extraction
            sub-folder under ``self.cache_folder``.
        data_url (str): URL of the archive to download.
        data_md5 (str): expected MD5 checksum of the archive.

    Returns:
        str: path of the folder the archive was extracted into.
    """
    download(data_url, self.weight_quantization_dir, data_md5)
    archive_name = data_url.split('/')[-1]
    archive_path = os.path.join(self.cache_folder, archive_name)
    print(model_name + ' is downloaded at ' + archive_path)
    extracted_path = os.path.join(self.cache_folder, model_name)
    self.cache_unzipping(extracted_path, archive_path)
    print(model_name + ' is unziped at ' + extracted_path)
    return extracted_path
def download_pascalvoc(data_url, data_dir, tar_targethash, tar_path):
    """Download the PascalVOC test archive and verify its MD5 checksum.

    Args:
        data_url (str): URL of the tarball.
        data_dir (str): directory the download helper writes into.
        tar_targethash (str): expected MD5 hex digest of the tarball.
        tar_path (str): expected on-disk path of the downloaded tarball.
    """
    print("Downloading pascalvcoc test set...")
    download(data_url, data_dir, tar_targethash)
    if not os.path.exists(tar_path):
        print("Failed in downloading pascalvoc test set. URL %s\n" % data_url)
    else:
        # BUG FIX: use a context manager so the file handle is closed
        # (the original `open(...).read()` leaked it).
        with open(tar_path, 'rb') as f:
            tmp_hash = hashlib.md5(f.read()).hexdigest()
        if tmp_hash != tar_targethash:
            print("Downloaded test set is broken, removing ...\n")
            # BUG FIX: actually remove the broken archive, as the
            # message above promises; the original only printed.
            os.remove(tar_path)
        else:
            print("Downloaded successfully. Path: %s\n" % tar_path)
def download_decompress_file(data_dir, url, md5):
    """Download a tarball, extract it into ``data_dir``, then delete it.

    Args:
        data_dir (str): directory to download into and extract under.
        url (str): URL of the tarball.
        md5 (str): expected MD5 checksum of the tarball.
    """
    logger.info("Downloading from {}".format(url))
    archive = download(url, data_dir, md5)
    logger.info("Decompressing {}".format(archive))
    with tarfile.open(archive) as handle:
        handle.extractall(path=data_dir)
    os.remove(archive)
def download_decompress_file(data_dir, url, md5):
    """Download a zip archive, extract it into ``data_dir``, then delete it.

    Args:
        data_dir (str): directory to download into and extract under.
        url (str): URL of the zip archive.
        md5 (str): expected MD5 checksum of the archive.
    """
    logger.info("Downloading from {}".format(url))
    archive = download(url, data_dir, md5)
    logger.info("Decompressing {}".format(archive))
    with zipfile.ZipFile(archive) as handle:
        handle.extractall(path=data_dir)
    os.remove(archive)
def test_download_url(self):
    """download() should fetch the flowers label file without raising, and
    the downloaded file must exist and match the expected MD5.
    """
    LABEL_URL = 'http://paddlemodels.bj.bcebos.com/flowers/imagelabels.mat'
    LABEL_MD5 = 'e0620be6f572b9609742df49c70aed4d'
    catch_exp = False
    try:
        download(LABEL_URL, 'flowers', LABEL_MD5)
    except Exception:
        catch_exp = True
    self.assertFalse(catch_exp)  # was assertTrue(catch_exp == False)
    file_path = DATA_HOME + "/flowers/imagelabels.mat"
    self.assertTrue(os.path.exists(file_path))
    # BUG FIX: the original `assertTrue(md5file(file_path), LABEL_MD5)`
    # always passed — the second argument of assertTrue is the failure
    # message, not a comparison operand. Compare the checksums instead.
    self.assertEqual(md5file(file_path), LABEL_MD5)
def download_files(self):
    """Download the proto data archive and extract it into a fresh temp dir.

    Returns:
        str: path of the temporary directory holding the extracted files.
    """
    path = download(self.proto_data_url, self.module_name,
                    self.proto_data_md5)
    print('data is downloaded at ' + path)
    unzip_folder = tempfile.mkdtemp()
    # BUG FIX: close the archive deterministically with a context manager
    # (the original left the TarFile handle open).
    with tarfile.open(path) as tar:
        tar.extractall(unzip_folder)
    return unzip_folder
def download_concat(cache_folder, zip_path):
    """Download the two-part full ImageNet validation tarball and
    concatenate the parts into ``zip_path``.

    Args:
        cache_folder (str): directory the parts are downloaded into.
        zip_path (str): destination path of the concatenated archive.
    """
    data_urls = [
        'https://paddle-inference-dist.bj.bcebos.com/int8/ILSVRC2012_img_val.tar.gz.partaa',
        'https://paddle-inference-dist.bj.bcebos.com/int8/ILSVRC2012_img_val.tar.gz.partab',
    ]
    data_md5s = [
        '60f6525b0e1d127f345641d75d41f0a8',
        '1e9f15f64e015e58d6f9ec3210ed18b5',
    ]
    file_names = []
    print("Downloading full ImageNet Validation dataset ...")
    for url, md5 in zip(data_urls, data_md5s):
        download(url, cache_folder, md5)
        file_name = os.path.join(cache_folder, url.split('/')[-1])
        file_names.append(file_name)
        print("Downloaded part {0}\n".format(file_name))
    with open(zip_path, "wb") as outfile:
        for fname in file_names:
            # BUG FIX: open each part with a context manager so its handle
            # is closed (the original leaked one open handle per part).
            with open(fname, 'rb') as infile:
                shutil.copyfileobj(infile, outfile)
def test_tree_index():
    """Smoke-test TreeIndex structural queries on the full demo tree."""
    path = download(
        "https://paddlerec.bj.bcebos.com/tree-based/data/demo_tree.pb",
        "tree_index_unittest", "cadec20089f5a8a44d320e117d9f9f1a")
    tree = TreeIndex("demo", path)
    height = tree.height()
    branch = tree.branch()
    assert height == 14
    print("height is equal 14")
    assert branch == 2
    assert tree.total_node_nums() == 15581
    assert tree.emb_size() == 5171136

    # Collect node ids/codes layer by layer.
    layer_node_ids = []
    layer_node_codes = []
    for i in range(tree.height()):
        layer_node_codes.append(tree.get_layer_codes(i))
        layer_node_ids.append(
            [node.id() for node in tree.get_nodes(layer_node_codes[-1])])

    # The leaves are exactly the last layer.
    all_leaf_ids = [node.id() for node in tree.get_all_leafs()]
    assert sum(all_leaf_ids) == sum(layer_node_ids[-1])

    # get_travel
    travel_codes = tree.get_travel_codes(all_leaf_ids[0])
    travel_ids = [node.id() for node in tree.get_nodes(travel_codes)]
    # for i in range(height):
    #     assert travel_ids[i] == layer_node_ids[height - 1 - i]
    #     assert travel_codes[i] == layer_node_codes[height - 1 - i]

    # get_ancestor
    ancestor_codes = tree.get_ancestor_codes([all_leaf_ids[0]], height - 2)
    ancestor_ids = [node.id() for node in tree.get_nodes(ancestor_codes)]
    assert ancestor_ids[0] == travel_ids[1]
    # BUG FIX: `assert ancestor_codes[0], travel_codes[1]` always passed —
    # the second expression is the assert *message*, not a comparison.
    # Compare the two codes, matching the unittest variant of this test.
    assert ancestor_codes[0] == travel_codes[1]

    # get_pi_relation
    pi_relation = tree.get_pi_relation([all_leaf_ids[0]], height - 2)
    assert pi_relation[all_leaf_ids[0]] == ancestor_codes[0]

    # get_travel_path: path from a leaf code up to (excluding) the root code.
    travel_path_codes = tree.get_travel_path(travel_codes[0],
                                             travel_codes[-1])
    travel_path_ids = [
        node.id() for node in tree.get_nodes(travel_path_codes)
    ]
    assert travel_path_ids + [travel_ids[-1]] == travel_ids
    assert travel_path_codes + [travel_codes[-1]] == travel_codes

    # get_children
    children_codes = tree.get_children_codes(travel_codes[1], height - 1)
    children_ids = [node.id() for node in tree.get_nodes(children_codes)]
def _setup_config(self):
    """Configure the async pyreader test and stage the saved CTR params.

    Downloads the pretrained parameter tarball and extracts it into a
    temporary directory recorded in ``self._model_dir``.
    """
    self._mode = "async"
    self._reader = "pyreader"
    self._need_test = 1
    data_url = "https://fleet.bj.bcebos.com/unittest/ctr_saved_params.tar.gz"
    data_md5 = "aa7e8286ced566ea8a67410be7482438"
    module_name = "ctr_saved_params"
    path = download(data_url, module_name, data_md5)
    print('ctr_params is downloaded at ' + path)
    unzip_folder = tempfile.mkdtemp()
    # BUG FIX: close the archive deterministically with a context manager
    # (the original left the TarFile handle open).
    with tarfile.open(path) as tar:
        tar.extractall(unzip_folder)
    self._model_dir = unzip_folder
def test_layerwise_sampler(self):
    """End-to-end check of InMemoryDataset.tdm_sample on the mini tree.

    Writes a tiny two-line sample file, loads it through the dataset
    pipeline, runs TDM layer-wise sampling against the downloaded mini
    tree, and asserts the resulting shuffle data size.
    """
    path = download(
        "https://paddlerec.bj.bcebos.com/tree-based/data/mini_tree.pb",
        "tree_index_unittest", "e2ba4561c2e9432b532df40546390efa")
    # One slot count per tree layer; all zeros here (no negative sampling).
    tdm_layer_counts = [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0]
    #tree = TreeIndex("demo", path)
    # NOTE(review): this file is created in the CWD and never cleaned up.
    file_name = "test_in_memory_dataset_tdm_sample_run.txt"
    with open(file_name, "w") as f:
        #data = "29 d 29 d 29 29 29 29 29 29 29 29 29 29 29 29\n"
        data = "1 1 1 15 15 15\n"
        data += "1 1 1 15 15 15\n"
        f.write(data)
    # Declare one int64 input variable per slot for the dataset feed.
    slots = ["slot1", "slot2", "slot3"]
    slots_vars = []
    for slot in slots:
        var = fluid.layers.data(name=slot, shape=[1], dtype="int64")
        slots_vars.append(var)
    dataset = paddle.distributed.InMemoryDataset()
    # "cat" pipe/download commands pass the file content through unchanged.
    dataset.init(batch_size=1,
                 pipe_command="cat",
                 download_cmd="cat",
                 use_var=slots_vars)
    dataset.set_filelist([file_name])
    #dataset.update_settings(pipe_command="cat")
    #dataset._init_distributed_settings(
    #    parse_ins_id=True,
    #    parse_content=True,
    #    fea_eval=True,
    #    candidate_size=10000)
    dataset.load_into_memory()
    # Sample along the tree for the id found in slot index 2 of each line.
    dataset.tdm_sample('demo',
                       tree_path=path,
                       tdm_layer_counts=tdm_layer_counts,
                       start_sample_layer=1,
                       with_hierachy=False,
                       seed=0,
                       id_slot=2)
    # 2 input lines expand to 8 samples — presumably one per sampled
    # tree layer above start_sample_layer; TODO confirm.
    self.assertTrue(dataset.get_shuffle_data_size() == 8)
def train():
    """
    Create a train dataset reader containing 2913 images in HWC order.
    """
    voc_archive = download(VOC_URL, CACHE_DIR, VOC_MD5)
    return reader_creator(voc_archive, 'trainval')
def test_layerwise_sampler():
    """Exhaustively check TreeIndex.layerwise_sample output structure.

    For two (user-pair, target-item) groups, verifies each emitted row
    [feat0, feat1, sampled_node, label]: the positive row (label 1) must be
    the target's ancestor on that layer, negatives (label 0) must not be,
    and every sampled node must belong to the expected layer. Checked both
    with and without feature hierarchy substitution.
    """
    path = download(
        "https://paddlerec.bj.bcebos.com/tree-based/data/demo_tree.pb",
        "tree_index_unittest", "cadec20089f5a8a44d320e117d9f9f1a")
    tree = TreeIndex("demo", path)

    # Node ids grouped by layer, root first.
    layer_nodes = []
    for i in range(tree.height()):
        layer_codes = tree.get_layer_codes(i)
        layer_nodes.append(
            [node.id() for node in tree.get_nodes(layer_codes)])

    # Requested per-layer sample counts, truncated to the sampled layers;
    # padded with 1s if shorter (no-op here since range(1, 10000) is long).
    sample_num = range(1, 10000)
    start_sample_layer = 1
    seed = 0
    sample_layers = tree.height() - start_sample_layer
    sample_num = sample_num[:sample_layers]
    layer_sample_counts = list(sample_num) + [1] * (
        sample_layers - len(sample_num))
    # +len(...) accounts for the one positive example added per layer.
    total_sample_num = sum(layer_sample_counts) + len(layer_sample_counts)
    tree.init_layerwise_sampler(sample_num, start_sample_layer, seed)

    # Ancestor id chains (leaf upward) for every id used below.
    ids = [315757, 838060, 1251533, 403522, 2473624, 3321007]
    parent_path = {}
    for i in range(len(ids)):
        tmp = tree.get_travel_codes(ids[i], start_sample_layer)
        parent_path[ids[i]] = [node.id() for node in tree.get_nodes(tmp)]

    # check sample res with_hierarchy = False
    sample_res = tree.layerwise_sample(
        [[315757, 838060], [1251533, 403522]], [2473624, 3321007], False)

    # First group: features stay fixed; row j == 0 of each layer chunk is
    # the positive (ancestor of 2473624), the rest are negatives.
    idx = 0
    layer = tree.height() - 1
    for i in range(len(layer_sample_counts)):
        for j in range(layer_sample_counts[0 - (i + 1)] + 1):
            assert sample_res[idx + j][0] == 315757
            assert sample_res[idx + j][1] == 838060
            assert sample_res[idx + j][2] in layer_nodes[layer]
            if j == 0:
                assert sample_res[idx + j][3] == 1
                assert sample_res[idx + j][2] == parent_path[2473624][i]
            else:
                assert sample_res[idx + j][3] == 0
                assert sample_res[idx + j][2] != parent_path[2473624][i]
        idx += layer_sample_counts[0 - (i + 1)] + 1
        layer -= 1
    assert idx == total_sample_num

    # Second group, same layout, target 3321007.
    layer = tree.height() - 1
    for i in range(len(layer_sample_counts)):
        for j in range(layer_sample_counts[0 - (i + 1)] + 1):
            assert sample_res[idx + j][0] == 1251533
            assert sample_res[idx + j][1] == 403522
            assert sample_res[idx + j][2] in layer_nodes[layer]
            if j == 0:
                assert sample_res[idx + j][3] == 1
                assert sample_res[idx + j][2] == parent_path[3321007][i]
            else:
                assert sample_res[idx + j][3] == 0
                assert sample_res[idx + j][2] != parent_path[3321007][i]
        idx += layer_sample_counts[0 - (i + 1)] + 1
        layer -= 1
    assert idx == total_sample_num * 2

    # check sample res with_hierarchy = True
    sample_res_with_hierarchy = tree.layerwise_sample(
        [[315757, 838060], [1251533, 403522]], [2473624, 3321007], True)

    # With hierarchy, the feature columns are replaced by the features'
    # own ancestors on the sampled layer.
    idx = 0
    layer = tree.height() - 1
    for i in range(len(layer_sample_counts)):
        for j in range(layer_sample_counts[0 - (i + 1)] + 1):
            assert sample_res_with_hierarchy[idx + j][0] == \
                parent_path[315757][i]
            assert sample_res_with_hierarchy[idx + j][1] == \
                parent_path[838060][i]
            assert sample_res_with_hierarchy[idx + j][2] in \
                layer_nodes[layer]
            if j == 0:
                assert sample_res_with_hierarchy[idx + j][3] == 1
                assert sample_res_with_hierarchy[idx + j][2] == \
                    parent_path[2473624][i]
            else:
                assert (sample_res_with_hierarchy[idx + j][3] == 0)
                assert (sample_res_with_hierarchy[idx + j][2] !=
                        parent_path[2473624][i])
        idx += layer_sample_counts[0 - (i + 1)] + 1
        layer -= 1
    assert (idx == total_sample_num)

    # Second group with hierarchy, target 3321007.
    layer = tree.height() - 1
    for i in range(len(layer_sample_counts)):
        for j in range(layer_sample_counts[0 - (i + 1)] + 1):
            assert (sample_res_with_hierarchy[idx + j][0] ==
                    parent_path[1251533][i])
            assert (sample_res_with_hierarchy[idx + j][1] ==
                    parent_path[403522][i])
            assert (sample_res_with_hierarchy[idx + j][2] in
                    layer_nodes[layer])
            if j == 0:
                assert (sample_res_with_hierarchy[idx + j][3] == 1)
                assert (sample_res_with_hierarchy[idx + j][2] ==
                        parent_path[3321007][i])
            else:
                assert (sample_res_with_hierarchy[idx + j][3] == 0)
                assert (sample_res_with_hierarchy[idx + j][2] !=
                        parent_path[3321007][i])
        idx += layer_sample_counts[0 - (i + 1)] + 1
        layer -= 1
    assert (idx == 2 * total_sample_num)
def test_tree_index(self):
    """Verify TreeIndex structural queries on the mini demo tree."""
    path = download(
        "https://paddlerec.bj.bcebos.com/tree-based/data/mini_tree.pb",
        "tree_index_unittest", "e2ba4561c2e9432b532df40546390efa")
    tree = TreeIndex("demo", path)
    height = tree.height()
    branch = tree.branch()
    self.assertTrue(height == 5)
    self.assertTrue(branch == 2)
    self.assertEqual(tree.total_node_nums(), 25)
    self.assertEqual(tree.emb_size(), 30)

    # get_layer_codes: collect node ids/codes layer by layer.
    layer_node_ids = []
    layer_node_codes = []
    for i in range(tree.height()):
        layer_node_codes.append(tree.get_layer_codes(i))
        layer_node_ids.append(
            [node.id() for node in tree.get_nodes(layer_node_codes[-1])])

    # The leaves are exactly the last layer.
    all_leaf_ids = [node.id() for node in tree.get_all_leafs()]
    self.assertEqual(sum(all_leaf_ids), sum(layer_node_ids[-1]))

    # get_travel: codes run from the leaf's layer up to the root.
    travel_codes = tree.get_travel_codes(all_leaf_ids[0])
    travel_ids = [node.id() for node in tree.get_nodes(travel_codes)]
    for i in range(height):
        self.assertIn(travel_ids[i], layer_node_ids[height - 1 - i])
        self.assertIn(travel_codes[i], layer_node_codes[height - 1 - i])

    # get_ancestor
    ancestor_codes = tree.get_ancestor_codes([all_leaf_ids[0]], height - 2)
    ancestor_ids = [node.id() for node in tree.get_nodes(ancestor_codes)]
    self.assertEqual(ancestor_ids[0], travel_ids[1])
    self.assertEqual(ancestor_codes[0], travel_codes[1])

    # get_pi_relation
    pi_relation = tree.get_pi_relation([all_leaf_ids[0]], height - 2)
    self.assertEqual(pi_relation[all_leaf_ids[0]], ancestor_codes[0])

    # get_travel_path: the path plus the start node reproduces the travel.
    travel_path_codes = tree.get_travel_path(travel_codes[0],
                                             travel_codes[-1])
    travel_path_ids = [
        node.id() for node in tree.get_nodes(travel_path_codes)
    ]
    # BUG FIX: assertEquals is a deprecated alias (removed in Python
    # 3.12); use assertEqual as the rest of the test already does.
    self.assertEqual(travel_path_ids + [travel_ids[-1]], travel_ids)
    self.assertEqual(travel_path_codes + [travel_codes[-1]], travel_codes)

    # get_children
    children_codes = tree.get_children_codes(travel_codes[1], height - 1)
    children_ids = [node.id() for node in tree.get_nodes(children_codes)]
    self.assertIn(all_leaf_ids[0], children_ids)
def test():
    """
    Create a test dataset reader containing 1464 images in HWC order.
    """
    voc_archive = download(VOC_URL, CACHE_DIR, VOC_MD5)
    return reader_creator(voc_archive, 'train')
def val():
    """
    Create a val dataset reader containing 1449 images in HWC order.
    """
    voc_archive = download(VOC_URL, CACHE_DIR, VOC_MD5)
    return reader_creator(voc_archive, 'val')