Пример #1
0
    def download_data(self, data_urls, data_md5s, folder_name, is_model=True):
        """Download dataset/model archives into the cache and unzip them.

        When the ``DATASET`` env var is ``'full'``, all parts listed in
        ``data_urls`` are downloaded and concatenated into one tarball;
        otherwise (or additionally, when ``is_model`` is true) only the
        first URL is downloaded and used as the archive.

        Args:
            data_urls: list of part URLs to download.
            data_md5s: matching list of MD5 checksums.
            folder_name: subdirectory of the cache to unzip into.
            is_model: if True, always download data_urls[0] as the archive.

        Returns:
            The cache subdirectory the archive was unzipped into.
        """
        import shutil  # local import: only needed for the 'full' path

        data_cache_folder = os.path.join(self.cache_folder, folder_name)
        zip_path = ''
        if os.environ.get('DATASET') == 'full':
            file_names = []
            for url, md5 in zip(data_urls, data_md5s):
                download(url, self.int8_download, md5)
                file_names.append(url.split('/')[-1])

            zip_path = os.path.join(self.cache_folder,
                                    'full_imagenet_val.tar.gz')
            if not os.path.exists(zip_path):
                # Concatenate the parts in Python instead of shelling out to
                # `cat` via os.system (avoids shell-quoting/injection issues
                # and works on platforms without `cat`); mirrors
                # download_concat() elsewhere in this file.
                with open(zip_path, 'wb') as outfile:
                    for file_name in file_names:
                        part_path = os.path.join(self.cache_folder, file_name)
                        with open(part_path, 'rb') as infile:
                            shutil.copyfileobj(infile, outfile)

        if os.environ.get('DATASET') != 'full' or is_model:
            download(data_urls[0], self.int8_download, data_md5s[0])
            file_name = data_urls[0].split('/')[-1]
            zip_path = os.path.join(self.cache_folder, file_name)

        print('Data is downloaded at {0}'.format(zip_path))
        self.cache_unzipping(data_cache_folder, zip_path)
        return data_cache_folder
    def download_model(self, data_url, data_md5, folder_name):
        """Fetch a model archive, report its location, and unzip it.

        Returns the cache subdirectory the archive was extracted into.
        """
        download(data_url, self.download_path, data_md5)
        archive = os.path.join(self.cache_folder, data_url.split('/')[-1])
        print('Data is downloaded at {0}'.format(archive))

        target_dir = os.path.join(self.cache_folder, folder_name)
        self.cache_unzipping(target_dir, archive)
        return target_dir
Пример #3
0
    def download_model(self, model_name, data_url, data_md5):
        """Download *model_name* from *data_url*, unzip it under the cache
        folder, and return the unzipped directory path."""
        download(data_url, self.weight_quantization_dir, data_md5)
        archive_name = data_url.split('/')[-1]
        file_path = os.path.join(self.cache_folder, archive_name)
        print(model_name + ' is downloaded at ' + file_path)

        unziped_path = os.path.join(self.cache_folder, model_name)
        self.cache_unzipping(unziped_path, file_path)
        print(model_name + ' is unziped at ' + unziped_path)
        return unziped_path
def download_pascalvoc(data_url, data_dir, tar_targethash, tar_path):
    """Download the PascalVOC test set and verify its MD5 checksum.

    Args:
        data_url: URL of the tarball.
        data_dir: directory to download into.
        tar_targethash: expected MD5 hex digest of the tarball.
        tar_path: expected path of the downloaded tarball.
    """
    print("Downloading pascalvoc test set...")
    download(data_url, data_dir, tar_targethash)
    if not os.path.exists(tar_path):
        print("Failed in downloading pascalvoc test set. URL %s\n" % data_url)
    else:
        # Close the file handle deterministically (the original leaked it
        # via open(...).read()).
        with open(tar_path, 'rb') as f:
            tmp_hash = hashlib.md5(f.read()).hexdigest()
        if tmp_hash != tar_targethash:
            print("Downloaded test set is broken, removing ...\n")
            # The original announced removal but never removed the corrupt
            # file; actually delete it so a retry starts clean.
            os.remove(tar_path)
        else:
            print("Downloaded successfully. Path: %s\n" % tar_path)
Пример #5
0
def download_decompress_file(data_dir, url, md5):
    """Download a tar archive into *data_dir*, extract it there, then
    delete the archive file."""
    logger.info("Downloading from {}".format(url))
    archive = download(url, data_dir, md5)
    logger.info("Decompressing {}".format(archive))
    # Context manager guarantees the tar handle is closed after extraction.
    with tarfile.open(archive) as handle:
        handle.extractall(path=data_dir)
    os.remove(archive)
Пример #6
0
def download_decompress_file(data_dir, url, md5):
    """Download a zip archive into *data_dir*, extract it there, then
    delete the archive file."""
    logger.info("Downloading from {}".format(url))
    archive = download(url, data_dir, md5)
    logger.info("Decompressing {}".format(archive))
    # Context manager guarantees the zip handle is closed after extraction.
    with zipfile.ZipFile(archive) as handle:
        handle.extractall(path=data_dir)
    os.remove(archive)
Пример #7
0
    def test_download_url(self):
        """Download the flowers label file and verify it exists on disk
        with the expected MD5 checksum."""
        LABEL_URL = 'http://paddlemodels.bj.bcebos.com/flowers/imagelabels.mat'
        LABEL_MD5 = 'e0620be6f572b9609742df49c70aed4d'

        # Fail the test explicitly if download raises, instead of the
        # original catch-flag-then-assertFalse dance.
        try:
            download(LABEL_URL, 'flowers', LABEL_MD5)
        except Exception:
            self.fail('download() raised unexpectedly')

        file_path = DATA_HOME + "/flowers/imagelabels.mat"

        self.assertTrue(os.path.exists(file_path))
        # The original used assertTrue(md5file(file_path), LABEL_MD5),
        # which treats the expected hash as the failure *message* and
        # never compares it; assertEqual performs the real check.
        self.assertEqual(md5file(file_path), LABEL_MD5)
Пример #8
0
 def download_files(self):
     """Download the proto test-data archive and extract it.

     Returns:
         Path of a fresh temporary directory holding the extracted files.
     """
     path = download(self.proto_data_url, self.module_name,
                     self.proto_data_md5)
     print('data is downloaded at ' + path)
     unzip_folder = tempfile.mkdtemp()
     # Context manager closes the tar handle (the original leaked it).
     with tarfile.open(path) as tar:
         tar.extractall(unzip_folder)
     return unzip_folder
def download_concat(cache_folder, zip_path):
    """Download the full ImageNet validation tarball (split in two parts)
    and concatenate the parts into a single archive.

    Args:
        cache_folder: directory the parts are downloaded into.
        zip_path: destination path of the concatenated tarball.
    """
    data_urls = [
        'https://paddle-inference-dist.bj.bcebos.com/int8/ILSVRC2012_img_val.tar.gz.partaa',
        'https://paddle-inference-dist.bj.bcebos.com/int8/ILSVRC2012_img_val.tar.gz.partab',
    ]
    data_md5s = [
        '60f6525b0e1d127f345641d75d41f0a8',
        '1e9f15f64e015e58d6f9ec3210ed18b5',
    ]
    file_names = []
    print("Downloading full ImageNet Validation dataset ...")
    for url, md5 in zip(data_urls, data_md5s):
        download(url, cache_folder, md5)
        file_name = os.path.join(cache_folder, url.split('/')[-1])
        file_names.append(file_name)
        print("Downloaded part {0}\n".format(file_name))
    with open(zip_path, "wb") as outfile:
        for fname in file_names:
            # Close each part promptly (the original leaked every handle
            # opened inside the loop).
            with open(fname, 'rb') as infile:
                shutil.copyfileobj(infile, outfile)
Пример #10
0
def test_tree_index():
    """Exercise the TreeIndex API on the demo tree: size/shape queries,
    per-layer enumeration, and travel/ancestor/children lookups."""
    path = download(
        "https://paddlerec.bj.bcebos.com/tree-based/data/demo_tree.pb",
        "tree_index_unittest", "cadec20089f5a8a44d320e117d9f9f1a")
    tree = TreeIndex("demo", path)
    height = tree.height()
    branch = tree.branch()
    assert height == 14
    print("height is equal 14")
    assert branch == 2
    assert tree.total_node_nums() == 15581
    assert tree.emb_size() == 5171136

    # Collect node codes and ids for every layer, root layer first.
    layer_node_ids = []
    layer_node_codes = []
    for i in range(tree.height()):
        layer_node_codes.append(tree.get_layer_codes(i))
        layer_node_ids.append(
            [node.id() for node in tree.get_nodes(layer_node_codes[-1])])

    all_leaf_ids = [node.id() for node in tree.get_all_leafs()]
    assert sum(all_leaf_ids) == sum(layer_node_ids[-1])

    # get_travel
    travel_codes = tree.get_travel_codes(all_leaf_ids[0])
    travel_ids = [node.id() for node in tree.get_nodes(travel_codes)]

#    for i in range(height):
#        assert travel_ids[i] == layer_node_ids[height - 1 - i]
#        assert travel_codes[i] == layer_node_codes[height - 1 - i]

    # get_ancestor
    ancestor_codes = tree.get_ancestor_codes([all_leaf_ids[0]], height - 2)
    ancestor_ids = [node.id() for node in tree.get_nodes(ancestor_codes)]

    assert ancestor_ids[0] == travel_ids[1]
    # Fixed: the original read `assert ancestor_codes[0], travel_codes[1]`,
    # which is the `assert expr, message` form and never compared the two
    # values (the unittest variant of this test uses assertEqual here).
    assert ancestor_codes[0] == travel_codes[1]

    # get_pi_relation
    pi_relation = tree.get_pi_relation([all_leaf_ids[0]], height - 2)
    assert pi_relation[all_leaf_ids[0]] == ancestor_codes[0]

    # get_travel_path
    travel_path_codes = tree.get_travel_path(travel_codes[0],
                                             travel_codes[-1])
    travel_path_ids = [
        node.id() for node in tree.get_nodes(travel_path_codes)
    ]

    assert travel_path_ids + [travel_ids[-1]] == travel_ids
    assert travel_path_codes + [travel_codes[-1]] == travel_codes

    # get_children (smoke check only: results are computed but, as in the
    # original, not asserted against anything)
    children_codes = tree.get_children_codes(travel_codes[1], height - 1)
    children_ids = [node.id() for node in tree.get_nodes(children_codes)]
Пример #11
0
    def _setup_config(self):
        """Configure async pyreader mode and download the pretrained CTR
        params archive into a temp dir used as the model directory."""
        self._mode = "async"
        self._reader = "pyreader"
        self._need_test = 1

        data_url = "https://fleet.bj.bcebos.com/unittest/ctr_saved_params.tar.gz"
        data_md5 = "aa7e8286ced566ea8a67410be7482438"
        module_name = "ctr_saved_params"
        path = download(data_url, module_name, data_md5)
        print('ctr_params is downloaded at ' + path)
        unzip_folder = tempfile.mkdtemp()
        # Context manager closes the tar handle (the original leaked it).
        with tarfile.open(path) as tar:
            tar.extractall(unzip_folder)
        self._model_dir = unzip_folder
Пример #12
0
    def test_layerwise_sampler(self):
        """Run TDM layer-wise sampling through InMemoryDataset on a small
        demo tree and check the resulting sample count.

        Downloads the mini tree proto, writes a two-line input file, feeds
        it through an in-memory dataset with `tdm_sample`, and asserts the
        shuffled data size equals 8.
        """
        path = download(
            "https://paddlerec.bj.bcebos.com/tree-based/data/mini_tree.pb",
            "tree_index_unittest", "e2ba4561c2e9432b532df40546390efa")

        # Per-layer sample counts; 0 presumably means "use default" —
        # TODO confirm against tdm_sample's contract.
        tdm_layer_counts = [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0]
        #tree = TreeIndex("demo", path)
        # Two identical sample lines; the file is left on disk afterwards.
        file_name = "test_in_memory_dataset_tdm_sample_run.txt"
        with open(file_name, "w") as f:
            #data = "29 d 29 d 29 29 29 29 29 29 29 29 29 29 29 29\n"
            data = "1 1 1 15 15 15\n"
            data += "1 1 1 15 15 15\n"
            f.write(data)

        # One int64 input variable per slot in the sample file.
        slots = ["slot1", "slot2", "slot3"]
        slots_vars = []
        for slot in slots:
            var = fluid.layers.data(name=slot, shape=[1], dtype="int64")
            slots_vars.append(var)

        dataset = paddle.distributed.InMemoryDataset()
        dataset.init(batch_size=1,
                     pipe_command="cat",
                     download_cmd="cat",
                     use_var=slots_vars)
        dataset.set_filelist([file_name])
        #dataset.update_settings(pipe_command="cat")
        #dataset._init_distributed_settings(
        #    parse_ins_id=True,
        #    parse_content=True,
        #    fea_eval=True,
        #    candidate_size=10000)

        dataset.load_into_memory()
        # Sample against the downloaded tree; id_slot=2 marks which slot
        # carries the item id — TODO confirm slot indexing convention.
        dataset.tdm_sample('demo',
                           tree_path=path,
                           tdm_layer_counts=tdm_layer_counts,
                           start_sample_layer=1,
                           with_hierachy=False,
                           seed=0,
                           id_slot=2)
        self.assertTrue(dataset.get_shuffle_data_size() == 8)
Пример #13
0
def train():
    """
    Build a reader over the 'trainval' split (2913 images, HWC order).
    """
    archive = download(VOC_URL, CACHE_DIR, VOC_MD5)
    return reader_creator(archive, 'trainval')
Пример #14
0
def test_layerwise_sampler():
        """Check TreeIndex.layerwise_sample output layer by layer.

        For two input pairs and two target leaf ids, samples with and
        without hierarchy and verifies that, within each layer's chunk of
        the result, the first row is the positive (label 1, the ancestor
        on the target leaf's travel path) and the rest are negatives
        (label 0, drawn from the same layer but not the ancestor).
        NOTE(review): the 8-space body indentation is original.
        """
        path = download(
                "https://paddlerec.bj.bcebos.com/tree-based/data/demo_tree.pb",
                "tree_index_unittest", "cadec20089f5a8a44d320e117d9f9f1a")
        tree = TreeIndex("demo", path)

        # Node ids of every layer, root layer first.
        layer_nodes = []
        for i in range(tree.height()):
            layer_codes = tree.get_layer_codes(i)
            layer_nodes.append(
                [node.id() for node in tree.get_nodes(layer_codes)])

        # Per-layer negative-sample counts 1..sample_layers; padding with 1s
        # is a no-op here since sample_num is longer than sample_layers.
        sample_num = range(1, 10000)
        start_sample_layer = 1
        seed = 0
        sample_layers = tree.height() - start_sample_layer
        sample_num = sample_num[:sample_layers]
        layer_sample_counts = list(sample_num) + [1] * (sample_layers -
                                                        len(sample_num))
        # +len(...) accounts for the one positive sample per layer.
        total_sample_num = sum(layer_sample_counts) + len(layer_sample_counts)
        tree.init_layerwise_sampler(sample_num, start_sample_layer, seed)

        # Travel path (ancestor ids per layer, leaf first) for each id.
        ids = [315757, 838060, 1251533, 403522, 2473624, 3321007]
        parent_path = {}
        for i in range(len(ids)):
            tmp = tree.get_travel_codes(ids[i], start_sample_layer)
            parent_path[ids[i]] = [node.id() for node in tree.get_nodes(tmp)]

        # check sample res with_hierarchy = False
        sample_res = tree.layerwise_sample(
            [[315757, 838060], [1251533, 403522]], [2473624, 3321007], False)
        idx = 0
        layer = tree.height() - 1
        for i in range(len(layer_sample_counts)):
            # Negative indexing walks the counts from the deepest layer up.
            for j in range(layer_sample_counts[0 - (i + 1)] + 1):
                assert sample_res[idx + j][0] == 315757
                assert sample_res[idx + j][1] == 838060
                assert sample_res[idx + j][2] in layer_nodes[layer]
                if j == 0:
                    assert sample_res[idx + j][3] == 1
                    assert sample_res[idx + j][2] == parent_path[2473624][i]
                else:
                    assert sample_res[idx + j][3] == 0
                    assert sample_res[idx + j][2] != parent_path[2473624][i]
            idx += layer_sample_counts[0 - (i + 1)] + 1
            layer -= 1
        assert idx == total_sample_num
        layer = tree.height() - 1
        for i in range(len(layer_sample_counts)):
            for j in range(layer_sample_counts[0 - (i + 1)] + 1):
                assert sample_res[idx + j][0] == 1251533
                assert sample_res[idx + j][1] == 403522
                assert sample_res[idx + j][2] in layer_nodes[layer]
                if j == 0:
                    assert sample_res[idx + j][3] == 1
                    assert sample_res[idx + j][2] == parent_path[3321007][i]
                else:
                    assert sample_res[idx + j][3] == 0
                    assert sample_res[idx + j][2] != parent_path[3321007][i]
            idx += layer_sample_counts[0 - (i + 1)] + 1
            layer -= 1
        assert idx == total_sample_num * 2

        # check sample res with_hierarchy = True
        sample_res_with_hierarchy = tree.layerwise_sample(
            [[315757, 838060], [1251533, 403522]], [2473624, 3321007], True)
        idx = 0
        layer = tree.height() - 1
        for i in range(len(layer_sample_counts)):
            for j in range(layer_sample_counts[0 - (i + 1)] + 1):
                assert sample_res_with_hierarchy[idx + j][0] == parent_path[315757][i]
                assert sample_res_with_hierarchy[idx + j][1] == parent_path[838060][i]
                assert sample_res_with_hierarchy[idx + j][2] in layer_nodes[layer]
                if j == 0:
                    assert sample_res_with_hierarchy[idx + j][3] == 1
                    assert sample_res_with_hierarchy[idx + j][2] == parent_path[2473624][i]
                else:
                    assert (sample_res_with_hierarchy[idx + j][3] == 0)
                    assert (sample_res_with_hierarchy[idx + j][2] != parent_path[2473624][i])

            idx += layer_sample_counts[0 - (i + 1)] + 1
            layer -= 1
        assert (idx == total_sample_num)
        layer = tree.height() - 1
        for i in range(len(layer_sample_counts)):
            for j in range(layer_sample_counts[0 - (i + 1)] + 1):
                assert (sample_res_with_hierarchy[idx + j][0] == parent_path[1251533][i])
                assert (sample_res_with_hierarchy[idx + j][1] == parent_path[403522][i])
                assert (sample_res_with_hierarchy[idx + j][2] in layer_nodes[layer])
                if j == 0:
                    assert (sample_res_with_hierarchy[idx + j][3] == 1)
                    assert (sample_res_with_hierarchy[idx + j][2] ==
                                    parent_path[3321007][i])
                else:
                    assert (sample_res_with_hierarchy[idx + j][3] == 0)
                    assert (sample_res_with_hierarchy[idx + j][2] != parent_path[3321007][i])

            idx += layer_sample_counts[0 - (i + 1)] + 1
            layer -= 1
        assert (idx == 2 * total_sample_num)
Пример #15
0
def train():
    """
    Build a reader over the 'trainval' split (2913 images, HWC order).
    """
    archive = download(VOC_URL, CACHE_DIR, VOC_MD5)
    return reader_creator(archive, 'trainval')
Пример #16
0
    def test_tree_index(self):
        """Exercise the TreeIndex API on the mini tree: size/shape queries,
        per-layer enumeration, and travel/ancestor/children lookups."""
        path = download(
            "https://paddlerec.bj.bcebos.com/tree-based/data/mini_tree.pb",
            "tree_index_unittest", "e2ba4561c2e9432b532df40546390efa")
        '''
        path = download(
            "https://paddlerec.bj.bcebos.com/tree-based/data/mini_tree.pb",
            "tree_index_unittest", "cadec20089f5a8a44d320e117d9f9f1a")
        '''
        tree = TreeIndex("demo", path)
        height = tree.height()
        branch = tree.branch()
        self.assertTrue(height == 5)
        self.assertTrue(branch == 2)
        self.assertEqual(tree.total_node_nums(), 25)
        self.assertEqual(tree.emb_size(), 30)

        # get_layer_codes: node codes and ids per layer, root first.
        layer_node_ids = []
        layer_node_codes = []
        for i in range(tree.height()):
            layer_node_codes.append(tree.get_layer_codes(i))
            layer_node_ids.append(
                [node.id() for node in tree.get_nodes(layer_node_codes[-1])])

        all_leaf_ids = [node.id() for node in tree.get_all_leafs()]
        self.assertEqual(sum(all_leaf_ids), sum(layer_node_ids[-1]))

        # get_travel
        travel_codes = tree.get_travel_codes(all_leaf_ids[0])
        travel_ids = [node.id() for node in tree.get_nodes(travel_codes)]

        for i in range(height):
            self.assertIn(travel_ids[i], layer_node_ids[height - 1 - i])
            self.assertIn(travel_codes[i], layer_node_codes[height - 1 - i])

        # get_ancestor
        ancestor_codes = tree.get_ancestor_codes([all_leaf_ids[0]], height - 2)
        ancestor_ids = [node.id() for node in tree.get_nodes(ancestor_codes)]

        self.assertEqual(ancestor_ids[0], travel_ids[1])
        self.assertEqual(ancestor_codes[0], travel_codes[1])

        # get_pi_relation
        pi_relation = tree.get_pi_relation([all_leaf_ids[0]], height - 2)
        self.assertEqual(pi_relation[all_leaf_ids[0]], ancestor_codes[0])

        # get_travel_path
        travel_path_codes = tree.get_travel_path(travel_codes[0],
                                                 travel_codes[-1])
        travel_path_ids = [
            node.id() for node in tree.get_nodes(travel_path_codes)
        ]

        # assertEquals is a deprecated alias (removed in Python 3.12);
        # use assertEqual.
        self.assertEqual(travel_path_ids + [travel_ids[-1]], travel_ids)
        self.assertEqual(travel_path_codes + [travel_codes[-1]], travel_codes)

        # get_children
        children_codes = tree.get_children_codes(travel_codes[1], height - 1)
        children_ids = [node.id() for node in tree.get_nodes(children_codes)]
        self.assertIn(all_leaf_ids[0], children_ids)
Пример #17
0
def test():
    """
    Build a reader over the 'train' split (1464 images, HWC order).
    """
    archive = download(VOC_URL, CACHE_DIR, VOC_MD5)
    return reader_creator(archive, 'train')
Пример #18
0
def val():
    """
    Build a reader over the 'val' split (1449 images, HWC order).
    """
    archive = download(VOC_URL, CACHE_DIR, VOC_MD5)
    return reader_creator(archive, 'val')
Пример #19
0
def val():
    """
    Build a reader over the 'val' split (1449 images, HWC order).
    """
    archive = download(VOC_URL, CACHE_DIR, VOC_MD5)
    return reader_creator(archive, 'val')
Пример #20
0
def test():
    """
    Build a reader over the 'train' split (1464 images, HWC order).
    """
    archive = download(VOC_URL, CACHE_DIR, VOC_MD5)
    return reader_creator(archive, 'train')