def test_time(tmpdir):
    global DATA_PATH
    cl_dataset = CIFAR100(data_path=DATA_PATH, download=False, train=True,
                          labels_type="category", task_labels="lifelong")
    # in practice the construction is done part by part to reduce the data load,
    # but here we do it at once
    x, y, t = cl_dataset.get_data()

    h5_filename = os.path.join(tmpdir, "test_time_h5.hdf5")
    h5dataset = H5Dataset(x, y, t, data_path=h5_filename)
    task_set = H5TaskSet(h5_filename,
                         y=h5dataset.get_class_vector(),
                         t=h5dataset.get_task_indexes(),
                         trsf=None)

    start = time.time()
    for i in range(10000):
        a = task_set[5]
    end = time.time()
    print(f"normal __getitem__ {end - start}")

    start = time.time()
    with h5py.File(h5_filename, 'r') as hf:
        for i in range(10000):
            x = hf['x'][5]
            y = hf['y'][5]
            if 't' in hf.keys():
                t = hf['t'][5]
            else:
                t = -1
    end = time.time()
    print(f"open only once __getitem__ {end - start}")
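# Illustrative sketch (not part of the tests): the "part by part" construction
# mentioned above would build the H5Dataset from a first chunk and append the
# remaining chunks with `add_data`, so the full array never sits in memory at
# once. The chunking below is a hypothetical example.
#
#   chunk_ids = np.array_split(np.arange(len(y)), 10)
#   h5dataset = H5Dataset(x[chunk_ids[0]], y[chunk_ids[0]], t[chunk_ids[0]],
#                         data_path=h5_filename)
#   for ids in chunk_ids[1:]:
#       h5dataset.add_data(x[ids], y[ids], t[ids])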
def test_slice_h5(
        tmpdir, dataset, keep_classes, discard_classes, keep_tasks, discard_tasks, error, ids
):
    dataset = H5Dataset(*dataset, data_path=os.path.join(tmpdir, "test.h5"))

    if error:
        with pytest.raises(Exception):
            sliced_dataset = dataset.slice(
                os.path.join(tmpdir, "test_bis.h5"),
                keep_classes, discard_classes,
                keep_tasks, discard_tasks
            )
        return
    else:
        sliced_dataset = dataset.slice(
            os.path.join(tmpdir, "test_bis.h5"),
            keep_classes, discard_classes,
            keep_tasks, discard_tasks
        )

    h5_path, _, _ = sliced_dataset.get_data()
    assert h5_path == os.path.join(tmpdir, "test_bis.h5")

    with h5py.File(h5_path, 'r') as hf:
        x = hf['x'][:]

    assert (np.unique(x) == np.array(ids)).all(), (np.unique(x), ids)
def test_concatenate_h5dataset(data, tmpdir):
    filename_h5 = os.path.join(tmpdir, "test_h5.hdf5")

    x_, y_, t_ = data
    h5dataset = H5Dataset(x_, y_, t_, data_path=filename_h5)
    h5dataset.add_data(x_, y_, t_)

    assert len(h5dataset.get_class_vector()) == 2 * len(y_)
def test_h5dataset_to_taskset(data, tmpdir):
    filename_h5 = os.path.join(tmpdir, "test_h5.hdf5")

    x_, y_, t_ = data
    h5dataset = H5Dataset(x_, y_, t_, data_path=filename_h5)
    task_set = h5dataset.to_taskset()

    loader = DataLoader(task_set)
    for _ in loader:
        pass
def test_h5dataset_add_data(data, tmpdir):
    filename_h5 = os.path.join(tmpdir, "test_h5.hdf5")

    x_, y_, t_ = data
    h5dataset = H5Dataset(x_, y_, t_, data_path=filename_h5)
    h5dataset.add_data(x_, y_, t_)

    nb_task = len(np.unique(t_))
    scenario = ContinualScenario(h5dataset)

    assert scenario.nb_tasks == nb_task
def test_creation_h5dataset(data, tmpdir):
    filename_h5 = os.path.join(tmpdir, "test_h5.hdf5")

    x_, y_, t_ = data
    h5dataset = H5Dataset(x_, y_, t_, data_path=filename_h5)
    x_0, y_0, t_0 = h5dataset.get_data()

    assert isinstance(x_0, str)  # x is only the path to the file
    assert len(y_0) == len(y_)
    assert len(t_0) == len(t_)
def test_h5dataset_reloading(data, tmpdir):
    filename_h5 = os.path.join(tmpdir, "test_h5.hdf5")

    x_, y_, t_ = data
    # create dataset
    h5dataset = H5Dataset(x_, y_, t_, data_path=filename_h5)
    # destroy object
    del h5dataset

    # reload data set
    h5dataset_reloaded = H5Dataset(x=None, y=None, t=None, data_path=filename_h5)

    nb_task = len(np.unique(t_))
    scenario = ContinualScenario(h5dataset_reloaded)

    for task_set in scenario:
        loader = DataLoader(task_set)
        for _ in loader:
            pass

    assert scenario.nb_tasks == nb_task
def test_h5dataset_reloading_slow(tmpdir):
    filename_h5 = os.path.join(tmpdir, "test_h5.hdf5")
    nb_tasks = 5

    cl_dataset = CIFAR100(data_path=DATA_PATH, download=False, train=True,
                          labels_type="category", task_labels="lifelong")
    x, y, t = cl_dataset.get_data()

    # create dataset
    h5dataset = H5Dataset(x, y, t, data_path=filename_h5)
    # destroy object
    del h5dataset

    # reload data set
    h5dataset_reloaded = H5Dataset(x=None, y=None, t=None, data_path=filename_h5)
    scenario = ContinualScenario(h5dataset_reloaded)

    for task_set in scenario:
        loader = DataLoader(task_set)
        for _ in loader:
            pass

    assert scenario.nb_tasks == nb_tasks

    task_order = np.arange(nb_tasks)
    sub_scenario = create_subscenario(scenario, task_order[:-1])
    assert sub_scenario.nb_tasks == nb_tasks - 1

    np.random.shuffle(task_order)
    sub_scenario = create_subscenario(scenario, task_order)
    assert sub_scenario.nb_tasks == nb_tasks
def test_h5dataset_ContinualScenario(data, tmpdir):
    filename_h5 = os.path.join(tmpdir, "test_h5.hdf5")

    x_, y_, t_ = data
    h5dataset = H5Dataset(x_, y_, t_, data_path=filename_h5)

    nb_task = len(np.unique(t_))
    scenario = ContinualScenario(h5dataset)

    assert scenario.nb_tasks == nb_task

    data_indexes = np.where(t_ == 0)[0]
    assert len(data_indexes) == len(scenario[0])
def test_h5dataset_loading(data, tmpdir):
    filename_h5 = os.path.join(tmpdir, "test_h5.hdf5")

    x_, y_, t_ = data
    h5dataset = H5Dataset(x_, y_, t_, data_path=filename_h5)

    nb_task = len(np.unique(t_))
    scenario = ContinualScenario(h5dataset)

    for task_set in scenario:
        loader = DataLoader(task_set)
        for _ in loader:
            pass

    assert scenario.nb_tasks == nb_task
def test_h5dataset_get_raw(data, tmpdir):
    filename_h5 = os.path.join(tmpdir, "test_h5.hdf5")

    x_, y_, t_ = data
    h5dataset = H5Dataset(x_, y_, t_, data_path=filename_h5)

    nb_task = len(np.unique(t_))
    scenario = ContinualScenario(h5dataset)

    for task_set in scenario:
        indexes = np.random.randint(len(task_set), size=len(task_set) // 2)
        # ndarray.sort() sorts in place and returns None, so sort first and
        # pass the sorted array explicitly
        indexes.sort()
        _, _, _ = task_set.get_raw_samples(indexes)

        # test with no indexes
        _, _, _ = task_set.get_raw_samples()

    assert scenario.nb_tasks == nb_task
def test_create_subscenario_h5dataset(data, tmpdir):
    from continuum.scenarios import create_subscenario

    filename_h5 = os.path.join(tmpdir, "test_h5.hdf5")

    x_, y_, t_ = data
    h5dataset = H5Dataset(x_, y_, t_, data_path=filename_h5)

    nb_task = len(np.unique(t_))
    scenario = ContinualScenario(h5dataset)
    sub_scenario = create_subscenario(scenario, np.arange(nb_task - 1))

    for task_set in sub_scenario:
        loader = DataLoader(task_set)
        for _ in loader:
            pass

    assert sub_scenario.nb_tasks == nb_task - 1
def test_h5dataset_IncrementalScenario(data, tmpdir):
    filename_h5 = os.path.join(tmpdir, "test_h5.hdf5")

    x_, y_, t_ = data
    nb_task = 2

    h5dataset = H5Dataset(x_, y_, None, data_path=filename_h5)

    scenario = ClassIncremental(h5dataset, nb_tasks=nb_task)
    assert scenario.nb_tasks == nb_task

    tot_len = 0
    for task_set in scenario:
        tot_len += len(task_set)
        loader = DataLoader(task_set)
        for _ in loader:
            pass

    assert tot_len == len(y_)
def test_h5dataset_split_train_test(data, tmpdir):
    filename_h5 = os.path.join(tmpdir, "test_h5.hdf5")

    x_, y_, t_ = data
    h5dataset = H5Dataset(x_, y_, t_, data_path=filename_h5)

    nb_task = len(np.unique(t_))
    scenario = ContinualScenario(h5dataset)

    for task_set in scenario:
        task_set_tr, task_set_val = split_train_val(task_set)

        loader_tr = DataLoader(task_set_tr)
        for _ in loader_tr:
            pass

        loader_val = DataLoader(task_set_val)
        for _ in loader_val:
            pass

    assert scenario.nb_tasks == nb_task
def encode_into_dataset(model, scenario, batch_size, filename, inference_fct=None):
    """Encode a scenario into an h5 dataset so that the same scenario can be
    reproduced later with pre-computed features.

    :param model: model used to encode the data.
    :param scenario: scenario to encode.
    :param batch_size: batch size used to load the data.
    :param filename: filename of the h5 dataset.
    :param inference_fct: optional function allowing a more sophisticated way to extract features.
    """
    training_mode = model.training

    if inference_fct is None:
        inference_fct = (lambda model, x: model.to(torch.device('cuda:0'))(x.to(torch.device('cuda:0'))))

    # we extract features in eval mode
    model.eval()

    encoded_dataset = None
    for task_id, taskset in enumerate(scenario):
        # we need to load the data through a loader to apply the transformations, if any
        loader = DataLoader(taskset, shuffle=False, batch_size=batch_size)
        for i, (x, y, t) in enumerate(loader):
            features = inference_fct(model, x)

            if t is None:
                t = (torch.ones(len(y)) * task_id).long()

            if task_id == 0 and i == 0:
                encoded_dataset = H5Dataset(features.cpu().numpy(), y, t, data_path=filename)
            else:
                encoded_dataset.add_data(features.cpu().numpy(), y, t)

    model.train(training_mode)
    return encoded_dataset
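# Minimal usage sketch for `encode_into_dataset` (illustrative only; the feature
# extractor and file name below are hypothetical, and `inference_fct` is overridden
# so the sketch does not assume a GPU):
#
#   model = torchvision.models.resnet18(pretrained=True)
#   encoded = encode_into_dataset(model, scenario, batch_size=64,
#                                 filename="encoded_features.hdf5",
#                                 inference_fct=lambda m, x: m(x))
#   encoded_scenario = ContinualScenario(encoded)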
def test_on_array_dataset(tmpdir):
    filename_h5 = os.path.join(tmpdir, "test_CIFAR100_h5.hdf5")

    cl_dataset = CIFAR100(data_path=DATA_PATH, download=False, train=True,
                          labels_type="category", task_labels="lifelong")
    # in practice the construction is done part by part to reduce the data load,
    # but here we do it at once
    x, y, t = cl_dataset.get_data()
    h5dataset = H5Dataset(x, y, t, data_path=filename_h5)

    scenario = ContinualScenario(h5dataset)

    for task_set in scenario:
        loader = DataLoader(task_set, batch_size=64)
        for x, y, t in loader:
            assert x.shape == torch.Size([64, 3, 32, 32])
            break

    assert scenario.nb_tasks == 5  # number of tasks in the CIFAR100 lifelong setting
def test_on_array_dataset_incremental(tmpdir):
    filename_h5 = os.path.join(tmpdir, "test_CIFAR100_h5.hdf5")
    nb_tasks = 10

    cl_dataset = CIFAR100(data_path=DATA_PATH, download=False, train=True)
    # in practice the construction is done part by part to reduce the data load,
    # but here we do it at once
    x, y, t = cl_dataset.get_data()
    h5dataset = H5Dataset(x, y, t, data_path=filename_h5)

    scenario = ClassIncremental(h5dataset, nb_tasks=nb_tasks)

    for task_set in scenario:
        loader = DataLoader(task_set, batch_size=64)
        for x, y, t in loader:
            assert x.shape == torch.Size([64, 3, 32, 32])
            break

    assert scenario.nb_tasks == nb_tasks
def test_create_subscenario_shuffle_h5dataset(data, tmpdir):
    filename_h5 = os.path.join(tmpdir, "test_h5.hdf5")

    x_, y_, t_ = data
    h5dataset = H5Dataset(x_, y_, t_, data_path=filename_h5)

    nb_task = len(np.unique(t_))
    scenario = ContinualScenario(h5dataset)

    task_order = np.arange(nb_task)
    np.random.shuffle(task_order)
    sub_scenario = create_subscenario(scenario, task_order)

    for task_set in sub_scenario:
        loader = DataLoader(task_set)
        for _ in loader:
            pass

    assert sub_scenario.nb_tasks == nb_task