Example 1
def test_dataset_serialization_deserialization_custom_basepath():
    params = get_hyperparameter_list()
    h = hp.HyperParameterList(params)

    dataset = data.Dataset(h, basedir='custom')

    samples = [(h.sample(), np.random.uniform()) for _ in range(5)]
    for sample in samples:
        dataset.add_sample(*sample)

    # serialization
    dataset.save_dataset()

    assert len(dataset) == 5
    assert os.path.exists(dataset.data_path)
    assert os.path.exists(dataset.parameter_path)

    # deserialization
    dataset.clear()
    assert len(dataset) == 0

    dataset.restore_dataset()

    assert len(dataset) == 5
    assert os.path.exists(dataset.data_path)
    assert os.path.exists(dataset.parameter_path)

    # deserialization from class
    path = os.path.join('custom', 'datasets')
    dataset2 = data.Dataset.load_from_directory(path)

    assert dataset2.parameters is not None
    assert len(dataset2.X) == 5
    assert len(dataset2.Y) == 5
    assert len(dataset2) == 5

    dataset3 = data.Dataset.load_from_directory('custom')

    assert dataset3.parameters is not None
    assert len(dataset3.X) == 5
    assert len(dataset3.Y) == 5

    # serialization of an empty dataset
    dataset = data.Dataset(basedir='custom')

    with pytest.raises(FileNotFoundError):
        dataset.load_from_directory('null')

    with pytest.raises(ValueError):
        dataset.save_dataset()
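
Note: the paths exercised above imply that Dataset persists its files under <basedir>/datasets/, which is why load_from_directory accepts either the base directory itself ('custom') or the nested datasets path.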
Example 2
def test_multi_set_dataset():
    params = get_multi_parameter_list()
    h = hp.HyperParameterList(params)

    dataset = data.Dataset(h)
    # numpy arrays
    samples = [(np.array(h.sample()), np.random.uniform()) for _ in range(5)]

    x, y = zip(*samples)
    x = np.array(x)
    y = np.array(y)
    dataset.set_dataset(x, y)
    assert len(dataset) == 5

    dataset.clear()

    # python arrays
    samples = [(h.sample(), float(np.random.uniform())) for _ in range(5)]

    x, y = zip(*samples)
    dataset.set_dataset(x, y)
    assert len(dataset) == 5

    # None data
    with pytest.raises(TypeError):
        dataset.set_dataset(None, int(6))

    with pytest.raises(TypeError):
        dataset.set_dataset([1, 2, 3], None)

    with pytest.raises(TypeError):
        dataset.set_dataset(None, None)
Example 3
def test_dataset_multi_get_best_parameters():
    params = get_multi_parameter_list()
    h = hp.HyperParameterList(params)

    dataset = data.Dataset(h)

    with pytest.raises(ValueError):
        dataset.get_best_parameters(None)

    # Test with empty dataset
    assert dataset.get_best_parameters() is None

    samples = [(h.sample(), np.random.uniform()) for _ in range(5)]

    for sample in samples:
        dataset.add_sample(*sample)

    objective_values = [v for _, v in samples]
    min_index = np.argmin(objective_values)
    max_index = np.argmax(objective_values)

    max_hp = data.flatten_parameters(
        dataset.get_best_parameters(objective='max'))
    min_hp = data.flatten_parameters(
        dataset.get_best_parameters(objective='min'))

    assert max_hp == samples[max_index][0]
    assert min_hp == samples[min_index][0]
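
Note: judging by the final assertions, get_best_parameters returns the parameters in a structured form, and data.flatten_parameters converts that result back into a flat list that compares equal to the raw output of h.sample().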
Example 4
def test_dataset_basedir_custom():
    params = get_hyperparameter_list()
    h = hp.HyperParameterList(params)

    dataset = data.Dataset(h, basedir='custom')
    assert os.path.exists(dataset.basedir)
    assert not os.path.exists('shac')
Example 5
def test_evaluate_train_evaluate():
    params = get_hyperparameter_list()
    h = hp.HyperParameterList(params)

    dataset = data.Dataset(h)

    # models
    clfs = []

    # fit samples
    num_samples = 16
    for i in range(3):
        samples = [h.sample() for _ in range(num_samples)]
        labels = [np.sum(sample) for sample in samples]
        x, y = samples, labels
        x, y = dataset.encode_dataset(x, y)
        model = xgb_utils.train_single_model(x, y)
        clfs.append(model)

    # test samples
    num_samples = 100
    samples = [h.sample() for _ in range(num_samples)]
    ex2, _ = dataset.encode_dataset(samples, None)

    preds = xgb_utils.evaluate_models(ex2, clfs)
    count = np.sum(preds)

    print(count)
    assert preds.shape == (num_samples,)
    assert count > 0
Example 6
def test_evaluate_single_sample():
    params = get_hyperparameter_list()
    h = hp.HyperParameterList(params)

    dataset = data.Dataset(h)

    # models
    clfs = []

    # fit samples
    num_samples = 16
    for i in range(3):
        samples = [h.sample() for _ in range(num_samples)]
        labels = [np.sum(sample) for sample in samples]
        x, y = samples, labels
        x, y = dataset.encode_dataset(x, y)
        model = xgb_utils.train_single_model(x, y)
        clfs.append(model)

    # single sample test
    sample = h.sample()
    ex2, _ = dataset.encode_dataset([sample])

    assert ex2.shape == (1, 3)

    pred = xgb_utils.evaluate_models(ex2, clfs)
    assert pred.shape == (1,)
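
Note: the (1, 3) shape asserted above indicates the encoded feature width equals the number of hyperparameters, so get_hyperparameter_list() evidently yields three parameters here; compare the multi-parameter examples, which encode to a width of 14.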
Example 7
def test_serialization_deserialization():
    basepath = 'shac'

    params = get_hyperparameter_list()
    h = hp.HyperParameterList(params)

    dataset = data.Dataset(h)

    # models
    clfs = []

    # fit samples
    num_samples = 16
    for i in range(3):
        samples = [h.sample() for _ in range(num_samples)]
        labels = [np.sum(sample) for sample in samples]
        x, y = samples, labels
        x, y = dataset.encode_dataset(x, y)
        model = xgb_utils.train_single_model(x, y)
        clfs.append(model)

    xgb_utils.save_classifiers(clfs, basepath)
    assert os.path.exists(os.path.join(basepath, 'classifiers', 'classifiers.pkl'))

    models = xgb_utils.restore_classifiers(basepath)
    assert len(models) == len(clfs)

    with pytest.raises(FileNotFoundError):
        models = xgb_utils.restore_classifiers('none')
Example 8
def test_dataset_parameters():
    params = get_hyperparameter_list()
    h = hp.HyperParameterList(params)

    dataset = data.Dataset(h)
    assert len(params) == len(dataset.parameters)

    dataset.parameters = params
    assert len(params) == len(dataset.parameters)
Example 9
    def __init__(self,
                 hyperparameter_list,
                 total_budget,
                 num_batches,
                 objective='max',
                 max_classifiers=18):
        if total_budget % num_batches != 0:
            raise ValueError(
                "Total budget must be divisible by the number of batches!")

        if hyperparameter_list is not None and (not isinstance(
                hyperparameter_list, hp.HyperParameterList)):
            hyperparameter_list = hp.HyperParameterList(hyperparameter_list)

        print("Number of workers possible : %d" %
              (total_budget // num_batches))

        self.parameters = hyperparameter_list
        self.objective = objective
        self._total_budget = total_budget  # N
        self.num_batches = num_batches  # M

        self._max_classifiers = max_classifiers
        self._num_workers = self.total_budget // num_batches  # W
        self._total_classifiers = min(max(num_batches - 1, 1),
                                      max_classifiers)  # K

        # serializable
        self.dataset = data.Dataset(hyperparameter_list)
        self.classifiers = []  # type: List[xgb.XGBClassifier]

        # training variables
        self._dataset_index = 0
        self._per_classifier_budget = int(
            self.num_workers *
            np.floor(total_budget /
                     (float(self.num_workers *
                            (self.total_classifiers + 1)))))  # Tc

        print(
            "Using %d parallel workers, it will require %d epochs to fit %d classifiers.\n"
            "Each classifier will be provided %d samples to train per epoch." %
            (
                self.num_workers,
                total_budget // self.num_workers,
                self._total_classifiers,
                self._per_classifier_budget,
            ))

        # Compute how many threads and processes will be used
        self._compute_parallelism()

        # serialization paths
        self._prepare_dirs()
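
To make the budget arithmetic above concrete, here is a minimal standalone sketch (editor's addition, using hypothetical values total_budget=100 and num_batches=5) that reproduces only the derived quantities W, K and Tc, not the engine itself:

import numpy as np

# Hypothetical budget settings, chosen only for illustration.
total_budget = 100    # N: total number of evaluations
num_batches = 5       # M: number of batches
max_classifiers = 18

# W: evaluations that can run in parallel per batch.
num_workers = total_budget // num_batches  # -> 20

# K: one classifier per batch after the first, capped at max_classifiers.
total_classifiers = min(max(num_batches - 1, 1), max_classifiers)  # -> 4

# Tc: samples each classifier receives for training per epoch.
per_classifier_budget = int(
    num_workers * np.floor(
        total_budget / float(num_workers * (total_classifiers + 1))))  # -> 20

print(num_workers, total_classifiers, per_classifier_budget)  # 20 4 20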
Example 10
def test_dataset_multi_param_list():
    params = get_multi_parameter_list()

    dataset = data.Dataset(params)
    assert isinstance(dataset._parameters, hp.HyperParameterList)

    dataset.set_parameters(params)
    assert isinstance(dataset._parameters, hp.HyperParameterList)

    h = hp.HyperParameterList(params)
    dataset.set_parameters(h)
    assert isinstance(dataset._parameters, hp.HyperParameterList)
Example 11
def test_dataset_multi_add_sample():
    params = get_multi_parameter_list()
    h = hp.HyperParameterList(params)

    dataset = data.Dataset(h)

    samples = [(h.sample(), np.random.uniform()) for _ in range(5)]
    for sample in samples:
        dataset.add_sample(*sample)

    x, y = dataset.get_dataset()
    assert len(dataset) == 5
    assert x.shape == (5, 14)
    assert y.shape == (5, )
Example 12
def test_dataset_single_multi_encoding_decoding_min():
    params = get_multi_parameter_list()
    h = hp.HyperParameterList(params)

    dataset = data.Dataset(h)

    sample = (h.sample(), np.random.uniform())
    dataset.add_sample(*sample)

    encoded_x, encoded_y = dataset.encode_dataset(objective='min')
    y_values = [0.]

    assert encoded_x.shape == (1, 14)
    assert encoded_x.dtype == np.float64
    assert encoded_y.shape == (1, )
    assert encoded_y.dtype == np.float64
    assert np.allclose(y_values, encoded_y, rtol=1e-3)

    decoded_x = dataset.decode_dataset(encoded_x)
    assert decoded_x.shape == (1, 14)
Example 13
def test_dataset_multi_encoding_decoding():
    params = get_multi_parameter_list()
    h = hp.HyperParameterList(params, seed=0)

    dataset = data.Dataset(h)

    samples = [(h.sample(), np.random.uniform()) for _ in range(5)]
    for sample in samples:
        dataset.add_sample(*sample)

    encoded_x, encoded_y = dataset.encode_dataset(objective='min')
    y_values = [0., 0., 0., 1., 1.]

    assert encoded_x.shape == (5, 14)
    assert encoded_x.dtype == np.float64
    assert encoded_y.shape == (5, )
    assert encoded_y.dtype == np.float64
    assert np.allclose(y_values, encoded_y, rtol=1e-3)

    decoded_x = dataset.decode_dataset(encoded_x)
    decoded_x2 = dataset.decode_dataset()
    assert decoded_x.shape == (5, 14)
    assert len(decoded_x) == len(decoded_x2)

    x, y = dataset.get_dataset()
    x_ = x[:, :10].astype('float')
    decoded_x_ = decoded_x[:, :10].astype('float')
    assert np.allclose(x_, decoded_x_, rtol=1e-3)

    samples2 = [(h.sample(), np.random.uniform()) for _ in range(5)]
    x, y = zip(*samples2)

    encoded_x, encoded_y = dataset.encode_dataset(x, y, objective='min')
    y_values = [0., 1., 0., 0., 1.]

    assert encoded_x.shape == (5, 14)
    assert encoded_x.dtype == np.float64
    assert encoded_y.shape == (5, )
    assert encoded_y.dtype == np.float64
    assert np.allclose(y_values, encoded_y, rtol=1e-3)
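
The asserted y_values suggest that encode_dataset binarizes the objective values around their median: with objective='min', samples strictly below the median appear to receive label 1 (two of the five values) and the rest 0. A minimal sketch of that reading (editor's assumption, not the library's actual code), using made-up objective values:

import numpy as np

# Made-up objective values; not drawn from the test's random stream.
y = np.array([0.82, 0.74, 0.31, 0.95, 0.15])

# Samples strictly better (smaller) than the median get label 1.
labels = (y < np.median(y)).astype(np.float64)
print(labels)  # [0. 0. 1. 0. 1.] -- two 1s out of five, as in the test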
Example 14
def test_evaluate_train_evaluate_failure():
    params = [hp.DiscreteHyperParameter('h%d' % i, [0]) for i in range(3)]
    h = hp.HyperParameterList(params)

    dataset = data.Dataset(h)

    # models
    clfs = []

    # fit samples
    num_samples = 16
    for i in range(3):
        samples = [h.sample() for _ in range(num_samples)]
        labels = [np.sum(sample) for sample in samples]
        x, y = samples, labels
        x, y = dataset.encode_dataset(x, y)
        model = xgb_utils.train_single_model(x, y)
        clfs.append(model)

    # test samples
    for model in clfs:
        assert model is None
Example 15
def test_dataset_basedir():
    params = get_hyperparameter_list()
    h = hp.HyperParameterList(params)

    dataset = data.Dataset(h)
    assert os.path.exists(dataset.basedir)
Example 16
def test_dataset_serialization_deserialization_custom_param():
    class MockDiscreteHyperParameter(hp.DiscreteHyperParameter):
        def __init__(self, name, values, seed=None):
            super(MockDiscreteHyperParameter,
                  self).__init__(name, values, seed)

    # register the new hyper parameters
    hp.set_custom_parameter_class(MockDiscreteHyperParameter)

    params = get_hyperparameter_list()
    params.append(MockDiscreteHyperParameter('mock-param', ['x', 'y']))

    h = hp.HyperParameterList(params, seed=0)

    dataset = data.Dataset(h)

    samples = [(h.sample(), np.random.uniform()) for _ in range(5)]
    for sample in samples:
        dataset.add_sample(*sample)

    # serialization
    dataset.save_dataset()

    assert len(dataset) == 5
    assert os.path.exists(dataset.data_path)
    assert os.path.exists(dataset.parameter_path)

    # deserialization
    dataset.clear()
    assert len(dataset) == 0

    dataset.restore_dataset()

    assert len(dataset) == 5
    assert os.path.exists(dataset.data_path)
    assert os.path.exists(dataset.parameter_path)

    # deserialization from class
    path = os.path.join('shac', 'datasets')
    dataset2 = data.Dataset.load_from_directory(path)

    assert dataset2.parameters is not None
    assert len(dataset2.X) == 5
    assert len(dataset2.Y) == 5
    assert len(dataset2) == 5

    assert 'mock-param' in dataset2.parameters.name_map.values()
    assert dataset2.parameters.num_choices == 5

    dataset3 = data.Dataset.load_from_directory()

    assert dataset3.parameters is not None
    assert len(dataset3.X) == 5
    assert len(dataset3.Y) == 5

    assert 'mock-param' in dataset3.parameters.name_map.values()
    assert dataset3.parameters.num_choices == 5

    # serialization of an empty dataset
    dataset = data.Dataset()

    with pytest.raises(FileNotFoundError):
        dataset.load_from_directory('null')

    with pytest.raises(ValueError):
        dataset.save_dataset()