def test_dataset_serialization_deserialization_custom_basepath():
    """Round-trip a Dataset through disk when a custom basedir is used."""
    param_list = get_hyperparameter_list()
    hp_list = hp.HyperParameterList(param_list)
    dataset = data.Dataset(hp_list, basedir='custom')

    for _ in range(5):
        dataset.add_sample(hp_list.sample(), np.random.uniform())

    # serialization
    dataset.save_dataset()
    assert len(dataset) == 5
    assert os.path.exists(dataset.data_path)
    assert os.path.exists(dataset.parameter_path)

    # deserialization in place
    dataset.clear()
    assert len(dataset) == 0

    dataset.restore_dataset()
    assert len(dataset) == 5
    assert os.path.exists(dataset.data_path)
    assert os.path.exists(dataset.parameter_path)

    # deserialization via the classmethod, pointing at the datasets dir
    dataset_dir = os.path.join('custom', 'datasets')
    restored = data.Dataset.load_from_directory(dataset_dir)
    assert restored.parameters is not None
    assert len(restored.X) == 5
    assert len(restored.Y) == 5
    assert len(restored) == 5

    # the base directory alone is also accepted
    restored_from_base = data.Dataset.load_from_directory('custom')
    assert restored_from_base.parameters is not None
    assert len(restored_from_base.X) == 5
    assert len(restored_from_base.Y) == 5

    # an empty dataset cannot be saved; a missing path cannot be loaded
    empty_dataset = data.Dataset(basedir='custom')
    with pytest.raises(FileNotFoundError):
        empty_dataset.load_from_directory('null')
    with pytest.raises(ValueError):
        empty_dataset.save_dataset()
def test_multi_set_dataset():
    """`set_dataset` accepts numpy arrays and python sequences, rejects None."""
    param_list = get_multi_parameter_list()
    hp_list = hp.HyperParameterList(param_list)
    dataset = data.Dataset(hp_list)

    # numpy-array inputs
    xs, ys = [], []
    for _ in range(5):
        xs.append(np.array(hp_list.sample()))
        ys.append(np.random.uniform())
    dataset.set_dataset(np.array(xs), np.array(ys))
    assert len(dataset) == 5

    dataset.clear()

    # plain python-sequence inputs
    xs, ys = [], []
    for _ in range(5):
        xs.append(hp_list.sample())
        ys.append(float(np.random.uniform()))
    dataset.set_dataset(tuple(xs), tuple(ys))
    assert len(dataset) == 5

    # None on either side must raise a TypeError
    with pytest.raises(TypeError):
        dataset.set_dataset(None, 6)
    with pytest.raises(TypeError):
        dataset.set_dataset([1, 2, 3], None)
    with pytest.raises(TypeError):
        dataset.set_dataset(None, None)
def test_dataset_multi_get_best_parameters():
    """`get_best_parameters` returns the sample matching the extreme objective.

    Fix: the objective-value comprehension previously used `h` as its loop
    variable, rebinding the `HyperParameterList` named `h` above — harmless
    here only by accident. Use `_` so nothing is shadowed.
    """
    params = get_multi_parameter_list()
    h = hp.HyperParameterList(params)
    dataset = data.Dataset(h)

    # objective=None is invalid
    with pytest.raises(ValueError):
        dataset.get_best_parameters(None)

    # an empty dataset has no best parameters
    assert dataset.get_best_parameters() is None

    samples = [(h.sample(), np.random.uniform()) for _ in range(5)]
    for sample in samples:
        dataset.add_sample(*sample)

    # `_` instead of `h`: do not rebind the HyperParameterList above
    objective_values = [v for _, v in samples]
    min_index = np.argmin(objective_values)
    max_index = np.argmax(objective_values)

    max_hp = data.flatten_parameters(
        dataset.get_best_parameters(objective='max'))
    min_hp = data.flatten_parameters(
        dataset.get_best_parameters(objective='min'))

    assert max_hp == samples[max_index][0]
    assert min_hp == samples[min_index][0]
def test_dataset_basedir_custom():
    """A custom basedir is created, and the default 'shac' dir is not."""
    hp_list = hp.HyperParameterList(get_hyperparameter_list())
    dataset = data.Dataset(hp_list, basedir='custom')

    assert os.path.exists(dataset.basedir)
    assert not os.path.exists('shac')
def test_evaluate_train_evaluate():
    """Train several XGB models, then jointly evaluate a fresh batch."""
    hp_list = hp.HyperParameterList(get_hyperparameter_list())
    dataset = data.Dataset(hp_list)

    # fit three independent classifiers on fresh random samples
    clfs = []
    train_count = 16
    for _ in range(3):
        xs = [hp_list.sample() for _ in range(train_count)]
        ys = [np.sum(sample) for sample in xs]
        enc_x, enc_y = dataset.encode_dataset(xs, ys)
        clfs.append(xgb_utils.train_single_model(enc_x, enc_y))

    # evaluate a larger batch of unseen samples
    eval_count = 100
    eval_samples = [hp_list.sample() for _ in range(eval_count)]
    enc_eval, _ = dataset.encode_dataset(eval_samples, None)

    preds = xgb_utils.evaluate_models(enc_eval, clfs)
    positives = np.sum(preds)
    print(positives)

    assert preds.shape == (eval_count,)
    assert positives > 0
def test_evaluate_single_sample():
    """A single encoded sample can be pushed through the model ensemble."""
    hp_list = hp.HyperParameterList(get_hyperparameter_list())
    dataset = data.Dataset(hp_list)

    # fit three independent classifiers on fresh random samples
    clfs = []
    train_count = 16
    for _ in range(3):
        xs = [hp_list.sample() for _ in range(train_count)]
        ys = [np.sum(sample) for sample in xs]
        enc_x, enc_y = dataset.encode_dataset(xs, ys)
        clfs.append(xgb_utils.train_single_model(enc_x, enc_y))

    # encode and evaluate one sample
    single = hp_list.sample()
    enc_single, _ = dataset.encode_dataset([single])
    assert enc_single.shape == (1, 3)

    pred = xgb_utils.evaluate_models(enc_single, clfs)
    assert pred.shape == (1,)
def test_serialization_deserialization():
    """Classifier ensembles can be saved to and restored from a base path."""
    basepath = 'shac'
    hp_list = hp.HyperParameterList(get_hyperparameter_list())
    dataset = data.Dataset(hp_list)

    # fit three independent classifiers on fresh random samples
    clfs = []
    train_count = 16
    for _ in range(3):
        xs = [hp_list.sample() for _ in range(train_count)]
        ys = [np.sum(sample) for sample in xs]
        enc_x, enc_y = dataset.encode_dataset(xs, ys)
        clfs.append(xgb_utils.train_single_model(enc_x, enc_y))

    # save, then confirm the pickle landed where expected
    xgb_utils.save_classifiers(clfs, basepath)
    pickle_path = os.path.join(basepath, 'classifiers', 'classifiers.pkl')
    assert os.path.exists(pickle_path)

    # restore and compare the ensemble size
    restored = xgb_utils.restore_classifiers(basepath)
    assert len(restored) == len(clfs)

    # restoring from a nonexistent path must fail
    with pytest.raises(FileNotFoundError):
        xgb_utils.restore_classifiers('none')
def test_dataset_parameters():
    """The `parameters` property mirrors what was assigned to it."""
    param_list = get_hyperparameter_list()
    dataset = data.Dataset(hp.HyperParameterList(param_list))
    assert len(dataset.parameters) == len(param_list)

    # assigning a raw parameter list through the property also works
    dataset.parameters = param_list
    assert len(dataset.parameters) == len(param_list)
def __init__(self, hyperparameter_list, total_budget, num_batches,
             objective='max', max_classifiers=18):
    """Set up the search engine: budgets, worker counts, dataset and dirs.

    Args:
        hyperparameter_list: a `hp.HyperParameterList`, a plain list of
            hyper parameters (wrapped automatically), or None.
        total_budget: total number of evaluations available (N).
        num_batches: number of batches the budget is split into (M);
            must divide `total_budget` exactly.
        objective: optimization direction; defaults to 'max'.
        max_classifiers: upper bound on the number of classifiers (K cap).

    Raises:
        ValueError: if `total_budget` is not divisible by `num_batches`.
    """
    if total_budget % num_batches != 0:
        raise ValueError(
            "Number of epochs must be divisible by the batch size !")

    # wrap a raw list of parameters into a HyperParameterList; None passes through
    if hyperparameter_list is not None and (not isinstance(
            hyperparameter_list, hp.HyperParameterList)):
        hyperparameter_list = hp.HyperParameterList(hyperparameter_list)

    print("Number of workers possible : %d" % (total_budget // num_batches))

    self.parameters = hyperparameter_list
    self.objective = objective
    self._total_budget = total_budget  # N
    self.num_batches = num_batches  # M
    self._max_classifiers = max_classifiers

    # W: workers run in parallel per batch
    self._num_workers = self.total_budget // num_batches
    # K: one fewer classifier than batches (at least 1), capped by max_classifiers
    self._total_classifiers = min(max(num_batches - 1, 1), max_classifiers)

    # serializable
    self.dataset = data.Dataset(hyperparameter_list)
    self.classifiers = []  # type: list(xgb.XGBClassifier)

    # training variables
    self._dataset_index = 0

    # Tc: samples each classifier receives per epoch, rounded down to a
    # whole multiple of the worker count
    self._per_classifier_budget = int(
        self.num_workers *
        np.floor(total_budget /
                 (float(self.num_workers *
                        (self.total_classifiers + 1)))))

    print(
        "Using %d parallel workers, it will require %d epochs to fit %d classifiers.\n"
        "Each classifier will be provided %d samples to train per epoch." % (
            self.num_workers,
            total_budget // self.num_workers,
            self._total_classifiers,
            self._per_classifier_budget,
        ))

    # Compute how many threads and processes will be used
    self._compute_parallelism()

    # serialization paths
    self._prepare_dirs()
def test_dataset_multi_param_list():
    """Dataset always stores parameters as a HyperParameterList."""
    param_list = get_multi_parameter_list()

    # raw list at construction time is wrapped
    dataset = data.Dataset(param_list)
    assert isinstance(dataset._parameters, hp.HyperParameterList)

    # raw list through the setter is wrapped
    dataset.set_parameters(param_list)
    assert isinstance(dataset._parameters, hp.HyperParameterList)

    # an already-wrapped list is stored as-is
    wrapped = hp.HyperParameterList(param_list)
    dataset.set_parameters(wrapped)
    assert isinstance(dataset._parameters, hp.HyperParameterList)
def test_dataset_multi_add_sample():
    """Samples added one-by-one produce the expected dataset shapes."""
    hp_list = hp.HyperParameterList(get_multi_parameter_list())
    dataset = data.Dataset(hp_list)

    for _ in range(5):
        dataset.add_sample(hp_list.sample(), np.random.uniform())

    x, y = dataset.get_dataset()
    assert len(dataset) == 5
    assert x.shape == (5, 14)
    assert y.shape == (5,)
def test_dataset_single_multi_encoding_decoding_min():
    """Encoding a single sample with objective='min' yields a zero label."""
    hp_list = hp.HyperParameterList(get_multi_parameter_list())
    dataset = data.Dataset(hp_list)
    dataset.add_sample(hp_list.sample(), np.random.uniform())

    encoded_x, encoded_y = dataset.encode_dataset(objective='min')

    assert encoded_x.shape == (1, 14)
    assert encoded_x.dtype == np.float64
    assert encoded_y.shape == (1,)
    assert encoded_y.dtype == np.float64
    # the lone sample is labelled 0 under the 'min' objective
    assert np.allclose([0.], encoded_y, rtol=1e-3)

    decoded_x = dataset.decode_dataset(encoded_x)
    assert decoded_x.shape == (1, 14)
def test_dataset_multi_encoding_decoding():
    """Encode/decode round trip on a seeded multi-type parameter list."""
    hp_list = hp.HyperParameterList(get_multi_parameter_list(), seed=0)
    dataset = data.Dataset(hp_list)

    for _ in range(5):
        dataset.add_sample(hp_list.sample(), np.random.uniform())

    encoded_x, encoded_y = dataset.encode_dataset(objective='min')
    expected_labels = [0., 0., 0., 1., 1.]  # deterministic under seed=0

    assert encoded_x.shape == (5, 14)
    assert encoded_x.dtype == np.float64
    assert encoded_y.shape == (5,)
    assert encoded_y.dtype == np.float64
    assert np.allclose(expected_labels, encoded_y, rtol=1e-3)

    # decoding with an explicit argument matches decoding the stored data
    decoded_x = dataset.decode_dataset(encoded_x)
    decoded_default = dataset.decode_dataset()
    assert decoded_x.shape == (5, 14)
    assert len(decoded_x) == len(decoded_default)

    # the continuous columns must survive the round trip
    x, y = dataset.get_dataset()
    assert np.allclose(x[:, :10].astype('float'),
                       decoded_x[:, :10].astype('float'), rtol=1e-3)

    # encoding external (x, y) pairs instead of the stored dataset
    extra = [(hp_list.sample(), np.random.uniform()) for _ in range(5)]
    xs, ys = zip(*extra)
    encoded_x, encoded_y = dataset.encode_dataset(xs, ys, objective='min')
    expected_labels = [0., 1., 0., 0., 1.]

    assert encoded_x.shape == (5, 14)
    assert encoded_x.dtype == np.float64
    assert encoded_y.shape == (5,)
    assert encoded_y.dtype == np.float64
    assert np.allclose(expected_labels, encoded_y, rtol=1e-3)
def test_evaluate_train_evaluate_failure():
    """Degenerate single-value parameters yield untrainable (None) models."""
    degenerate = [hp.DiscreteHyperParameter('h%d' % i, [0]) for i in range(3)]
    hp_list = hp.HyperParameterList(degenerate)
    dataset = data.Dataset(hp_list)

    # attempt to fit three classifiers; every sample/label is identical
    clfs = []
    train_count = 16
    for _ in range(3):
        xs = [hp_list.sample() for _ in range(train_count)]
        ys = [np.sum(sample) for sample in xs]
        enc_x, enc_y = dataset.encode_dataset(xs, ys)
        clfs.append(xgb_utils.train_single_model(enc_x, enc_y))

    # no classifier can be trained on a single-class dataset
    for model in clfs:
        assert model is None
def test_dataset_basedir():
    """The default base directory is created on construction."""
    hp_list = hp.HyperParameterList(get_hyperparameter_list())
    dataset = data.Dataset(hp_list)
    assert os.path.exists(dataset.basedir)
def test_dataset_serialization_deserialization_custom_param():
    """Custom hyperparameter classes survive dataset (de)serialization."""

    class MockDiscreteHyperParameter(hp.DiscreteHyperParameter):
        def __init__(self, name, values, seed=None):
            super(MockDiscreteHyperParameter, self).__init__(name, values, seed)

    # register the custom class so it can be restored by name later
    hp.set_custom_parameter_class(MockDiscreteHyperParameter)

    param_list = get_hyperparameter_list()
    param_list.append(MockDiscreteHyperParameter('mock-param', ['x', 'y']))
    hp_list = hp.HyperParameterList(param_list, seed=0)
    dataset = data.Dataset(hp_list)

    for _ in range(5):
        dataset.add_sample(hp_list.sample(), np.random.uniform())

    # serialization
    dataset.save_dataset()
    assert len(dataset) == 5
    assert os.path.exists(dataset.data_path)
    assert os.path.exists(dataset.parameter_path)

    # deserialization in place
    dataset.clear()
    assert len(dataset) == 0

    dataset.restore_dataset()
    assert len(dataset) == 5
    assert os.path.exists(dataset.data_path)
    assert os.path.exists(dataset.parameter_path)

    # deserialization via the classmethod with an explicit path
    dataset_dir = os.path.join('shac', 'datasets')
    restored = data.Dataset.load_from_directory(dataset_dir)
    assert restored.parameters is not None
    assert len(restored.X) == 5
    assert len(restored.Y) == 5
    assert len(restored) == 5
    assert 'mock-param' in restored.parameters.name_map.values()
    assert restored.parameters.num_choices == 5

    # deserialization via the classmethod with the default path
    restored_default = data.Dataset.load_from_directory()
    assert restored_default.parameters is not None
    assert len(restored_default.X) == 5
    assert len(restored_default.Y) == 5
    assert 'mock-param' in restored_default.parameters.name_map.values()
    assert restored_default.parameters.num_choices == 5

    # an empty dataset cannot be saved; a missing path cannot be loaded
    empty_dataset = data.Dataset()
    with pytest.raises(FileNotFoundError):
        empty_dataset.load_from_directory('null')
    with pytest.raises(ValueError):
        empty_dataset.save_dataset()