Example #1
class Combiner:
    def __init__(self, num_cores, total_memory, training_data=None):
        # Avoid a shared mutable default argument: create a fresh TrainingData per instance.
        if training_data is None:
            training_data = TrainingData()
        self.num_cores = num_cores
        self.total_memory = total_memory
        self.config_set = UniversalConfigSet(num_cores, total_memory)
        self.training_data = training_data
        if self.training_data.size() > 0:
            self.ml_model = GaussianModel(self.config_set, training_data)
            self.math_model = AbstractModel(self.config_set, training_data, num_cores, total_memory)

    def add_training_data(self, training_sample, output):
        self.training_data.add_training_data(self._get_training_config(training_sample), output)
        # Rebuild both models so they reflect the enlarged training set.
        self.ml_model = GaussianModel(self.config_set, self.training_data)
        self.math_model = AbstractModel(self.config_set, self.training_data, self.num_cores, self.total_memory)

    def _get_training_config(self, training_sample):
        # Map parameter names to Param objects so a sample keyed by name can be
        # converted into a Config of (param, value) pairs.
        conf_names_params_mapping = {}
        for param in self.config_set.get_params():
            conf_names_params_mapping[param.get_name()] = param
        training_config = Config()
        for config_name, config_value in training_sample.items():
            training_config.add_param(conf_names_params_mapping[config_name], config_value)
        return training_config

    def get_best_config(self):
        if self.training_data.size() == 0:
            raise ValueError("Training Data Not Provided")
        # With a single sample, fall back to the mathematical model.
        if self.training_data.size() == 1:
            return self.math_model.get_best_config()
        # Otherwise train the ML model, let the mathematical model prune its
        # sampled configurations, and pick the best config from what remains.
        self.ml_model.train()
        sampled_configs = self.ml_model.get_sampled_configs()
        pruned_configs = self.math_model.get_pruned_config(sampled_configs)
        return self.ml_model.get_best_config_for_config_space(pruned_configs)
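A minimal usage sketch of the Combiner class above. The cluster size (4 cores, 26544 MB), the Spark parameter names, the sample values and the runtime 131 are borrowed from the test in Example #2 purely for illustration; nothing here is prescribed by the class itself.

# Hypothetical usage of Combiner; all concrete values are illustrative.
combiner = Combiner(4, 26544)
sample = {
    "spark.executor.memory": 11945,
    "spark.sql.shuffle.partitions": 200,
    "spark.executor.cores": 2,
    "spark.driver.memory": 1024 * 4,
    "spark.sql.autoBroadcastJoinThreshold": 10,
    "spark.sql.statistics.fallBackToHdfs": 0
}
# Register one observed (configuration, runtime) pair; both models are rebuilt internally.
combiner.add_training_data(sample, 131)
# With a single sample, get_best_config() falls back to the mathematical model.
best_config = combiner.get_best_config()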
Example #2
def test_gaussian_model_predict(self):
    training_data = TrainingData()
    config_set = UniversalConfigSet(4, 26544)
    model = GaussianModel(config_set, training_data)
    training_sample_1 = {
        "spark.executor.memory": 11945,
        "spark.sql.shuffle.partitions": 200,
        "spark.executor.cores": 2,
        "spark.driver.memory": 1024 * 4,
        "spark.sql.autoBroadcastJoinThreshold": 10,
        "spark.sql.statistics.fallBackToHdfs": 0
    }
    training_sample_2 = {
        "spark.executor.memory": 5972,
        "spark.sql.shuffle.partitions": 300,
        "spark.executor.cores": 1,
        "spark.driver.memory": 1024 * 2,
        "spark.sql.autoBroadcastJoinThreshold": 10,
        "spark.sql.statistics.fallBackToHdfs": 0
    }
    training_sample_3 = {
        "spark.executor.memory": 11945,
        "spark.sql.shuffle.partitions": 460,
        "spark.executor.cores": 2,
        "spark.driver.memory": 1024 * 4,
        "spark.sql.autoBroadcastJoinThreshold": 10,
        "spark.sql.statistics.fallBackToHdfs": 0
    }
    training_sample_4 = {
        "spark.executor.memory": 10068,
        "spark.sql.shuffle.partitions": 1660,
        "spark.executor.cores": 1,
        "spark.driver.memory": 1024,
        "spark.sql.autoBroadcastJoinThreshold": 10,
        "spark.sql.statistics.fallBackToHdfs": 0
    }
    # Train the Gaussian model on four (configuration, runtime) samples.
    model.add_sample_to_train_data(training_sample_1, 131)
    model.add_sample_to_train_data(training_sample_2, 143)
    model.add_sample_to_train_data(training_sample_3, 155)
    model.add_sample_to_train_data(training_sample_4, 343)
    model.train()
    # Rebuild the configuration that matches training_sample_4 (observed output 343).
    config = Config()
    params = config_set.get_params()
    for param in params:
        if param.get_name() == 'spark.executor.memory':
            config.add_param(param, 10068)
        elif param.get_name() == 'spark.sql.shuffle.partitions':
            config.add_param(param, 1660)
        elif param.get_name() == 'spark.executor.cores':
            config.add_param(param, 1)
        elif param.get_name() == 'spark.driver.memory':
            config.add_param(param, 1024)
        elif param.get_name() == 'spark.sql.autoBroadcastJoinThreshold':
            config.add_param(param, 10)
        elif param.get_name() == 'spark.sql.statistics.fallBackToHdfs':
            config.add_param(param, 0)
    low, high = model.predict(
        model.normalizer.normalize_config(config.get_all_param_values()))
    # The predicted interval for an already-seen sample should be tight around 343.
    assert low > (343 - 1)
    assert high < (343 + 1)
Example #3
def test_denormalization():
    config_set = UniversalConfigSet(4, 28 * 1024)
    normalizer = ConfigNormalizer(config_set)
    norm_configs = normalizer.get_all_possible_normalized_configs()
    denorm_config = normalizer.denormalize_config(norm_configs)
    # Denormalizing every normalized config should recover each parameter's full value domain.
    for i, param in enumerate(config_set.get_params()):
        domain = param.get_domain()
        assert sorted(domain.get_possible_values()) == sorted(denorm_config[i])
Example #4
def test_universal_config_set(num_cores, memory, param_list):
    univ_config = UniversalConfigSet(num_cores, memory)
    assert univ_config.get_params() == param_list
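This test expects num_cores, memory and param_list to be supplied from outside, by a fixture or parametrization that is not shown here. A minimal sketch of one possible wiring with pytest, assuming only the API visible in these examples; the fixture deliberately reuses UniversalConfigSet to build the expected list, whereas a real suite would enumerate the expected Param objects by hand:

import pytest

# Illustrative fixture only: a real test would construct the expected Param
# objects explicitly rather than delegating back to UniversalConfigSet.
@pytest.fixture
def param_list():
    return UniversalConfigSet(4, 28 * 1024).get_params()

@pytest.mark.parametrize("num_cores, memory", [(4, 28 * 1024)])
def test_universal_config_set(num_cores, memory, param_list):
    univ_config = UniversalConfigSet(num_cores, memory)
    assert univ_config.get_params() == param_list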