示例#1
0
    def test_repo_training_test_data(self):
        """Add training and test RawData to a fresh repo and verify retrieval.

        One training data object and two test data objects are added one
        after the other. The test checks the names (and, for the first test
        data object, the version) of the retrieved objects and that the three
        additions show up as three commits referencing the stored versions.
        """
        # init repository with sample in memory handler
        repository = MLRepo(user='******')
        repository._job_runner = SimpleJobRunner(repository)

        name_key = repo_objects.RepoInfoKey.NAME
        version_key = repo_objects.RepoInfoKey.VERSION

        def make_raw(name):
            # 10x1 zero arrays for inputs and targets, stored under *name*.
            return RawData(np.zeros([10, 1]), ['x_values'],
                           np.zeros([10, 1]), ['y_values'],
                           repo_info={name_key.value: name})

        training_data = make_raw('training_data')
        repository.add(training_data, category=MLObjectType.TRAINING_DATA)
        fetched_training = repository.get_training_data()
        self.assertEqual(fetched_training.repo_info[name_key],
                         training_data.repo_info[name_key])

        test_data = make_raw('test_data')
        repository.add(test_data, category=MLObjectType.TEST_DATA)
        fetched_test = repository.get('test_data')
        self.assertEqual(fetched_test.repo_info[name_key],
                         test_data.repo_info[name_key])
        self.assertEqual(fetched_test.repo_info[version_key],
                         test_data.repo_info[version_key])

        test_data_2 = make_raw('test_data_2')
        repository.add(test_data_2, category=MLObjectType.TEST_DATA)
        fetched_test_2 = repository.get('test_data_2')
        self.assertEqual(test_data_2.repo_info[name_key],
                         fetched_test_2.repo_info[name_key])

        # One commit per add; commits 1 and 2 belong to the two test objects.
        history = repository.get_commits()
        self.assertEqual(len(history), 3)
        self.assertEqual(history[1].objects['test_data'],
                         test_data.repo_info.version)
        self.assertEqual(history[2].objects['test_data_2'],
                         test_data_2.repo_info.version)
示例#2
0
 def test_repo_RawData(self):
     """Store one RawData object in a fresh repo and read it back.

     Checks the x coordinate names of the retrieved object and that the
     repository then holds one commit that references one object.
     """
     repository = MLRepo(user='******')
     repository._job_runner = SimpleJobRunner(repository)

     info = {repo_objects.RepoInfoKey.NAME.value: 'RawData_Test'}
     raw_data = repo_objects.RawData(  # pylint: disable=E0602
         np.zeros([10, 1]), ['test_coord'], repo_info=info)
     repository.add(raw_data, 'test commit', MLObjectType.RAW_DATA)

     retrieved = repository.get('RawData_Test')
     self.assertEqual(len(raw_data.x_coord_names),
                      len(retrieved.x_coord_names))
     self.assertEqual(raw_data.x_coord_names[0],
                      retrieved.x_coord_names[0])

     history = repository.get_commits()
     self.assertEqual(len(history), 1)
     self.assertEqual(len(history[0].objects), 1)
示例#3
0
class RepoTest(unittest.TestCase):
    def _setup_measure_config(self):
        """Add a measure configuration named 'measure_config' to the repository.

        The configuration lists two MAX measures: one given together with the
        coordinate list ['y0'] and one given without any coordinate list.
        """
        max_measure = repo_objects.MeasureConfiguration.MAX
        measures = [(max_measure, ['y0']), max_measure]
        measure_config = repo_objects.MeasureConfiguration(
            measures,
            repo_info={RepoInfoKey.NAME.value: 'measure_config'})
        self.repository.add(
            measure_config,
            category=MLObjectType.MEASURE_CONFIGURATION,
            message='adding measure configuration',
        )

    def _add_calibrated_model(self):
        """Run a training on the test repository and then set the label 'prod'."""
        repo = self.repository
        repo.run_training()
        repo.set_label('prod')

    def setUp(self):
        """Set up a complete ML repo used by every test of this class.

        The repo receives three RawData objects, one training and two test
        DataSets built on them, a preprocessor, evaluation and training
        functions, a training parameter object, a model definition, a measure
        configuration and finally a trained model (see the two helpers called
        at the end).
        """
        repo = MLRepo(user='******')
        self.repository = repo
        repo._job_runner = SimpleJobRunner(repo)

        name_key = repo_objects.RepoInfoKey.NAME.value
        category_key = repo_objects.RepoInfoKey.CATEGORY

        # Dummy RawData: three objects with identical 10x1 zero arrays.
        for raw_name in ('raw_1', 'raw_2', 'raw_3'):
            raw_data = repo_objects.RawData(
                np.zeros([10, 1]), ['x0'],
                np.zeros([10, 1]), ['y0'],
                repo_info={name_key: raw_name})
            repo.add(raw_data, category=MLObjectType.RAW_DATA)

        # Dummy training and test DataSets on top of the RawData:
        # (raw data name, end index, data set name, category).
        data_set_specs = (
            ('raw_1', None, 'training_data_1', MLObjectType.TRAINING_DATA),
            ('raw_2', None, 'test_data_1', MLObjectType.TEST_DATA),
            ('raw_3', 2, 'test_data_2', MLObjectType.TEST_DATA),
        )
        data_sets = [
            DataSet(raw_name, 0, end_index,
                    repo_info={name_key: set_name, category_key: category})
            for raw_name, end_index, set_name, category in data_set_specs
        ]
        repo.add(data_sets)

        # Dummy preprocessor consisting of a transforming and a fitting function.
        repo.add_preprocessing_transforming_function(
            preprocessor_transforming_function_test,
            repo_name='transform_func')
        repo.add_preprocessing_fitting_function(
            preprocessor_fitting_function_test, repo_name='fit_func')
        repo.add_preprocessor('test_preprocessor_with_fitting',
                              'transform_func',
                              'fit_func',
                              preprocessor_param=None)

        # Evaluation/training functions and the training parameter object.
        repo.add_eval_function(eval_func_test, 'eval_func')
        repo.add_training_function(train_func_test, 'train_func')
        training_param = TestClass(  # pylint: disable=E1123
            1, 2,
            repo_info={
                name_key: 'training_param',
                category_key: MLObjectType.TRAINING_PARAM,
            })
        repo.add(training_param)

        # Dummy model definition wiring the pieces above together.
        repo.add_model('model',
                       'eval_func',
                       'train_func',
                       preprocessors=['test_preprocessor_with_fitting'])

        # Measure configuration first, then the dummy calibrated model.
        self._setup_measure_config()
        self._add_calibrated_model()

    def test_adding_training_data_exception(self):
        """Test that adding new training data leads to an exception.

        setUp has already added 'training_data_1' with the training data
        category; this test tries to add another object of that category.
        """
        test_obj = DataSet('raw_data',
                           repo_info={
                               repo_objects.RepoInfoKey.CATEGORY:
                               MLObjectType.TRAINING_DATA.value,
                               'name':
                               'test_object'
                           })
        # Only the add call is guarded: an error raised while constructing
        # the DataSet must not be able to satisfy the expectation.
        with self.assertRaises(Exception):
            self.repository.add(test_obj)

    def test_commit_increase_update(self):
        """Check that re-adding an existing object adds one commit but keeps the mapping version."""
        repo = self.repository
        version_key = RepoInfoKey.VERSION

        obj = repo.get('raw_1')
        commits_before = len(repo.get_commits())
        mapping_version_before = repo.get('repo_mapping').repo_info[version_key]

        repo.add(obj)

        commits_after = len(repo.get_commits())
        mapping_version_after = repo.get('repo_mapping').repo_info[version_key]
        self.assertEqual(commits_before + 1, commits_after)
        self.assertEqual(mapping_version_before, mapping_version_after)

    def test_commit_increase_add(self):
        """Check that adding a new object adds one commit and changes the mapping.

        The counterpart test_commit_increase_update asserts that the version
        of 'repo_mapping' stays the same when an existing object is re-added;
        here a new object name is introduced, so the version must differ.
        """
        obj = DataSet('raw_data_1',
                      0,
                      None,
                      repo_info={
                          RepoInfoKey.NAME.value: 'test...',
                          RepoInfoKey.CATEGORY: MLObjectType.TEST_DATA
                      })
        old_num_commits = len(self.repository.get_commits())
        old_version_mapping = self.repository.get(
            'repo_mapping').repo_info.version
        self.repository.add(obj)
        new_num_commits = len(self.repository.get_commits())
        new_version_mapping = self.repository.get(
            'repo_mapping').repo_info.version
        self.assertEqual(old_num_commits + 1, new_num_commits)
        # Previously both mapping versions were fetched but never compared,
        # so the second half of the documented contract went unchecked.
        self.assertNotEqual(old_version_mapping, new_version_mapping)

    def test_DataSet_get(self):
        """Test that a retrieved DataSet carries the coordinate names of its RawData.

        The objects are fetched without the full_object flag, and only the
        x and y coordinate names are compared.
        """
        data_set = self.repository.get('test_data_1')
        raw = self.repository.get(data_set.raw_data)
        for position, x_name in enumerate(raw.x_coord_names):
            self.assertEqual(x_name, data_set.x_coord_names[position])
        for position, y_name in enumerate(raw.y_coord_names):
            self.assertEqual(y_name, data_set.y_coord_names[position])

    def test_DataSet_get_full(self):
        """Test that a fully loaded DataSet carries names and data of its RawData.

        Besides the coordinate names, the first dimension of x_data is
        compared with the RawData for 'test_data_1', and checked to be 2 for
        'test_data_2' and 10 for 'training_data_1'.
        """

        def load_full(name):
            # Fetch the last version including the numerical data.
            return self.repository.get(
                name,
                version=repo_store.RepoStore.LAST_VERSION,
                full_object=True)

        data_set = load_full('test_data_1')
        raw = load_full(data_set.raw_data)
        for position, x_name in enumerate(raw.x_coord_names):
            self.assertEqual(x_name, data_set.x_coord_names[position])
        for position, y_name in enumerate(raw.y_coord_names):
            self.assertEqual(y_name, data_set.y_coord_names[position])
        self.assertEqual(raw.x_data.shape[0], data_set.x_data.shape[0])

        second_test_set = load_full('test_data_2')
        self.assertEqual(second_test_set.x_data.shape[0], 2)

        training_set = load_full('training_data_1')
        self.assertEqual(training_set.x_data.shape[0], 10)

    def test_repo_RawData(self):
        """Store one RawData object in a fresh repo and read it back.

        Uses its own repository instead of the one built in setUp. Checks the
        x coordinate names of the retrieved object and that the repository
        then holds one commit that references one object.
        """
        repository = MLRepo(user='******')
        repository._job_runner = SimpleJobRunner(repository)

        info = {repo_objects.RepoInfoKey.NAME.value: 'RawData_Test'}
        raw_data = repo_objects.RawData(  # pylint: disable=E0602
            np.zeros([10, 1]), ['test_coord'], repo_info=info)
        repository.add(raw_data, 'test commit', MLObjectType.RAW_DATA)

        retrieved = repository.get('RawData_Test')
        self.assertEqual(len(raw_data.x_coord_names),
                         len(retrieved.x_coord_names))
        self.assertEqual(raw_data.x_coord_names[0],
                         retrieved.x_coord_names[0])

        history = repository.get_commits()
        self.assertEqual(len(history), 1)
        self.assertEqual(len(history[0].objects), 1)

    def test_add_model_defaults(self):
        """Test add_model with only a model name to check the default logic.

        A model parameter and a training parameter object are added first;
        the model created afterwards is expected to reference them as well as
        the 'eval_func' and 'train_func' functions.
        """
        parameter_specs = (
            ('model_param', MLObjectType.MODEL_PARAM.value),
            ('training_param', MLObjectType.TRAINING_PARAM.value),
        )
        for param_name, category in parameter_specs:
            param = TestClass(  # pylint: disable=E1123
                3, 4,
                repo_info={
                    RepoInfoKey.NAME.value: param_name,
                    RepoInfoKey.CATEGORY: category,
                })
            self.repository.add(param)

        self.repository.add_model('model1')
        model = self.repository.get('model1')

        expected_defaults = (
            ('eval_function', 'eval_func'),
            ('training_function', 'train_func'),
            ('training_param', 'training_param'),
            ('model_param', 'model_param'),
        )
        for attribute, expected in expected_defaults:
            self.assertEqual(getattr(model, attribute), expected)

    def test_get_history(self):
        """Check that re-adding 'training_data_1' grows its history from 1 to 2 entries."""
        repo = self.repository
        self.assertEqual(len(repo.get_history('training_data_1')), 1)

        repo.add(repo.get('training_data_1'))

        self.assertEqual(len(repo.get_history('training_data_1')), 2)

    def test_run_eval_defaults(self):
        """Run an evaluation without arguments; the test passes if nothing is raised."""
        repo = self.repository
        repo.run_evaluation()

    def test_run_train_defaults(self):
        """Run a training without arguments; the test passes if nothing is raised."""
        repo = self.repository
        repo.run_training()

    def test_run_measure_defaults(self):
        """Run the measures without arguments; the test passes if nothing is raised."""
        repo = self.repository
        # Evaluate first so that there is at least one evaluation to measure.
        repo.run_evaluation()
        repo.run_measures()

    def test_repo_training_test_data(self):
        """Add training and test RawData to a fresh repo and verify retrieval.

        Uses its own repository instead of the one built in setUp. One
        training data object and two test data objects are added one after
        the other. The test checks the names (and, for the first test data
        object, the version) of the retrieved objects and that the three
        additions show up as three commits referencing the stored versions.
        """
        # init repository with sample in memory handler
        repository = MLRepo(user='******')
        repository._job_runner = SimpleJobRunner(repository)

        name_key = repo_objects.RepoInfoKey.NAME
        version_key = repo_objects.RepoInfoKey.VERSION

        def make_raw(name):
            # 10x1 zero arrays for inputs and targets, stored under *name*.
            return RawData(np.zeros([10, 1]), ['x_values'],
                           np.zeros([10, 1]), ['y_values'],
                           repo_info={name_key.value: name})

        training_data = make_raw('training_data')
        repository.add(training_data, category=MLObjectType.TRAINING_DATA)
        fetched_training = repository.get_training_data()
        self.assertEqual(fetched_training.repo_info[name_key],
                         training_data.repo_info[name_key])

        test_data = make_raw('test_data')
        repository.add(test_data, category=MLObjectType.TEST_DATA)
        fetched_test = repository.get('test_data')
        self.assertEqual(fetched_test.repo_info[name_key],
                         test_data.repo_info[name_key])
        self.assertEqual(fetched_test.repo_info[version_key],
                         test_data.repo_info[version_key])

        test_data_2 = make_raw('test_data_2')
        repository.add(test_data_2, category=MLObjectType.TEST_DATA)
        fetched_test_2 = repository.get('test_data_2')
        self.assertEqual(test_data_2.repo_info[name_key],
                         fetched_test_2.repo_info[name_key])

        # One commit per add; commits 1 and 2 belong to the two test objects.
        history = repository.get_commits()
        self.assertEqual(len(history), 3)
        self.assertEqual(history[1].objects['test_data'],
                         test_data.repo_info.version)
        self.assertEqual(history[2].objects['test_data_2'],
                         test_data_2.repo_info.version)

    def test_repo_RegressionTest(self):
        """Create a regression test definition, check it yields 3 tests and run them.

        After the definition has been added, evaluation, measures and tests
        are run in that order on the repository built in setUp.
        """
        definition_info = {
            RepoInfoKey.NAME: 'regression_test',
            RepoInfoKey.CATEGORY: MLObjectType.TEST_DEFINITION.name,
        }
        regression_test_def = ml_tests.RegressionTestDefinition(
            repo_info=definition_info)

        created_tests = regression_test_def.create(self.repository)
        self.assertEqual(len(created_tests), 3)

        repo = self.repository
        repo.add(regression_test_def)
        repo.run_evaluation()
        repo.run_measures()
        repo.run_tests()

    def test_add_multiple(self):
        """Test adding several objects with a single add call.

        'obj1' is added on its own first and then once more together with
        'obj2'; afterwards both names must be retrievable.
        """
        first = TestClass(5, 4, repo_info={})
        first.repo_info.name = 'obj1'
        self.repository.add(first, category=MLObjectType.CALIBRATED_MODEL)

        second = TestClass(2, 3, repo_info={})
        second.repo_info.name = 'obj2'
        self.repository.add([first, second],
                            category=MLObjectType.CALIBRATED_MODEL)

        fetched_first = self.repository.get('obj1')
        self.assertEqual(fetched_first.repo_info.name, 'obj1')
        fetched_second = self.repository.get('obj2')
        self.assertEqual(fetched_second.repo_info.name, 'obj2')

    def test_delete(self):
        """Test that deletion works and respects dependencies between objects.

        'obj2' records a dependency on 'obj1' through its modification_info.
        Deleting 'obj1' must therefore fail while 'obj2' exists and succeed
        once 'obj2' has been deleted; deleted objects must no longer be
        retrievable.
        """
        obj1 = TestClass(5, 4, repo_info={})
        obj1.repo_info.name = 'obj1'
        v1 = self.repository.add(obj1, category=MLObjectType.CALIBRATED_MODEL)
        obj2 = TestClass(2, 3, repo_info={})
        obj2.repo_info.name = 'obj2'
        obj2.repo_info.modification_info = {'obj1': v1}
        v2 = self.repository.add(obj2, category=MLObjectType.CALIBRATED_MODEL)

        # assertRaises is used instead of try/except with a failing assert in
        # the try body: a bare except would also swallow the AssertionError,
        # so the check could never fail.

        # Deleting obj1 must be rejected while obj2 still depends on it.
        with self.assertRaises(Exception):
            self.repository.delete('obj1', v1)

        # Now first delete obj2 ...
        self.repository.delete('obj2', v2)
        # ... and check that it has really been deleted.
        with self.assertRaises(Exception):
            self.repository.get('obj2')

        # Now deletion of obj1 should work; any exception fails the test.
        self.repository.delete('obj1', v1)
        # Check that obj1 has really been deleted.
        with self.assertRaises(Exception):
            self.repository.get('obj1')