Example #1
class RepoTest(unittest.TestCase):
    def _setup_measure_config(self):
        """Add a measure configuration with two measures (both MAX) where one measure just uses the coordinate x0
        """

        measure_config = repo_objects.MeasureConfiguration(
            [(repo_objects.MeasureConfiguration.MAX, ['y0']),
             repo_objects.MeasureConfiguration.MAX],
            repo_info={RepoInfoKey.NAME.value: 'measure_config'})
        self.repository.add(measure_config,
                            category=MLObjectType.MEASURE_CONFIGURATION,
                            message='adding measure configuration')

    def _add_calibrated_model(self):
        self.repository.run_training()
        self.repository.set_label('prod')

    def setUp(self):
        '''Setup a complete ML repo with two different test data objects, training data, model definition, etc.
        '''
        self.repository = MLRepo(user='******')
        job_runner = SimpleJobRunner(self.repository)
        self.repository._job_runner = job_runner
        #### Setup dummy RawData
        raw_data = repo_objects.RawData(
            np.zeros([10, 1]), ['x0'],
            np.zeros([10, 1]), ['y0'],
            repo_info={repo_objects.RepoInfoKey.NAME.value: 'raw_1'})
        self.repository.add(raw_data, category=MLObjectType.RAW_DATA)
        raw_data = repo_objects.RawData(
            np.zeros([10, 1]), ['x0'],
            np.zeros([10, 1]), ['y0'],
            repo_info={repo_objects.RepoInfoKey.NAME.value: 'raw_2'})
        self.repository.add(raw_data, category=MLObjectType.RAW_DATA)
        raw_data = repo_objects.RawData(
            np.zeros([10, 1]), ['x0'],
            np.zeros([10, 1]), ['y0'],
            repo_info={repo_objects.RepoInfoKey.NAME.value: 'raw_3'})
        self.repository.add(raw_data, category=MLObjectType.RAW_DATA)
        ## Setup dummy Test and Training DataSets on RawData
        training_data = DataSet('raw_1',
                                0,
                                None,
                                repo_info={
                                    repo_objects.RepoInfoKey.NAME.value:
                                    'training_data_1',
                                    repo_objects.RepoInfoKey.CATEGORY:
                                    MLObjectType.TRAINING_DATA
                                })
        test_data_1 = DataSet('raw_2',
                              0,
                              None,
                              repo_info={
                                  repo_objects.RepoInfoKey.NAME.value:
                                  'test_data_1',
                                  repo_objects.RepoInfoKey.CATEGORY:
                                  MLObjectType.TEST_DATA
                              })
        test_data_2 = DataSet('raw_3',
                              0,
                              2,
                              repo_info={
                                  repo_objects.RepoInfoKey.NAME.value:
                                  'test_data_2',
                                  repo_objects.RepoInfoKey.CATEGORY:
                                  MLObjectType.TEST_DATA
                              })
        self.repository.add([training_data, test_data_1, test_data_2])

        ## setup dummy preprocessor
        self.repository.add_preprocessing_transforming_function(
            preprocessor_transforming_function_test,
            repo_name='transform_func')
        self.repository.add_preprocessing_fitting_function(
            preprocessor_fitting_function_test, repo_name='fit_func')
        self.repository.add_preprocessor('test_preprocessor_with_fitting',
                                         'transform_func',
                                         'fit_func',
                                         preprocessor_param=None)

        self.repository.add_eval_function(eval_func_test, 'eval_func')
        self.repository.add_training_function(train_func_test, 'train_func')
        self.repository.add(
            TestClass(
                1,
                2,
                repo_info={
                    repo_objects.RepoInfoKey.NAME.value: 'training_param',  # pylint: disable=E1123
                    repo_objects.RepoInfoKey.CATEGORY:
                    MLObjectType.TRAINING_PARAM
                }))
        ## setup dummy model definition
        self.repository.add_model(
            'model',
            'eval_func',
            'train_func',
            preprocessors=['test_preprocessor_with_fitting'])
        # setup measure configuration
        self._setup_measure_config()
        # add dummy calibrated model
        self._add_calibrated_model()

    def test_adding_training_data_exception(self):
        '''Test that adding new training data raises an exception
        '''
        with self.assertRaises(Exception):
            test_obj = DataSet('raw_data',
                               repo_info={
                                   repo_objects.RepoInfoKey.CATEGORY:
                                   MLObjectType.TRAINING_DATA.value,
                                   'name':
                                   'test_object'
                               })
            self.repository.add(test_obj)

    def test_commit_increase_update(self):
        '''Check that updating an object in the repository adds a new commit but does not change the mapping
        '''
        obj = self.repository.get('raw_1')
        old_num_commits = len(self.repository.get_commits())
        old_version_mapping = self.repository.get('repo_mapping').repo_info[
            RepoInfoKey.VERSION]
        self.repository.add(obj)
        new_num_commits = len(self.repository.get_commits())
        new_version_mapping = self.repository.get('repo_mapping').repo_info[
            RepoInfoKey.VERSION]
        self.assertEqual(old_num_commits + 1, new_num_commits)
        self.assertEqual(old_version_mapping, new_version_mapping)

    def test_commit_increase_add(self):
        '''Check that adding a new object to the repository adds a new commit and also changes the mapping
        '''
        obj = DataSet('raw_data_1',
                      0,
                      None,
                      repo_info={
                          RepoInfoKey.NAME.value: 'test...',
                          RepoInfoKey.CATEGORY: MLObjectType.TEST_DATA
                      })
        old_num_commits = len(self.repository.get_commits())
        old_version_mapping = self.repository.get(
            'repo_mapping').repo_info.version
        self.repository.add(obj)
        new_num_commits = len(self.repository.get_commits())
        new_version_mapping = self.repository.get(
            'repo_mapping').repo_info.version
        self.assertEqual(old_num_commits + 1, new_num_commits)
        self.assertNotEqual(old_version_mapping, new_version_mapping)

    def test_DataSet_get(self):
        '''Test that getting a DataSet includes all information from the underlying RawData (excluding numpy data)
        '''
        obj = self.repository.get('test_data_1')
        raw_obj = self.repository.get(obj.raw_data)
        for i in range(len(raw_obj.x_coord_names)):
            self.assertEqual(raw_obj.x_coord_names[i], obj.x_coord_names[i])
        for i in range(len(raw_obj.y_coord_names)):
            self.assertEqual(raw_obj.y_coord_names[i], obj.y_coord_names[i])

    def test_DataSet_get_full(self):
        '''Test that getting a DataSet includes all information from the underlying RawData (including numpy data)
        '''
        obj = self.repository.get('test_data_1',
                                  version=repo_store.RepoStore.LAST_VERSION,
                                  full_object=True)
        raw_obj = self.repository.get(
            obj.raw_data,
            version=repo_store.RepoStore.LAST_VERSION,
            full_object=True)
        for i in range(len(raw_obj.x_coord_names)):
            self.assertEqual(raw_obj.x_coord_names[i], obj.x_coord_names[i])
        for i in range(len(raw_obj.y_coord_names)):
            self.assertEqual(raw_obj.y_coord_names[i], obj.y_coord_names[i])
        self.assertEqual(raw_obj.x_data.shape[0], obj.x_data.shape[0])

        obj = self.repository.get('test_data_2',
                                  version=repo_store.RepoStore.LAST_VERSION,
                                  full_object=True)
        self.assertEqual(obj.x_data.shape[0], 2)

        obj = self.repository.get('training_data_1',
                                  version=repo_store.RepoStore.LAST_VERSION,
                                  full_object=True)
        self.assertEqual(obj.x_data.shape[0], 10)

    def test_repo_RawData(self):
        """Test RawData within repo
        """
        repository = MLRepo(user='******')
        job_runner = SimpleJobRunner(repository)
        repository._job_runner = job_runner
        raw_data = repo_objects.RawData(
            np.zeros([10, 1]),
            ['test_coord'],
            repo_info={  # pylint: disable=E0602
                repo_objects.RepoInfoKey.NAME.value: 'RawData_Test'
            })
        repository.add(raw_data, 'test commit', MLObjectType.RAW_DATA)
        raw_data_2 = repository.get('RawData_Test')
        self.assertEqual(len(raw_data.x_coord_names),
                         len(raw_data_2.x_coord_names))
        self.assertEqual(raw_data.x_coord_names[0],
                         raw_data_2.x_coord_names[0])
        commits = repository.get_commits()
        self.assertEqual(len(commits), 1)
        self.assertEqual(len(commits[0].objects), 1)

    def test_add_model_defaults(self):
        """test add_model using defaults to check whether default logic applies correctly
        """
        model_param = TestClass(3,
                                4,
                                repo_info={
                                    RepoInfoKey.NAME.value:
                                    'model_param',
                                    RepoInfoKey.CATEGORY:
                                    MLObjectType.MODEL_PARAM.value
                                })  # pylint: disable=E1123
        self.repository.add(model_param)
        training_param = TestClass(3,
                                   4,
                                   repo_info={
                                       RepoInfoKey.NAME.value:
                                       'training_param',
                                       RepoInfoKey.CATEGORY:
                                       MLObjectType.TRAINING_PARAM.value
                                   })  # pylint: disable=E1123
        self.repository.add(training_param)
        self.repository.add_model('model1')
        model = self.repository.get('model1')
        self.assertEqual(model.eval_function, 'eval_func')
        self.assertEqual(model.training_function, 'train_func')
        self.assertEqual(model.training_param, 'training_param')
        self.assertEqual(model.model_param, 'model_param')

    def test_get_history(self):
        training_data_history = self.repository.get_history('training_data_1')
        self.assertEqual(len(training_data_history), 1)
        training_data = self.repository.get('training_data_1')
        self.repository.add(training_data)
        training_data_history = self.repository.get_history('training_data_1')
        self.assertEqual(len(training_data_history), 2)

    def test_run_eval_defaults(self):
        '''Test running evaluation with default arguments
        '''
        self.repository.run_evaluation()

    def test_run_train_defaults(self):
        '''Test running training with default arguments
        '''
        self.repository.run_training()

    def test_run_measure_defaults(self):
        # run the evaluation first so that there is at least one evaluation
        self.repository.run_evaluation()
        self.repository.run_measures()

    def test_repo_training_test_data(self):
        # init repository with sample in memory handler
        repository = MLRepo(user='******')
        job_runner = SimpleJobRunner(repository)
        repository._job_runner = job_runner
        training_data = RawData(
            np.zeros([10, 1]), ['x_values'],
            np.zeros([10, 1]), ['y_values'],
            repo_info={repo_objects.RepoInfoKey.NAME.value: 'training_data'})
        repository.add(training_data, category=MLObjectType.TRAINING_DATA)

        training_data_2 = repository.get_training_data()
        self.assertEqual(
            training_data_2.repo_info[repo_objects.RepoInfoKey.NAME],
            training_data.repo_info[repo_objects.RepoInfoKey.NAME])

        test_data = RawData(
            np.zeros([10, 1]), ['x_values'],
            np.zeros([10, 1]), ['y_values'],
            repo_info={repo_objects.RepoInfoKey.NAME.value: 'test_data'})
        repository.add(test_data, category=MLObjectType.TEST_DATA)
        test_data_ref = repository.get('test_data')
        self.assertEqual(
            test_data_ref.repo_info[repo_objects.RepoInfoKey.NAME],
            test_data.repo_info[repo_objects.RepoInfoKey.NAME])
        self.assertEqual(
            test_data_ref.repo_info[repo_objects.RepoInfoKey.VERSION],
            test_data.repo_info[repo_objects.RepoInfoKey.VERSION])

        test_data_2 = RawData(
            np.zeros([10, 1]), ['x_values'],
            np.zeros([10, 1]), ['y_values'],
            repo_info={repo_objects.RepoInfoKey.NAME.value: 'test_data_2'})
        repository.add(test_data_2, category=MLObjectType.TEST_DATA)
        test_data_2_ref = repository.get('test_data_2')
        self.assertEqual(
            test_data_2.repo_info[repo_objects.RepoInfoKey.NAME],
            test_data_2_ref.repo_info[repo_objects.RepoInfoKey.NAME])

        commits = repository.get_commits()
        self.assertEqual(len(commits), 3)
        self.assertEqual(commits[1].objects['test_data'],
                         test_data.repo_info.version)
        #self.assertEqual(commits[1].objects['repo_mapping'], 1)
        self.assertEqual(commits[2].objects['test_data_2'],
                         test_data_2.repo_info.version)
        #self.assertEqual(commits[2].objects['repo_mapping'], 2)

    def test_repo_RegressionTest(self):
        regression_test_def = ml_tests.RegressionTestDefinition(
            repo_info={
                RepoInfoKey.NAME: 'regression_test',
                RepoInfoKey.CATEGORY: MLObjectType.TEST_DEFINITION.name
            })
        tests = regression_test_def.create(self.repository)
        self.assertEqual(len(tests), 3)
        self.repository.add(regression_test_def)
        self.repository.run_evaluation()
        self.repository.run_measures()
        self.repository.run_tests()

    def test_add_multiple(self):
        """Test adding multiple objects at once
        """
        obj1 = TestClass(5, 4, repo_info={})
        obj1.repo_info.name = 'obj1'
        v1 = self.repository.add(obj1, category=MLObjectType.CALIBRATED_MODEL)
        obj2 = TestClass(2, 3, repo_info={})
        obj2.repo_info.name = 'obj2'
        self.repository.add([obj1, obj2],
                            category=MLObjectType.CALIBRATED_MODEL)
        new_obj1 = self.repository.get('obj1')
        self.assertEqual(new_obj1.repo_info.name, 'obj1')
        new_obj2 = self.repository.get('obj2')
        self.assertEqual(new_obj2.repo_info.name, 'obj2')

    def test_delete(self):
        """Test if deletion works and if it considers if there are dependencies to other objects
        """

        obj1 = TestClass(5, 4, repo_info={})
        obj1.repo_info.name = 'obj1'
        v1 = self.repository.add(obj1, category=MLObjectType.CALIBRATED_MODEL)
        obj2 = TestClass(2, 3, repo_info={})
        obj2.repo_info.name = 'obj2'
        obj2.repo_info.modification_info = {'obj1': v1}
        v2 = self.repository.add(obj2, category=MLObjectType.CALIBRATED_MODEL)
        # deleting obj1 must fail because obj2 depends on obj1
        with self.assertRaises(Exception):
            self.repository.delete('obj1', v1)
        # now first delete obj2
        self.repository.delete('obj2', v2)
        # check that obj2 has really been deleted
        with self.assertRaises(Exception):
            self.repository.get('obj2')

        # now deletion of obj1 should work
        self.repository.delete('obj1', v1)
        # check that obj1 has really been deleted
        with self.assertRaises(Exception):
            self.repository.get('obj1')
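A minimal way to execute the test case above, assuming the class is placed in a module that is run directly (this runner line is not part of the original source):

if __name__ == '__main__':
    # run all tests defined in this module, including RepoTest
    unittest.main()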
Example #2
    def test_tutorial(self):
        # cleanup disk before running
        repo_path = './tmp_tutorial'
        try:
            shutil.rmtree(repo_path)
        except OSError:
            pass

        # creating in memory storage
        ml_repo = MLRepo(user='******')
        # end creating in memory storage

        # creating new repository
        config = {
            'user': '******',
            'workspace': repo_path,
            'repo_store': {
                'type': 'disk_handler',
                'config': {
                    'folder': repo_path,
                    'file_format': 'pickle'
                }
            },
            'numpy_store': {
                'type': 'hdf_handler',
                'config': {
                    'folder': repo_path,
                    'version_files': True
                }
            },
            'job_runner': {
                'type': 'simple',
                'config': {
                    'throw_job_error': True
                }
            }
        }
        ml_repo = MLRepo(user='******', config=config)
        # end creating new repository
        # specifying job runner
        job_runner = SimpleJobRunner(None)
        job_runner.set_repo(ml_repo)
        ml_repo._job_runner = job_runner
        # end specifying job runner
        job_runner._throw_job_error = True

        from pailab.tools.tree import MLTree
        MLTree.add_tree(ml_repo)

        # A convenient way to add RawData is simply to use the add method of the tree's raw_data
        # collection. This method takes a pandas DataFrame and the specification of which columns
        # belong to the inputs and which to the targets; the corresponding call is shown,
        # commented out, below the explicit RawData variant used here.

        try:
            # read pandas
            import pandas as pd
            data = pd.read_csv('./examples/boston_housing/housing.csv')
            # end read pandas
        except OSError:
            data = pd.read_csv('../examples/boston_housing/housing.csv')

        # extract data
        input_variables = ['RM', 'LSTAT', 'PTRATIO']
        target_variables = ['MEDV']
        x = data.loc[:, input_variables].values
        y = data.loc[:, target_variables].values
        # end extract data

        # add RawData snippet
        from pailab import RawData, RepoInfoKey

        raw_data = RawData(x, input_variables, y, target_variables, repo_info={
                           RepoInfoKey.NAME: 'raw_data/boston_housing'})
        ml_repo.add(raw_data)

        # end adding RawData snippet
        # ml_repo.tree.raw_data.add('boston_housing', data, input_variables=[
        #    'RM', 'LSTAT', 'PTRATIO'], target_variables=['MEDV'])

        # add DataSet
        # create DataSet objects for training and test data
        training_data = DataSet('raw_data/boston_housing', 0, 300,
                                repo_info={RepoInfoKey.NAME: 'training_data', RepoInfoKey.CATEGORY: MLObjectType.TRAINING_DATA})
        test_data = DataSet('raw_data/boston_housing', 301, None,
                            repo_info={RepoInfoKey.NAME: 'test_data',  RepoInfoKey.CATEGORY: MLObjectType.TEST_DATA})
        # add the objects to the repository
        version_list = ml_repo.add(
            [training_data, test_data], message='add training and test data')
        # end adding DataSet

        # add model
        import pailab.externals.sklearn_interface as sklearn_interface
        from sklearn.tree import DecisionTreeRegressor
        sklearn_interface.add_model(
            ml_repo, DecisionTreeRegressor(), model_param={'max_depth': 5})
        # end adding model

        # run training
        job_id = ml_repo.run_training()
        # end running training

        # run evaluation
        job_id = ml_repo.run_evaluation()
        # end running evaluation

        # add measures snippet
        ml_repo.add_measure(MeasureConfiguration.MAX)
        ml_repo.add_measure(MeasureConfiguration.R2)
        # end add measure snippet

        # run measures snippet
        job_ids = ml_repo.run_measures()
        # end run measures snippet

        print(ml_repo.get_names(MLObjectType.MEASURE))

        # get measures
        max_measure = ml_repo.get(
            'DecisionTreeRegressor/measure/training_data/max')
        print(str(max_measure.value))
        # end getting measures

        # label snippet
        from pailab import LAST_VERSION
        ml_repo.set_label('prod', 'DecisionTreeRegressor/model',
                          model_version=LAST_VERSION, message='we found our first production model')
        # end label snippet

        # test definition snippet
        import pailab.tools.tests
        reg_test = pailab.tools.tests.RegressionTestDefinition(
            reference='prod', models=None, data=None, labels=None,
            measures=[MeasureConfiguration.MAX],  tol=1000)
        reg_test.repo_info.name = 'reg_test'
        ml_repo.add(reg_test, message='regression test definition')
        # end test definition snippet

        # add test snippet
        tests = ml_repo.run_tests()
        # end add test snippet
        print(tests)

        # run check snippet
        import pailab.tools.checker as checker
        inconsistencies = checker.run(ml_repo)
        # end run check snippet

        print(inconsistencies)

        # add inconsistency snippet
        param = ml_repo.get('DecisionTreeRegressor/model_param')
        param.sklearn_params['max_depth'] = 2
        version = ml_repo.add(param)
        # end add inconsistency snippet

        inconsistencies = checker.run(ml_repo)
        print(inconsistencies)

        ml_repo.run_training()

        inconsistencies = checker.run(ml_repo)
        print(inconsistencies)

        ml_repo.run_evaluation(run_descendants=True)

        print(checker.run(ml_repo))

        # add second test data snippet
        test_data_2 = DataSet('raw_data/boston_housing', 0, 50,
                              repo_info={RepoInfoKey.NAME: 'test_data_2',
                                         RepoInfoKey.CATEGORY: MLObjectType.TEST_DATA}
                              )
        ml_repo.add(test_data_2)
        ml_repo.run_evaluation(run_descendants=True)
        # end add second test data snippet

        print(checker.Tests.run(ml_repo))

        ml_repo.run_tests()

        # check tests
        print(checker.Tests.run(ml_repo))
        # end check tests

        # cleanup after running
        # job_runner.close_connection()
        ml_repo._ml_repo.close_connection()
        try:
            shutil.rmtree(repo_path)
        except OSError:
            pass