def add(self, name, data, input_variables=None, target_variables=None):
    """Add raw data to the repository.

    The data is stored under the repository name ``'raw_data/' + name`` and
    is afterwards attached to this object as attribute ``name``.

    Arguments:
        name {str} -- the name of the data added
        data {pandas datatable} -- the data as pandas datatable

    Keyword Arguments:
        input_variables {list of strings} -- list of column names defining the input variables
            for the machine learning (default: {None}). If None, all columns of data that are
            not listed in target_variables are used as input.
        target_variables {list of strings} -- list of column names defining the target variables
            for the machine learning (default: {None}). If None, no target data is added from the table.

    Raises:
        Exception -- if input_variables or target_variables contain a name that is not a column of data.
    """
    path = 'raw_data/' + name
    columns = list(data)
    known_columns = set(columns)

    # Validate both name lists up front so that an unknown column always yields
    # the descriptive error below instead of a bare ValueError from list.remove.
    if input_variables is not None:
        if any(column not in known_columns for column in input_variables):
            raise Exception('RawData does not include at least one column included in input_variables')
    if target_variables is not None:
        if any(column not in known_columns for column in target_variables):
            raise Exception('RawData does not include at least one column included in target_variables')

    if input_variables is None:
        # Default: every column that is not used as a target is an input.
        excluded = set(target_variables) if target_variables is not None else set()
        input_variables = [column for column in columns if column not in excluded]

    if target_variables is not None:
        raw_data = repo_objects.RawData(data.loc[:, input_variables].values, input_variables,
                                        data.loc[:, target_variables].values, target_variables,
                                        repo_info={RepoInfoKey.NAME: path})
    else:
        raw_data = repo_objects.RawData(data.loc[:, input_variables].values, input_variables,
                                        repo_info={RepoInfoKey.NAME: path})

    # The value returned by the repository's add is used as the version to fetch back.
    v = self._repo.add(raw_data, 'data ' + path + ' added to repository', category=MLObjectType.RAW_DATA)
    obj = self._repo.get(path, version=v, full_object=False)
    setattr(self, name, _RawDataItem(path, self._repo, obj))
def test_constructor(self):
    """Check that RawData can be built from a 2-d array, a 1-d array and a plain list."""
    # two-dimensional array: row count is kept and reported as n_data
    matrix = np.zeros([100, 4])
    column_names = ['x1', 'x2', 'x3', 'x4']
    raw = repo_objects.RawData(matrix, column_names)
    self.assertEqual(raw.x_data.shape[0], matrix.shape[0])
    self.assertEqual(raw.n_data, 100)

    # one-dimensional array: x_data is expected to come back two-dimensional with one column
    vector = np.zeros([100])
    raw = repo_objects.RawData(vector, ['x1'])
    self.assertEqual(len(raw.x_data.shape), 2)
    self.assertEqual(raw.x_data.shape[1], 1)

    # plain python list: three rows, one column
    raw = repo_objects.RawData([100, 200, 300], ['x1'])
    self.assertEqual(raw.x_data.shape[0], 3)
    self.assertEqual(raw.x_data.shape[1], 1)
def add_from_numpy_file(self, name, filename_X, x_names, filename_Y=None, y_names=None):
    """Load raw data from file(s) and add it to the repository.

    The x-data is read from filename_X and, if filename_Y is given, the
    y-data from filename_Y. The resulting RawData object is stored under
    the repository name ``name`` and attached to this object as attribute
    ``name``.

    Arguments:
        name {str} -- name under which the data is stored
        filename_X {str} -- file the x-data is loaded from
        x_names {list of strings} -- names passed to RawData for the x-data

    Keyword Arguments:
        filename_Y {str} -- file the y-data is loaded from (default: {None}).
            If None, no y-data is loaded.
        y_names {list of strings} -- names passed to RawData for the y-data (default: {None})
    """
    # NOTE(review): unlike add(), no 'raw_data/' prefix is put in front of the name -- confirm this is intended.
    repo_path = name
    # presumably `load` is numpy.load -- the import is outside this block, verify
    x_values = load(filename_X)
    y_values = load(filename_Y) if filename_Y is not None else None
    raw_data = repo_objects.RawData(x_values, x_names, y_values, y_names,
                                    repo_info={RepoInfoKey.NAME: repo_path})
    commit_message = 'data ' + repo_path + ' added to repository'
    version = self._repo.add(raw_data, commit_message, category=MLObjectType.RAW_DATA)
    stored = self._repo.get(repo_path, version=version, full_object=False)
    setattr(self, name, _RawDataItem(repo_path, self._repo, stored))
def test_validation(self):
    """Check that RawData rejects inconsistent data / name combinations."""
    x_values = np.zeros([100, 2])
    x_names = ['x1', 'x2']

    # two x columns but no x coordinate names
    with self.assertRaises(Exception):
        repo_objects.RawData(x_values, x_coord_names=[])

    # y_data has 99 rows while x_data has 100
    short_y = np.zeros([99, 2])
    with self.assertRaises(Exception):
        repo_objects.RawData(x_values, x_coord_names=x_names,
                             y_data=short_y, y_coord_names=['y1', 'y2'])

    # two y columns but only one y coordinate name
    full_y = np.zeros([100, 2])
    with self.assertRaises(Exception):
        repo_objects.RawData(x_values, x_coord_names=x_names,
                             y_data=full_y, y_coord_names=['y1'])
def test_repo_RawData(self):
    """Add a RawData object to a fresh repo, read it back and check the commit log."""
    repo = MLRepo(user='******')
    repo._job_runner = SimpleJobRunner(repo)

    name_key = repo_objects.RepoInfoKey.NAME.value
    original = repo_objects.RawData(  # pylint: disable=E0602
        np.zeros([10, 1]), ['test_coord'], repo_info={name_key: 'RawData_Test'})
    repo.add(original, 'test commit', MLObjectType.RAW_DATA)

    # the object read back must carry the same coordinate names
    restored = repo.get('RawData_Test')
    self.assertEqual(len(original.x_coord_names), len(restored.x_coord_names))
    self.assertEqual(original.x_coord_names[0], restored.x_coord_names[0])

    # exactly one commit containing exactly one object is expected
    commit_log = repo.get_commits()
    self.assertEqual(len(commit_log), 1)
    self.assertEqual(len(commit_log[0].objects), 1)
def setUp(self):
    '''Build a complete ML repo fixture: three raw data objects, one training and
    two test data sets, preprocessing, eval/training functions, training parameter,
    a model definition, the measure configuration and a calibrated model.
    '''
    self.repository = MLRepo(user='******')
    self.repository._job_runner = SimpleJobRunner(self.repository)

    name_key = repo_objects.RepoInfoKey.NAME.value
    category_key = repo_objects.RepoInfoKey.CATEGORY

    # dummy raw data: three identically shaped objects that differ only by name
    for raw_name in ('raw_1', 'raw_2', 'raw_3'):
        dummy_raw = repo_objects.RawData(
            np.zeros([10, 1]), ['x0'], np.zeros([10, 1]), ['y0'],
            repo_info={name_key: raw_name})
        self.repository.add(dummy_raw, category=MLObjectType.RAW_DATA)

    # dummy training and test data sets on top of the raw data, added in one call
    data_sets = [
        DataSet('raw_1', 0, None, repo_info={
            name_key: 'training_data_1',
            category_key: MLObjectType.TRAINING_DATA}),
        DataSet('raw_2', 0, None, repo_info={
            name_key: 'test_data_1',
            category_key: MLObjectType.TEST_DATA}),
        DataSet('raw_3', 0, 2, repo_info={
            name_key: 'test_data_2',
            category_key: MLObjectType.TEST_DATA}),
    ]
    self.repository.add(data_sets)

    # dummy preprocessor built from a transforming and a fitting function
    self.repository.add_preprocessing_transforming_function(
        preprocessor_transforming_function_test, repo_name='transform_func')
    self.repository.add_preprocessing_fitting_function(
        preprocessor_fitting_function_test, repo_name='fit_func')
    self.repository.add_preprocessor(
        'test_preprocessor_with_fitting', 'transform_func', 'fit_func',
        preprocessor_param=None)

    # evaluation and training functions
    self.repository.add_eval_function(eval_func_test, 'eval_func')
    self.repository.add_training_function(train_func_test, 'train_func')

    # dummy training parameter
    training_param = TestClass(  # pylint: disable=E1123
        1, 2, repo_info={
            name_key: 'training_param',
            category_key: MLObjectType.TRAINING_PARAM})
    self.repository.add(training_param)

    # dummy model definition referring to the objects registered above
    self.repository.add_model(
        'model', 'eval_func', 'train_func',
        preprocessors=['test_preprocessor_with_fitting'])

    # measure configuration, then a dummy calibrated model
    self._setup_measure_config()
    self._add_calibrated_model()