def process_inputs(self, value):
    """Parse one ``"name:type"``, ``"name:List[type]"`` or ``"name:type:id"``
    member specification and update the data model's bookkeeping lists.

    Raises
    ------
    ValueError
        If the type is ``List[object]`` or not a recognised type at all.
    """
    parts = value.split(':', 1)
    member = Member(name=parts[0], type_=parts[1])
    self.names.append(member.name)
    # a ':id' suffix marks the member as part of the class identity
    if member.type_ in ('int:id', 'str:id'):
        self.ids.append(f'self.{member.name}')
        member.type_ = member.type_[:-3]
    self.arguments.append(f'{member.name}: {member.type_}')
    self.initialisations.append(
        f'{member.name}={DEFAULT_INITIALISATIONS.get(member.type_, "None")}')
    # unwrap List[...] and remember that typing.List must be imported
    if member.type_.startswith('List['):
        self.isListDependent = True
        member.type_ = member.type_[5:-1]
        member.isList = True
    if member.type_ == 'object':
        if member.isList:
            raise ValueError('List[object] is an invalid type')
        member.isObject = True
        self.conversions.append(member)
    elif member.type_ in TEMPORAL:
        # datetime/date/time values round-trip through ISO strings
        member.to_ = 'isoformat'
        member.from_ = 'fromisoformat'
        member.isTemporal = True
        self.conversions.append(member)
    elif self.project.is_data_model_class(NameString(member.type_)):
        # nested data model classes round-trip through their own to_data/from_data
        member.to_ = 'to_data'
        member.from_ = 'from_data'
        member.type_ = NameString(member.type_)
        self.conversions.append(member)
    elif member.type_ not in ('int', 'str', 'float'):
        raise ValueError(f'Unknown type: {value}')
def _init_known_class_lists(self): self._dataModelClasses = [] self._taskClasses = [] self._modelBlockClasses = [] if os.path.exists(self.dataModelsPath): dataModelFiles = glob.glob(str(Path(self.dataModelsPath, '[!_][!_]*.py'))) self._dataModelClasses = [NameString(os.path.split(filePath)[1][:-3]) for filePath in dataModelFiles] if os.path.exists(self.tasksPath): taskFiles = glob.glob(str(Path(self.projectDirectory, 'tasks', '[!_][!_]*.py'))) self._taskClasses = [NameString(os.path.split(filePath)[1][:-3]) for filePath in taskFiles] if os.path.exists(self.blocksPath): blockFiles = glob.glob(str(Path(self.projectDirectory, 'models', 'blocks', '[!_][!_]*.py'))) self._modelBlockClasses = [NameString(os.path.split(filePath)[1][:-3]) for filePath in blockFiles]
def list_datasets(self, pattern=None, asCode=False):
    """Convenience function to list datasets for a project

    Returns a list of data loaded from the ``.def`` files in the directory

    Parameters
    ----------
    pattern : string (None)
        Regex pattern to filter on dataset names, if unspecified, defaults to ``.*``
    asCode : bool (False)
        If True prints a code snippet that allows the dataset to be loaded (with imports and path updates)

    Returns
    -------
    result : List[dict]
        Parsed contents of every matching ``.def`` file
    """
    if pattern is None:
        pattern = '.*'
    dataPath = Path(self.dataDirectory, self.projectName.asSnake)
    result = []
    for pathName, _, fileNames in os.walk(dataPath):
        for fileName in fileNames:
            # match the pattern against the dataset name (filename without '.def')
            if fileName.endswith('.def') and re.match(pattern, fileName[:-4]) is not None:
                # context manager so the file handle is closed promptly
                # (the original json.load(open(...)) leaked the handle)
                with open(Path(pathName, fileName), 'rt') as defFile:
                    data = json.load(defFile)
                result.append(data)
                if asCode:
                    values = {**data, **data['repo']}
                    values['location'] = self.dataDirectory
                    # keep the snippet on one line even for multi-line commit messages
                    values['commitMessage'] = values['commitMessage'].replace('\n', '\\n')
                    values['dataTypeFile'] = NameString(name=values['dataType']).asSnake
                    values['projectDirectory'] = self.projectDirectory
                    print(DATASET_TEMPLATE.format(**values))
    return result
def create_model_block(className, projectDirectory='.', dryrun=None, force=None):
    """Generates a Model Block class.

    The file will be located in ``project_name/models/blocks/block_name.py``

    Parameters
    ----------
    className : string (CamelCase)
        Name of the class to be created
    projectDirectory : string (default='.')
        Location of the project directory
    dryrun : bool (default=None)
        If set to ``True`` it returns the generated code as a string
    force : bool (default=None)
        If set to ``True`` it overwrites the target file
    """
    hypergolProject = HypergolProject(projectDirectory=projectDirectory, dryrun=dryrun, force=force)
    blockName = NameString(className)
    targetPath = Path(projectDirectory, 'models', 'blocks', blockName.asFileName)
    blockContent = hypergolProject.render(
        templateName='model_block.py.j2',
        templateData={'className': blockName},
        filePath=targetPath
    )
    return hypergolProject.cli_final_message(creationType='ModelBlock', name=blockName, content=(blockContent, ))
def create_old_data_model(self, commit, *args):
    """Convenience function to generate data model classes at an old commit to be able to load datasets created then

    Full commit hash required.

    ``project.create_old_data_model('fbd8110b7194425e2323f68ef54dac15bb01ee7b', 'OneClass', 'TwoClass')``

    Will create ``data_models/one_class_fbd8110.py`` and ``data_models/two_class_fbd8110.py`` and
    replaces all occurrences of ``OneClass`` and ``TwoClass`` to ``OneClassFBD8110`` and
    ``TwoClassFBD8110`` in each file.

    Parameters
    ----------
    commit : string
        git commit to retrieve classes from
    args : List[string]
        List of class names to generate, if empty it generates all

    Returns
    -------
    result : List[string]
        The generated file contents when in dryrun mode, otherwise an empty list
    """
    if len(args) == 0:
        names = self._dataModelClasses
    else:
        names = [NameString(name) for name in args]
    result = []
    repo = Repo(self.projectDirectory)
    if repo.is_dirty():
        # uncommitted changes mean the checked-out files may not match any commit
        print('Warning! The current git repo is dirty; this will result in incorrect data_model_files created.')
    for name in names:
        # retrieve the file content exactly as it was at the given commit
        content = repo.git.show(f'{commit}:data_models/{name.asSnake}.py')
        # suffix every known class name and module reference with the short commit
        # hash so the old definitions can coexist with the current ones
        for oldName in names:
            content = content.replace(oldName.asClass, f'{oldName.asClass}{commit[:7].upper()}')
            content = content.replace(f'data_models.{oldName.asSnake}', f'data_models.{oldName.asSnake}_{commit[:7]}')
        if self.isDryRun:
            result.append(content)
            print(f'DRYRUN - Creating class {name.asClass}{commit[:7].upper()} in {name.asSnake}_{commit[:7]}.py')
            print(content+'\n')
        else:
            print(f'Creating class {name.asClass}{commit[:7].upper()} in {name.asSnake}_{commit[:7]}.py')
            with open(Path(self.dataModelsPath, f'{name.asSnake}_{commit[:7]}.py'), 'wt') as outFile:
                outFile.write(content+'\n')
    # refresh the cached class lists so the newly written files are recognised
    self._init_known_class_lists()
    return result
def diff_data_model(self, commit, *args):
    """Convenience function to compare old data model class definitions to the current one

    Prints the diffs from the specified commit to the current commit

    Parameters
    ----------
    commit : string
        The git commit from where the comparison starts
    *args : List[string]
        List of class names to compare, if empty it compares all
    """
    classNames = self._dataModelClasses if len(args) == 0 else [NameString(name) for name in args]
    gitRepo = Repo(self.projectDirectory)
    if gitRepo.is_dirty():
        print('Warning! Current git repo is dirty, this will result in incorrect diff')
    headCommit = gitRepo.commit().hexsha
    for className in classNames:
        relativePath = f'data_models/{className.asSnake}.py'
        print(f'------ {relativePath} ------')
        print(gitRepo.git.diff(commit, headCommit, relativePath))
def __init__(self, projectName, methodName):
    """Set up the test case for a project.

    Parameters
    ----------
    projectName : string (CamelCase)
        Name of the project under test
    methodName : string
        Name of the test method to run (forwarded to the TestCase base)
    """
    super(HypergolCreateTestCase, self).__init__(methodName=methodName)
    self.projectName = projectName
    # the snake_case form of the project name doubles as its directory name
    self.projectDirectory = NameString(self.projectName).asSnake
    # NOTE(review): presumably collects created paths for teardown — confirm against subclass usage
    self.allPaths = []
def __init__(self, projectDirectory=None, dataDirectory='.', chunkCount=16, dryrun=None, force=None, repoManager=None):
    """
    Parameters
    ----------
    projectDirectory : string
        location of the project: e.g.: ``~/repo_name``, models will be in ``~/repo_name/models``
    dataDirectory : string
        location of the data for the project: e.g.: ``~/data``, files will be stored in ``~/data/repo_name``
    chunkCount : int (default=16)
        passed through to the :class:`DatasetFactory`
    dryrun : bool (default=None)
        If set to ``True`` it returns the generated code as a string
    force : bool (default=None)
        If set to ``True`` it overwrites the target file
    repoManager : RepoManager (default=None)
        if unspecified a RepoManager is created for the project directory
    """
    if force and dryrun:
        raise ValueError('Both force and dryrun are set')
    if projectDirectory is None:
        projectDirectory = os.getcwd()
    # normalise away a single trailing slash so basename/path joins behave
    if projectDirectory.endswith('/'):
        projectDirectory = projectDirectory[:-1]
    if dataDirectory.endswith('/'):
        dataDirectory = dataDirectory[:-1]
    if repoManager is None:
        # force implies we tolerate a dirty repo
        repoManager = RepoManager(repoDirectory=projectDirectory, raiseIfDirty=not force)
    self.repoManager = repoManager
    # the project is named after its directory
    self.projectName = NameString(os.path.basename(projectDirectory))
    self.projectDirectory = projectDirectory
    self.dataDirectory = dataDirectory
    self.dataModelsPath = Path(projectDirectory, 'data_models')
    self.tasksPath = Path(projectDirectory, 'tasks')
    self.pipelinesPath = Path(projectDirectory, 'pipelines')
    self.modelsPath = Path(projectDirectory, 'models')
    self.blocksPath = Path(projectDirectory, 'models', 'blocks')
    self.testsPath = Path(projectDirectory, 'tests')
    # cache the class names found in the directories above
    self._init_known_class_lists()
    # templates ship inside the installed hypergol package
    self.templateEnvironment = jinja2.Environment(
        loader=jinja2.FileSystemLoader(
            searchpath=Path(hypergol.__path__[0], 'cli', 'templates')
        )
    )
    # dryrun wins over force; both unset means normal file creation
    self.mode = Mode.DRY_RUN if dryrun else Mode.FORCE if force else Mode.NORMAL
    if not self.repoManager.repoExists:
        # without a git repo no dataset lineage can be recorded, so all
        # data-related members are disabled
        self.datasetFactory = None
        self.tensorboardPath = None
        self.modelDataPath = None
        print('Repo does not exist, data related functionality disabled.')
        return
    self.datasetFactory = DatasetFactory(
        location=self.dataDirectory,
        project=self.projectName.asSnake,
        branch=self.repoManager.branchName,
        chunkCount=chunkCount,
        # NOTE: 'comitter' spelling matches the RepoData/RepoManager field names
        repoData=RepoData(
            branchName=self.repoManager.branchName,
            commitHash=self.repoManager.commitHash,
            commitMessage=self.repoManager.commitMessage,
            comitterName=self.repoManager.comitterName,
            comitterEmail=self.repoManager.comitterEmail
        )
    )
    self.tensorboardPath = Path(dataDirectory, self.projectName.asSnake, 'tensorboard', self.repoManager.branchName)
    self.modelDataPath = Path(dataDirectory, self.projectName.asSnake, self.repoManager.branchName, 'models')
def get_class_name(name):
    """Return the class name for *name* as a NameString, mapping plurals to their singular.

    Raises
    ------
    ValueError
        If *name* looks plural (ends with 's') but has no entry in ``plurals``.
    """
    looksPlural = name[-1] == 's'
    if looksPlural and name not in plurals:
        raise ValueError(f'{name} not in plurals')
    resolvedName = plurals.get(name, name)
    return NameString(resolvedName)
def create_model(modelName, trainingClass, evaluationClass, inputClass, outputClass, *args, projectDirectory='.', dryrun=None, force=None):
    """Generates stubs for the Tensorflow model, data processing class and training script and shell script to run it from the command line.

    Shell scripts will be located in the project main directory (which should be the current directory when running them) and model files will be located in ``project_name/models/model_name/*.py``.

    After creation the user must implement the ``process_training_batch()`` , ``process_evaluation_batch()``, ``process_input_batch()`` and ``process_output_batch`` member functions that take ``trainingClass``, ``evaluationClass``, ``inputClass`` and ``outputClass`` respectively.

    The model must implement the ``get_loss()``, ``produce_metrics()`` and ``get_outputs()`` functions (see documentation of :class:`.BaseTensorflowModel` and the ``Tutorial`` for more detailed instructions)

    The training script is generated with example stubs that should be modified to align with the created model.

    Parameters
    ----------
    modelName : string
        Name of the model
    trainingClass : BaseData
        Datamodel class (must exist) of the Dataset that contains the training data
    evaluationClass : BaseData
        Datamodel class (must exist) that will contain the evaluation data
    inputClass : BaseData
        Datamodel class (must exist) that will be used as the input when serving the model
    outputClass : BaseData
        Datamodel class (must exist) that will be returned as output when serving the model
    *args : BaseTensorflowModelBlock
        Names of blocks that will build up the model
    """
    project = HypergolProject(projectDirectory=projectDirectory, dryrun=dryrun, force=force)
    modelName = NameString(modelName)
    trainingClass = NameString(trainingClass)
    evaluationClass = NameString(evaluationClass)
    inputClass = NameString(inputClass)
    outputClass = NameString(outputClass)
    blocks = [NameString(value) for value in args]
    # fails if any referenced data model or block class does not exist yet
    project.check_dependencies([trainingClass, evaluationClass, inputClass, outputClass] + blocks)
    project.create_model_directory(modelName=modelName)
    # make the model directory an importable package
    project.render_simple(templateName='__init__.py.j2', filePath=Path(project.modelsPath, modelName.asSnake, '__init__.py'))
    # the model class stub itself
    content = project.render(
        templateName='model.py.j2',
        templateData={'name': modelName},
        filePath=Path(projectDirectory, 'models', modelName.asSnake, modelName.asFileName))
    # batch processor stub converting datasets to/from model batches
    batchProcessorContent = project.render(
        templateName='batch_processor.py.j2',
        templateData={
            'name': modelName,
            'evaluationClass': evaluationClass,
            'outputClass': outputClass,
        },
        filePath=Path(projectDirectory, 'models', modelName.asSnake, f'{modelName.asSnake}_batch_processor.py'))
    # training script; only blocks that are known model block classes are imported
    trainModelContent = project.render(
        templateName='train_model.py.j2',
        templateData={
            'modelName': modelName,
            'trainingClass': trainingClass,
            'evaluationClass': evaluationClass,
            'blockDependencies': [name for name in blocks if project.is_model_block_class(name)],
        },
        filePath=Path(projectDirectory, 'models', modelName.asSnake, f'train_{modelName.asFileName}'))
    # shell wrapper for training, placed in the project root
    scriptContent = project.render_executable(
        templateName='train_model.sh.j2',
        templateData={'snakeName': modelName.asSnake},
        filePath=Path(projectDirectory, f'train_{modelName.asSnake}.sh'))
    # serving script stub
    serveContent = project.render(
        templateName='serve_model.py.j2',
        templateData={
            'modelName': modelName,
            'inputClass': inputClass,
            'outputClass': outputClass
        },
        filePath=Path(projectDirectory, 'models', modelName.asSnake, f'serve_{modelName.asFileName}'))
    # shell wrapper for serving, placed in the project root
    serveScriptContent = project.render_executable(
        templateName='serve_model.sh.j2',
        templateData={'snakeName': modelName.asSnake},
        filePath=Path(projectDirectory, f'serve_{modelName.asSnake}.sh'))
    return project.cli_final_message(
        creationType='Model',
        name=modelName,
        content=(content, batchProcessorContent, trainModelContent, scriptContent, serveContent, serveScriptContent))
def test_name_string_has_correct_properties(self):
    """Check that every casing variant of a NameString is derived correctly."""
    nameString = NameString(name='TestClass')
    expectations = [
        ('asSnake', 'test_class'),
        ('asClass', 'TestClass'),
        ('asVariable', 'testClass'),
        ('asPluralVariable', 'testClasses'),
    ]
    for propertyName, expectedValue in expectations:
        self.assertEqual(getattr(nameString, propertyName), expectedValue)
def test_name_string_equality(self):
    """NameStrings built from different casings of the same name compare equal."""
    camelCased = NameString('GitHub')
    snakeCased = NameString('git_hub')
    self.assertEqual(camelCased, snakeCased)
def test_name_string_returns_correct_plural(self):
    """An explicitly supplied plural drives both plural casing variants."""
    nameString = NameString(name='BigCity', plural='BigCities')
    self.assertEqual(nameString.asPluralVariable, 'bigCities')
    self.assertEqual(nameString.asPluralSnake, 'big_cities')
def create_data_model(className, *args, projectDirectory='.', dryrun=None, force=None, project=None):
    """Generates domain class from the parameters derived from :class:`.BaseData`

    Fails if the target file already exists unless ``force=True`` or ``--force`` in CLI is set.

    Parameters
    ----------
    className : string (CamelCase)
        Name of the class to be created
    projectDirectory : string (default='.')
        Location of the project directory, the code will be created in ``projectDirectory/data_models/class_name.py``.
    dryrun : bool (default=None)
        If set to ``True`` it returns the generated code as a string
    force : bool (default=None)
        If set to ``True`` it overwrites the target file
    *args : List of strings
        member variables string representation of the member variable in "name:type", "name:List[type]" or "name:type:id" format

    Returns
    -------
    content : string
        The generated code if ``dryrun`` is specified
    """
    if project is None:
        project = HypergolProject(projectDirectory=projectDirectory, dryrun=dryrun, force=force)
    dataModel = DataModel(className=NameString(className), project=project)
    # parse each "name:type[:id]" argument into the data model's bookkeeping lists
    for value in args:
        dataModel.process_inputs(value)
    # datetime names the generated file must import
    temporalDependencies = sorted(list({m.type_ for m in dataModel.conversions if m.isTemporal}))
    # other data model classes the generated file must import
    dataModelDependencies = [{'snake': m.type_.asSnake, 'name': m.type_} for m in dataModel.conversions if not m.isTemporal and not m.isObject]
    # each .add() emits one line of the generated class; the second argument is
    # either a condition (line included when truthy) or a list (line repeated per item)
    # NOTE(review): the line strings carry a trailing space — presumably consumed by
    # DataModelRenderer; confirm before changing
    content = (
        DataModelRenderer()
        .add('from typing import List ', dataModel.isListDependent)
        .add('from datetime import {0} ', temporalDependencies)
        .add(' ', dataModel.isListDependent or len(temporalDependencies) > 0)
        .add('from hypergol import BaseData ')
        .add(' ', len(dataModelDependencies) > 0)
        .add('from data_models.{snake} import {name}', dataModelDependencies)
        .add(' ')
        .add(' ')
        .add('class {className}(BaseData): ', className=dataModel.className)
        .add(' ')
        .add('    def __init__(self, {arguments}): ', arguments=', '.join(dataModel.arguments))
        .add('        self.{0} = {0} ', dataModel.names)
        .add(' ', len(dataModel.ids) > 0)
        .add('    def get_id(self): ', len(dataModel.ids) > 0)
        .add('        return ({idString}, ) ', len(dataModel.ids) > 0, idString=', '.join(dataModel.ids))
        .add(' ', len(dataModel.conversions) > 0)
        .add('    def to_data(self): ', len(dataModel.conversions) > 0)
        .add('        data = self.__dict__.copy() ', len(dataModel.conversions) > 0)
        .add("        data['{name}'] = BaseData.to_string(data['{name}']) ", [{'name': m.name} for m in dataModel.conversions if m.isObject])
        .add("        data['{name}'] = data['{name}'].{conv}() ", [{'name': m.name, 'conv': m.to_} for m in dataModel.conversions if not m.isList and not m.isObject])
        .add("        data['{name}'] = [v.{conv}() for v in data['{name}']] ", [{'name': m.name, 'conv': m.to_} for m in dataModel.conversions if m.isList])
        .add('        return data ', len(dataModel.conversions) > 0)
        .add(' ', len(dataModel.conversions) > 0)
        .add('    @classmethod ', len(dataModel.conversions) > 0)
        .add('    def from_data(cls, data): ', len(dataModel.conversions) > 0)
        .add("        data['{name}'] = BaseData.from_string(data['{name}']) ", [{'name': m.name} for m in dataModel.conversions if m.isObject])
        .add("        data['{name}'] = {type_}.{conv}(data['{name}']) ", [{'name': m.name, 'type_': str(m.type_), 'conv': m.from_} for m in dataModel.conversions if not m.isList and not m.isObject])
        .add("        data['{name}'] = [{type_}.{conv}(v) for v in data['{name}']] ", [{'name': m.name, 'type_': str(m.type_), 'conv': m.from_} for m in dataModel.conversions if m.isList])
        .add('        return cls(**data) ', len(dataModel.conversions) > 0)
    ).get()
    project.create_text_file(content=content, filePath=Path(project.dataModelsPath, dataModel.className.asFileName))
    # generate a matching unit test stub for the new class
    project.render(
        templateName='test_data_models.py.j2',
        templateData={'name': dataModel.className, 'initialisations': ', '.join(dataModel.initialisations)},
        filePath=Path(project.testsPath, f'test_{dataModel.className.asFileName}'))
    return project.cli_final_message(creationType='Class', name=dataModel.className, content=(content, ))
def create_project(projectName, dryrun=None, force=None):
    """Generates the project directories and files

    Fails if the target directory already exists unless ``force=True`` or ``--force`` in CLI is set.

    Directories:

    - ``data_models`` with ``__init__.py``
    - ``pipelines`` with ``__init__.py``
    - ``tasks`` with ``__init__.py``
    - ``models`` with ``__init__.py``
    - ``models/blocks`` with ``__init__.py``
    - ``tests``

    Executables:

    - ``make_venv.sh`` to create a virtual environment
    - ``run_tests.sh`` to run tests
    - ``run_pylint.sh`` to run linting

    Misc:

    - ``requirements.txt``
    - ``.gitignore``
    - ``README.md``
    - ``LICENSE``  <- Don't forget to add current year and your name or change it to the one you want
    - ``pylintrc``

    Parameters
    ----------
    projectName : string (CamelCase)
        Name of the project to be created
    dryrun : bool (default=None)
        If set to ``True`` it returns the generated code as a string
    force : bool (default=None)
        If set to ``True`` it overwrites the target file
    """
    projectName = NameString(projectName)
    # the project directory is the snake_case form of the project name
    project = HypergolProject(projectDirectory=projectName.asSnake, dryrun=dryrun, force=force)
    project.create_project_directory()
    # each package directory gets an __init__.py so it is importable
    project.create_data_models_directory()
    project.render_simple(templateName='__init__.py.j2', filePath=Path(project.dataModelsPath, '__init__.py'))
    project.create_tasks_directory()
    project.render_simple(templateName='__init__.py.j2', filePath=Path(project.tasksPath, '__init__.py'))
    project.create_pipelines_directory()
    project.render_simple(templateName='__init__.py.j2', filePath=Path(project.pipelinesPath, '__init__.py'))
    project.create_models_directory()
    project.render_simple(templateName='__init__.py.j2', filePath=Path(project.modelsPath, '__init__.py'))
    project.create_blocks_directory()
    project.render_simple(templateName='__init__.py.j2', filePath=Path(project.blocksPath, '__init__.py'))
    project.create_tests_directory()
    # helper shell scripts in the project root
    makeVenvScript = project.render_executable(templateName='make_venv.sh.j2', templateData={}, filePath=Path(project.projectDirectory, 'make_venv.sh'))
    runTestScript = project.render_executable(templateName='run_tests.sh.j2', templateData={}, filePath=Path(project.projectDirectory, 'run_tests.sh'))
    runPylintScript = project.render_executable(templateName='run_pylint.sh.j2', templateData={}, filePath=Path(project.projectDirectory, 'run_pylint.sh'))
    # misc top-level project files
    requirementsContent = project.render_simple(templateName='requirements.txt.j2', filePath=Path(project.projectDirectory, 'requirements.txt'))
    gitignoreContent = project.render_simple(templateName='.gitignore.j2', filePath=Path(project.projectDirectory, '.gitignore'))
    readmeContent = project.render_simple(templateName='README.md.j2', filePath=Path(project.projectDirectory, 'README.md'))
    licenseContent = project.render_simple(templateName='LICENSE.j2', filePath=Path(project.projectDirectory, 'LICENSE'))
    pylintrcContent = project.render_simple(templateName='pylintrc.j2', filePath=Path(project.projectDirectory, 'pylintrc'))
    allContent = (makeVenvScript, runTestScript, runPylintScript, requirementsContent, gitignoreContent, readmeContent, licenseContent, pylintrcContent)
    return project.cli_final_message(creationType='Project', name=projectName, content=allContent)
def create_pipeline(pipeLineName, *args, projectDirectory='.', dryrun=None, force=None, project=None):
    """Generates a pipeline script from the parameters

    Fails if the target file already exists unless ``force=True`` or ``--force`` in CLI is set.

    Generates pipe_line_name.py in pipelines, imports all the classes listed in ``*args`` and
    creates stubs for them to be filled. Also creates the executable ``pipe_line_name.sh`` in
    the project directory with examples how to pass parameters from the shell.

    Parameters
    ----------
    pipeLineName : string (CamelCase)
        Name of the pipeline to be created
    projectDirectory : string (default='.')
        Location of the project directory, the code will be created in ``projectDirectory/data_models/class_name.py``.
    dryrun : bool (default=None)
        If set to ``True`` it returns the generated code as a string
    force : bool (default=None)
        If set to ``True`` it overwrites the target file
    *args : List of strings (CamelCase)
        Classes to be imported into the generated code from the data model, fails if class not found in either ``data_models`` or ``tasks``

    Returns
    -------
    content : string
        The generated code if ``dryrun`` is specified
    scriptContent : string
        The generated shell script to run the pipeline if ``dryrun`` is specified
    """
    if project is None:
        project = HypergolProject(projectDirectory=projectDirectory, dryrun=dryrun, force=force)
    pipeLineName = NameString(pipeLineName)
    dependencies = [NameString(value) for value in args]
    # fails if any dependency is neither a known task nor a known data model class
    project.check_dependencies(dependencies)
    taskDependencies = [name for name in dependencies if project.is_task_class(name)]
    dataModelDependencies = [name for name in dependencies if project.is_data_model_class(name)]
    content = project.render(
        templateName='pipeline.py.j2',
        templateData={
            'snakeName': pipeLineName.asSnake,
            'taskDependencies': taskDependencies,
            'dataModelDependencies': dataModelDependencies
        },
        filePath=Path(projectDirectory, 'pipelines', pipeLineName.asFileName))
    scriptContent = project.render_executable(
        templateName='pipeline.sh.j2',
        templateData={'snakeName': pipeLineName.asSnake},
        filePath=Path(projectDirectory, f'{pipeLineName.asSnake}.sh'))
    return project.cli_final_message(creationType='PipeLine', name=pipeLineName, content=(content, scriptContent))