def export(self, bucket, tag, retry):
    """Export the files referenced by *tag* to the given storage *bucket*.

    Validates the tag and repository, checks out the tag, delegates the
    upload to LocalRepository.export_tag, then restores the previous ref.
    Every failure path logs and returns (None, None).
    """
    try:
        _categories_path, _spec_name, _ = spec_parse(tag)
        get_root_path()
        if not self._tag_exists(tag):
            return None, None
    except InvalidGitRepositoryError:
        log.error('You are not in an initialized ml-git repository.', class_name=LOCAL_REPOSITORY_CLASS_NAME)
        return None, None
    except Exception as e:
        log.error(e, class_name=LOCAL_REPOSITORY_CLASS_NAME)
        return None, None

    try:
        self._checkout_ref(tag)
    except Exception:
        log.error('Unable to checkout to %s' % tag, class_name=REPOSITORY_CLASS_NAME)
        return None, None

    objects_path = get_objects_path(self.__config, self.__repo_type)
    local = LocalRepository(self.__config, objects_path, self.__repo_type)
    local.export_tag(get_metadata_path(self.__config, self.__repo_type), tag, bucket, retry)
    # Leave the metadata repository back on its default ref.
    self._checkout_ref()
def init_mlgit():
    """Create an empty ml-git repository in the current working directory.

    No-op (with an informative log) when already inside a repository.
    """
    try:
        repo_root = get_root_path()
        log.info('You already are in a ml-git repository (%s)' % (os.path.join(repo_root, ROOT_FILE_NAME)), class_name=ADMIN_CLASS_NAME)
        return
    except Exception:
        # Not inside a repository yet; proceed with initialization.
        pass

    try:
        os.mkdir('.ml-git')
    except FileExistsError:
        # Directory already present; reuse it.
        pass
    except PermissionError:
        log.error('Permission denied. You need write permission to initialize ml-git in this directory.', class_name=ADMIN_CLASS_NAME)
        return

    mlgit_config_save()
    repo_root = get_root_path()
    log.info('Initialized empty ml-git repository in %s' % (os.path.join(repo_root, ROOT_FILE_NAME)), class_name=ADMIN_CLASS_NAME)
def test_get_root_path(self):
    """get_root_path must raise RootPathException when the config file is missing.

    The config file is renamed away and restored in a ``finally`` block so a
    failing assertion cannot leave the repository broken for later tests
    (the original code skipped the restore on failure).
    """
    path = get_root_path()
    yaml_path_src = os.path.join(path, '.ml-git', 'config.yaml')
    yaml_path_dst = os.path.join(path, '.ml-git', 'coasdasdasnfig.ylma')
    os.rename(yaml_path_src, yaml_path_dst)
    try:
        self.assertRaises(RootPathException, lambda: get_root_path())
    finally:
        # Always restore the config file, even if the assertion fails.
        os.rename(yaml_path_dst, yaml_path_src)
def check_successfully_clone(project_dir, git_dir):
    """Verify that a freshly cloned tree holds a minimal ml-git configuration.

    On failure the cloned tree (and its git dir) is removed and False is
    returned; otherwise True.
    """
    os.chdir(project_dir)
    try:
        get_root_path()
    except RootPathException:
        clear(project_dir)
        log.error(output_messages['ERROR_MINIMAL_CONFIGURATION'], class_name=ADMIN_CLASS_NAME)
        clear(git_dir)
        return False
    return True
def check_successfully_clone(project_dir, git_dir):
    """Return True when *project_dir* contains a minimal ml-git configuration.

    Otherwise logs an error, removes the cloned tree and git dir, and
    returns False.
    """
    os.chdir(project_dir)
    try:
        get_root_path()
        return True
    except RootPathException:
        clear(project_dir)
        log.error('Wrong minimal configuration files!', class_name=ADMIN_CLASS_NAME)
        clear(git_dir)
        return False
def merged_config_load(hide_logs=False):
    """Load configuration, merging local over global settings.

    Outside of a repository (RootPathException) only the global
    configuration is returned.
    """
    global mlgit_config
    try:
        get_root_path()
        merged_global = merge_conf(global_config_load(hide_logs), mlgit_config)
        local_conf = mlgit_config_load(hide_logs)
        if not hide_logs:
            log.debug(output_messages['DEBUG_MERGING_LOCAL_AND_GLOBAL_CONFIG'])
        result = merge_conf(local_conf, merged_global)
    except RootPathException:
        # No repository root: local config does not exist, use global only.
        result = global_config_load(hide_logs)
    return result
def clone_config_repository(url, folder, track):
    """Clone an ml-git configuration repository from *url*.

    Args:
        url: git URL of the configuration repository.
        folder: optional sub-folder (under cwd) to clone into; when None the
            current working directory is used and must be empty.
        track: when falsy, the cloned '.git' directory is removed afterwards.

    Returns:
        bool: True on success, False on any failure (already initialized,
        non-empty target, clone error, or missing minimal configuration).
    """
    try:
        # get_root_path() raising RootPathException means we are NOT inside
        # an existing ml-git project, which is the state we want here.
        if get_root_path():
            log.error('You are in initialized ml-git project.', class_name=ADMIN_CLASS_NAME)
            return False
    except RootPathException:
        pass
    git_dir = '.git'
    try:
        if folder is not None:
            project_dir = os.path.join(os.getcwd(), folder)
            ensure_path_exists(project_dir)
        else:
            project_dir = os.getcwd()
        if len(os.listdir(project_dir)) != 0:
            log.error('The path [%s] is not an empty directory. Consider using --folder to create an empty folder.' % project_dir, class_name=ADMIN_CLASS_NAME)
            return False
        Repo.clone_from(url, project_dir)
    except Exception as e:
        error_msg = str(e)
        # Distinguish permission problems (keep the directory) from other
        # clone failures (clean up a folder we created ourselves).
        # NOTE(review): e.args[2] is assumed to carry git's stderr for
        # GitCommandError — confirm against the GitPython version in use.
        if (e.__class__ == GitCommandError and 'Permission denied' in str(
                e.args[2])) or e.__class__ == PermissionError:
            error_msg = 'Permission denied in folder %s' % project_dir
        else:
            if folder is not None:
                clear(project_dir)
            if e.__class__ == GitCommandError:
                error_msg = 'Could not read from remote repository.'
        log.error(error_msg, class_name=ADMIN_CLASS_NAME)
        return False
    try:
        # A valid configuration repository must itself look like an ml-git root.
        os.chdir(project_dir)
        get_root_path()
    except RootPathException:
        clear(project_dir)
        log.error('Wrong minimal configuration files!', class_name=ADMIN_CLASS_NAME)
        clear(git_dir)
        return False
    if not track:
        clear(os.path.join(project_dir, git_dir))
    return True
def __init_manager(self, type_entity):
    """Create and store a MetadataManager for *type_entity* on self._manager.

    Warns and leaves the manager unset when no git remote is configured for
    the entity type; initializes the metadata repository if absent.
    """
    try:
        get_root_path()
        conf = config_load()
        if not conf[type_entity]['git']:
            log.warn(output_messages['WARN_REPOSITORY_NOT_FOUND_FOR_ENTITY'] % type_entity,
                     class_name=LocalEntityManager.__name__)
            return
        manager = MetadataManager(conf, repo_type=type_entity)
        self._manager = manager
        if not manager.check_exists():
            manager.init()
    except Exception as e:
        log.error(e, class_name=LocalEntityManager.__name__)
def __get_log_files_path():
    """Resolve the directory for log files.

    Uses the repository root when inside one, otherwise the current
    working directory.
    """
    try:
        base_dir = get_root_path()
    except RootPathException:
        base_dir = os.getcwd()
    return os.path.join(base_dir, LOG_FILES_PATH)
def garbage_collector(self):
    """Remove unreferenced blobs from cache and object store for every entity type.

    Iterates all EntityType values that have metadata, collects the set of
    referenced blob hashes, and asks the cache and object store to drop
    everything else. Totals of removed files and reclaimed bytes are logged
    at the end; if no entity type has metadata, an error is logged instead.
    """
    any_metadata = False
    removed_files = 0
    reclaimed_space = 0
    for entity in EntityType:
        repo_type = entity.value
        if self.metadata_exists(repo_type):
            log.info(output_messages['INFO_STARTING_GC'] % repo_type, class_name=REPOSITORY_CLASS_NAME)
            any_metadata = True
            index_path = get_index_path(self.__config, repo_type)
            objects_path = get_objects_path(self.__config, repo_type)
            # Hashes still referenced by the index; everything else is garbage.
            blobs_hashes = self._get_blobs_hashes(index_path, objects_path, repo_type)
            cache = Cache(get_cache_path(self.__config, repo_type))
            count_removed_cache, reclaimed_cache_space = cache.garbage_collector(blobs_hashes)
            objects = Objects('', objects_path)
            count_removed_objects, reclaimed_objects_space = objects.garbage_collector(blobs_hashes)
            reclaimed_space += reclaimed_objects_space + reclaimed_cache_space
            removed_files += count_removed_objects + count_removed_cache
    if not any_metadata:
        log.error(output_messages['ERROR_UNINITIALIZED_METADATA'], class_name=REPOSITORY_CLASS_NAME)
        return
    log.info(output_messages['INFO_REMOVED_FILES'] % (humanize.intword(removed_files), os.path.join(get_root_path(), '.ml-git')), class_name=REPOSITORY_CLASS_NAME)
    log.info(output_messages['INFO_RECLAIMED_SPACE'] % humanize.naturalsize(reclaimed_space), class_name=REPOSITORY_CLASS_NAME)
def clone_config_repository(url, folder, track):
    """Clone an ml-git configuration repository from *url*.

    Returns True on success; logs and returns False on any failure.
    When *track* is falsy the cloned '.git' directory is removed.
    """
    try:
        if get_root_path():
            log.error(output_messages['ERROR_IN_INTIALIZED_PROJECT'], class_name=ADMIN_CLASS_NAME)
            return False
    except RootPathException:
        # Not inside a project yet — exactly the precondition we need.
        pass

    git_dir = '.git'
    try:
        if folder is None:
            project_dir = os.getcwd()
        else:
            project_dir = os.path.join(os.getcwd(), folder)
            ensure_path_exists(project_dir)
        if len(os.listdir(project_dir)) != 0:
            log.error(output_messages['ERROR_PATH_NOT_EMPTY'] % project_dir, class_name=ADMIN_CLASS_NAME)
            return False
        Repo.clone_from(url, project_dir)
    except Exception as e:
        log.error(handle_clone_exception(e, folder, project_dir), class_name=ADMIN_CLASS_NAME)
        return False

    if not check_successfully_clone(project_dir, git_dir):
        return False
    if not track:
        clear(os.path.join(project_dir, git_dir))
    return True
def search_spec_file(repotype, spec, categories_path):
    """Locate the directory and file name of an entity's spec file.

    Tries the categorized layout (<root>/<repotype>/<categories>/<spec>)
    first, then the flat layout (<root>/<repotype>/<spec>).

    Returns:
        tuple(str, str): (directory, file name) of the first file whose
        name contains *spec*.

    Raises:
        SearchSpecException: when neither directory exists or no matching
        file is found.
    """
    root_path = get_root_path()
    dir_with_cat_path = os.path.join(root_path, repotype, categories_path, spec)
    dir_without_cat_path = os.path.join(root_path, repotype, spec)
    files = None
    dir_files = None
    try:
        files = os.listdir(dir_with_cat_path)
        dir_files = dir_with_cat_path
    except Exception:
        try:
            files = os.listdir(dir_without_cat_path)
            dir_files = dir_without_cat_path
        except Exception:
            # TODO: search '.' path as well
            # if 'files_without_cat_path' and 'files_with_cat_path' remains as None, the system couldn't find the directory
            # which means that the entity name passed is wrong
            if files is None:
                raise SearchSpecException('The entity name passed is wrong. Please check again')
    if len(files) > 0:
        for file in files:
            if spec in file:
                log.debug('search spec file: found [%s]-[%s]' % (dir_files, file), class_name=ML_GIT_PROJECT_NAME)
                return dir_files, file
    # Directory existed but no file mentioning the spec name was found.
    raise SearchSpecException('The entity name passed is wrong. Please check again')
def get_config_path(global_config=False):
    """Return the path of the ml-git configuration file.

    Args:
        global_config (bool): when True, return the user-level (global)
            configuration path instead of the repository-local one.

    Returns:
        str: path to the configuration file.
    """
    if global_config:
        # The global configuration lives outside any repository; do not
        # require a repository root here (the original resolved it
        # unconditionally and failed outside a repo even for global config).
        return get_global_config_path()
    root_path = get_root_path()
    return os.path.join(root_path, CONFIG_FILE)
def search_spec_file(repotype, spec, root_path=None):
    """Walk the entity workspace looking for '<spec>.spec'.

    Args:
        repotype (str): entity type directory under the repository root.
        spec (str): entity name (spec file base name).
        root_path (str, optional): directory to search; defaults to
            '<repo root>/<repotype>'.

    Returns:
        tuple(str, str): (directory containing the spec file, file name).

    Raises:
        SearchSpecException: when no spec file with that name is found.
    """
    if root_path is None:
        root_path = os.path.join(get_root_path(), repotype)
    spec_file = spec + SPEC_EXTENSION
    # '_dirs' instead of 'dir': the directory list is unused and 'dir'
    # shadowed the builtin in the original.
    for root, _dirs, files in os.walk(root_path):
        if spec_file in files:
            return root, spec_file
    raise SearchSpecException(output_messages['ERROR_WRONG_NAME'])
def check_entity_exists(context, entity_type, entity_name):
    """Ensure an entity's spec file exists in the workspace, or exit.

    Args:
        context: CLI context object providing ``exit()``.
        entity_type (str): entity type directory under the repository root.
        entity_name (str): entity name (spec file base name).

    Returns:
        tuple(str, str): (directory containing the spec file, file name)
        when found; otherwise logs an error and exits via *context*.
    """
    root_path = os.path.join(get_root_path(), entity_type)
    spec_file = entity_name + SPEC_EXTENSION
    # '_dirs' instead of 'dir': the directory list is unused and 'dir'
    # shadowed the builtin in the original.
    for root, _dirs, files in os.walk(root_path):
        if spec_file in files:
            return root, spec_file
    log.error(output_messages['ERROR_WRONG_NAME'])
    context.exit()
def test_import_dir(self):
    """import_dir must populate dst while leaving src non-empty."""
    base = get_root_path()
    src = os.path.join(base, 'hdata')
    dst = os.path.join(base, 'dst_dir')
    import_dir(src, dst)
    for directory in (dst, src):
        self.assertTrue(len(os.listdir(directory)) > 0)
    shutil.rmtree(dst)
def checkout(entity, tag, sampling=None, retries=2, force=False, dataset=False, labels=False, version=-1):
    """This command allows retrieving the data of a specific version of an ML entity.

    Example:
        checkout('datasets', 'computer-vision__images3__imagenet__1')

    Args:
        entity (str): The type of an ML entity. (datasets, labels or models)
        tag (str): An ml-git tag to identify a specific version of an ML entity.
        sampling (dict): group: <amount>:<group> The group sample option consists of amount and group used to download a sample.\n
            range: <start:stop:step> The range sample option consists of start, stop and step used to download a sample.
            The start parameter can be equal or greater than zero. The stop parameter can be 'all', -1 or
            any integer above zero.\n random: <amount:frequency> The random sample option consists of amount and frequency
            used to download a sample.
            seed: The seed is used to initialize the pseudorandom numbers.
        retries (int, optional): Number of retries to download the files from the storage [default: 2].
        force (bool, optional): Force checkout command to delete untracked/uncommitted files from the local repository [default: False].
        dataset (bool, optional): If exist a dataset related with the model or labels, this one must be downloaded [default: False].
        labels (bool, optional): If exist labels related with the model, they must be downloaded [default: False].
        version (int, optional): The entity version to be checked out [default: -1, the latest version].

    Returns:
        str: Return the path where the data was checked out.
    """
    repo = get_repository_instance(entity)
    repo.update()
    if sampling is not None and not validate_sample(sampling):
        return None
    # Checkout options forwarded to the repository implementation.
    options = {
        'with_dataset': dataset,
        'with_labels': labels,
        'retry': retries,
        'force': force,
        'bare': False,
        'version': version,
    }
    repo.checkout(tag, sampling, options)
    # The tag may be a full ml-git tag or just the entity name.
    spec_name = tag
    if re.search(RGX_TAG_FORMAT, tag):
        _, spec_name, _ = spec_parse(tag)
    spec_path, _ = search_spec_file(entity, spec_name)
    data_path = os.path.relpath(spec_path, get_root_path())
    if not os.path.exists(data_path):
        data_path = None
    return data_path
def __get_conf_filepath():
    """Build the path to the models configuration file.

    Prefers MLMODELS_PATH from the environment, anchoring at the
    repository root when one can be resolved.
    """
    models_path = os.getenv('MLMODELS_PATH')
    if models_path is None:
        models_path = get_key('mlgit_path')
    try:
        return os.path.join(get_root_path(), os.sep.join([models_path, get_key('mlgit_conf')]))
    except Exception:
        # No repository root available: fall back to the relative path.
        return os.sep.join([models_path, get_key('mlgit_conf')])
def init_mlgit():
    """Initialize an empty ml-git repository in the current directory.

    Logs and returns early when already inside a repository, or when the
    '.ml-git' directory cannot be created for lack of permissions.
    """
    try:
        # The call raises when we are not inside a repository yet; the
        # returned root itself is unused here (the original bound it to an
        # unused local).
        get_root_path()
        log.info(output_messages['INFO_ALREADY_IN_RESPOSITORY'], class_name=ADMIN_CLASS_NAME)
        return
    except Exception:
        pass
    try:
        os.mkdir('.ml-git')
    except PermissionError:
        log.error(output_messages['ERROR_PERMISSION_DENIED_INITIALIZE_DIRECTORY'], class_name=ADMIN_CLASS_NAME)
        return
    except FileExistsError:
        # Directory already present; reuse it.
        pass
    mlgit_config_save()
    root_path = get_root_path()
    log.info(output_messages['INFO_INITIALIZED_PROJECT_IN'] % (os.path.join(root_path, ROOT_FILE_NAME)), class_name=ADMIN_CLASS_NAME)
def update_store_spec(repotype, artefact_name, store_type, bucket):
    """Rewrite the 'store' entry in an artefact's spec file.

    Args:
        repotype (str): entity type directory under the repository root.
        artefact_name (str): entity whose spec file is updated.
        store_type (str): storage scheme (e.g. 's3h').
        bucket (str): bucket name to record.
    """
    try:
        path = get_root_path()
    except Exception as e:
        # Without a repository root the spec file cannot be located. The
        # original only logged and fell through with path=None, crashing on
        # os.path.join; return instead.
        log.error(e, CLASS_NAME=ML_GIT_PROJECT_NAME)
        return
    spec_path = os.path.join(path, repotype, artefact_name, artefact_name + '.spec')
    spec_hash = utils.yaml_load(spec_path)
    spec_hash[repotype]['manifest']['store'] = store_type + '://' + bucket
    utils.yaml_save(spec_hash, spec_path)
    return
def update_storage_spec(repo_type, artifact_name, storage_type, bucket, entity_dir=''):
    """Rewrite the storage entry in an artifact's spec file.

    Args:
        repo_type (str): entity type directory under the repository root.
        artifact_name (str): entity whose spec file is updated.
        storage_type (str): storage scheme (e.g. 's3h').
        bucket (str): bucket name to record.
        entity_dir (str, optional): intermediate directory of the entity.
    """
    try:
        path = get_root_path()
    except Exception as e:
        # Without a repository root the spec file cannot be located. The
        # original only logged and fell through with path=None, crashing on
        # os.path.join; return instead.
        log.error(e, CLASS_NAME=ML_GIT_PROJECT_NAME)
        return
    spec_path = os.path.join(path, repo_type, entity_dir, artifact_name, artifact_name + SPEC_EXTENSION)
    spec_hash = utils.yaml_load(spec_path)
    entity_spec_key = get_spec_key(repo_type)
    spec_hash[entity_spec_key]['manifest'][STORAGE_SPEC_KEY] = storage_type + '://' + bucket
    utils.yaml_save(spec_hash, spec_path)
    return
def create_workspace_tree_structure(repo_type, artifact_name, categories, storage_type, bucket_name, version, imported_dir, mutability, entity_dir=''):
    """Create the workspace directories and spec/README files for a new entity.

    Returns:
        bool: True when a new spec file was written; False when one
        already existed.

    Raises:
        Exception: when *entity_dir* escapes the entity-type directory.
        PermissionError: when an entity with that name already exists.
    """
    # get root path to create directories and files
    repo_type_dir = os.path.join(get_root_path(), repo_type)
    artifact_path = os.path.join(repo_type_dir, entity_dir, artifact_name)
    # Reject entity_dir values (e.g. with '..') that resolve outside the
    # entity-type directory.
    if not path_is_parent(repo_type_dir, artifact_path):
        raise Exception(output_messages['ERROR_INVALID_ENTITY_DIR'].format(entity_dir))
    if os.path.exists(artifact_path):
        raise PermissionError(output_messages['INFO_ENTITY_NAME_EXISTS'])
    data_path = os.path.join(artifact_path, 'data')
    # import files from the directory passed
    if imported_dir is not None:
        import_dir(imported_dir, data_path)
    else:
        os.makedirs(data_path)
    spec_path = os.path.join(artifact_path, artifact_name + SPEC_EXTENSION)
    readme_path = os.path.join(artifact_path, 'README.md')
    file_exists = os.path.isfile(spec_path)
    # FAKE_STORAGE stands in when no bucket was provided.
    storage = '%s://%s' % (storage_type, FAKE_STORAGE if bucket_name is None else bucket_name)
    entity_spec_key = get_spec_key(repo_type)
    spec_structure = {
        entity_spec_key: {
            'categories': categories,
            'manifest': {
                STORAGE_SPEC_KEY: storage
            },
            'name': artifact_name,
            'mutability': mutability,
            'version': version
        }
    }
    # write in spec file (plus an empty README) only when none exists yet
    if not file_exists:
        yaml_save(spec_structure, spec_path)
        with open(readme_path, 'w'):
            pass
        return True
    else:
        return False
def test_create_workspace_tree_structure(self):
    """The created spec file must carry the requested storage, name, mutability and version."""
    root_path = get_root_path()
    import_path = os.path.join(os.getcwd(), 'test', 'src')
    os.makedirs(import_path)
    created = create_workspace_tree_structure(DATASETS, 'artefact_name', ['imgs', 'old', 'blue'], S3H, 'minio', 2, import_path, STRICT)
    self.assertTrue(created)
    spec_path = os.path.join(os.getcwd(), os.sep.join([DATASETS, 'artefact_name', 'artefact_name.spec']))
    spec_data = yaml_load(spec_path)[DATASET_SPEC_KEY]
    self.assertEqual(spec_data['manifest'][STORAGE_SPEC_KEY], 's3h://minio')
    self.assertEqual(spec_data['name'], 'artefact_name')
    self.assertEqual(spec_data['mutability'], STRICT)
    self.assertEqual(spec_data['version'], 2)
    shutil.rmtree(import_path)
    shutil.rmtree(os.path.join(root_path, DATASETS))
def create(self, kwargs):
    """Create a new entity workspace from CLI keyword arguments.

    Builds the workspace tree, optionally runs the configuration wizard,
    imports data from a URL, and unzips imported files. On failure the
    partially created workspace is removed (except on PermissionError,
    which means the entity already existed).
    """
    artifact_name = kwargs['artifact_name']
    categories = list(kwargs['category'])
    version = int(kwargs['version_number'])
    imported_dir = kwargs['import']
    store_type = kwargs['store_type']
    bucket_name = kwargs['bucket_name']
    start_wizard = kwargs['wizard_config']
    import_url = kwargs['import_url']
    unzip_file = kwargs['unzip']
    credentials_path = kwargs['credentials_path']
    repo_type = self.__repo_type
    try:
        create_workspace_tree_structure(repo_type, artifact_name, categories, store_type, bucket_name, version, imported_dir, kwargs['mutability'])
        if start_wizard:
            # Interactive configuration; may register a new store and remote.
            has_new_store, store_type, bucket, profile, endpoint_url, git_repo = start_wizard_questions(repo_type)
            if has_new_store:
                store_add(store_type, bucket, profile, endpoint_url)
            update_store_spec(repo_type, artifact_name, store_type, bucket)
            remote_add(repo_type, git_repo)
        if import_url:
            # Only Google Drive URLs are supported for direct import here.
            self.create_config_store('gdrive', credentials_path)
            local = LocalRepository(self.__config, get_objects_path(self.__config, repo_type))
            destine_path = os.path.join(repo_type, artifact_name, 'data')
            local.import_file_from_url(destine_path, import_url, StoreType.GDRIVE.value)
        if unzip_file:
            log.info('Unzipping files', CLASS_NAME=REPOSITORY_CLASS_NAME)
            data_path = os.path.join(get_root_path(), repo_type, artifact_name, 'data')
            unzip_files_in_directory(data_path)
        log.info("Project Created.", CLASS_NAME=REPOSITORY_CLASS_NAME)
    except Exception as e:
        # PermissionError means the entity already existed: keep it.
        if not isinstance(e, PermissionError):
            clear(os.path.join(repo_type, artifact_name))
        # NOTE(review): in Python 3, KeyboardInterrupt does not inherit from
        # Exception, so this branch looks unreachable here — confirm intent.
        if isinstance(e, KeyboardInterrupt):
            log.info("Create command aborted!", class_name=REPOSITORY_CLASS_NAME)
        else:
            log.error(e, CLASS_NAME=REPOSITORY_CLASS_NAME)
def __init__(self, git, path):
    """Set up the metadata directory under the repository root.

    Args:
        git: git remote/URL kept on the instance.
        path: metadata directory, relative to the repository root.

    Raises:
        RootPathException: re-raised when no repository root is found.
    """
    try:
        root_path = get_root_path()
        self.__path = os.path.join(root_path, path)
        self.__git = git
        ensure_path_exists(self.__path)
    except RootPathException as e:
        log.error(e, class_name=METADATA_MANAGER_CLASS_NAME)
        raise e
    except Exception as e:
        # Matching the exception message is a workaround to detect access to
        # the not-yet-assigned mangled attribute, which indicates an
        # uninitialized repository rather than a genuine error.
        if str(
                e
        ) == '\'Metadata\' object has no attribute \'_MetadataRepo__git\'':
            log.error('You are not in an initialized ml-git repository.',
                      class_name=METADATA_MANAGER_CLASS_NAME)
        else:
            log.error(e, class_name=METADATA_MANAGER_CLASS_NAME)
        return
def __init__(self, git, path, repo_type):
    """Set up the metadata directory under the repository root.

    Args:
        git: git remote/URL kept on the instance.
        path: metadata directory, relative to the repository root.
        repo_type: entity type this metadata repository serves.

    Raises:
        RootPathException: re-raised when no repository root is found.
    """
    self.__repo_type = repo_type
    try:
        root_path = get_root_path()
        self.__path = os.path.join(root_path, path)
        self.__git = git
        ensure_path_exists(self.__path)
    except RootPathException as e:
        log.error(e, class_name=METADATA_MANAGER_CLASS_NAME)
        raise e
    except Exception as e:
        # Matching the exception message is a workaround to detect access to
        # the not-yet-assigned mangled attribute, which indicates an
        # uninitialized repository rather than a genuine error.
        if str(
                e
        ) == '\'Metadata\' object has no attribute \'_MetadataRepo__git\'':
            log.error(output_messages['ERROR_NOT_IN_RESPOSITORY'],
                      class_name=METADATA_MANAGER_CLASS_NAME)
        else:
            log.error(e, class_name=METADATA_MANAGER_CLASS_NAME)
        return
def test_create_workspace_tree_structure(self):
    """The created spec file must carry the requested store, name and version."""
    root_path = get_root_path()
    import_path = os.path.join(os.getcwd(), 'test', 'src')
    os.makedirs(import_path)
    created = create_workspace_tree_structure('repotype', 'artefact_name', ['imgs', 'old', 'blue'], 's3h', 'minio', 2, import_path)
    self.assertTrue(created)
    spec_path = os.path.join(os.getcwd(), os.sep.join(['repotype', 'artefact_name', 'artefact_name.spec']))
    spec_data = yaml_load(spec_path)['repotype']
    self.assertEqual(spec_data['manifest']['store'], 's3h://minio')
    self.assertEqual(spec_data['name'], 'artefact_name')
    self.assertEqual(spec_data['version'], 2)
    shutil.rmtree(import_path)
    shutil.rmtree(os.path.join(root_path, 'repotype'))
def get_log(self):
    """Return the non-empty lines of the store log file as a list.

    An absent log file yields an empty list; a failure to resolve the
    repository root is logged and re-raised.
    """
    log.debug('Loading log file', class_name=HASH_FS_CLASS_NAME)
    try:
        log_path = os.path.join(get_root_path(), self._logpath, 'store.log')
    except Exception as e:
        log.error(e, class_name=LOCAL_REPOSITORY_CLASS_NAME)
        raise e
    entries = []
    if not os.path.exists(log_path):
        return entries
    with open(log_path, 'r') as f:
        # Stop at the first blank line (or EOF), as the original did.
        for raw_line in f:
            entry = raw_line.strip()
            if not entry:
                break
            entries.append(entry)
    return entries
def get_log(self):
    """Return the non-empty lines of the storage log file as a list.

    An absent log file yields an empty list; a failure to resolve the
    repository root is logged and re-raised.
    """
    log.debug(output_messages['DEBUG_LOADING_LOG'], class_name=HASH_FS_CLASS_NAME)
    try:
        log_path = os.path.join(get_root_path(), self._logpath, STORAGE_LOG)
    except Exception as e:
        log.error(e, class_name=LOCAL_REPOSITORY_CLASS_NAME)
        raise e
    entries = []
    if not os.path.exists(log_path):
        return entries
    with open(log_path, 'r') as f:
        # Stop at the first blank line (or EOF), as the original did.
        for raw_line in f:
            entry = raw_line.strip()
            if not entry:
                break
            entries.append(entry)
    return entries
def create_workspace_tree_structure(repo_type, artifact_name, categories, store_type, bucket_name, version, imported_dir, mutability):
    """Create the workspace directories and spec/README files for a new entity.

    Returns:
        bool: True when a new spec file was written; False when one
        already existed.

    Raises:
        PermissionError: when an entity with that name already exists.
    """
    # get root path to create directories and files
    path = get_root_path()
    artifact_path = os.path.join(path, repo_type, artifact_name)
    if os.path.exists(artifact_path):
        raise PermissionError('An entity with that name already exists.')
    data_path = os.path.join(artifact_path, 'data')
    # import files from the directory passed
    if imported_dir is not None:
        import_dir(imported_dir, data_path)
    else:
        os.makedirs(data_path)
    spec_path = os.path.join(artifact_path, artifact_name + SPEC_EXTENSION)
    readme_path = os.path.join(artifact_path, 'README.md')
    file_exists = os.path.isfile(spec_path)
    # FAKE_STORE stands in when no bucket was provided.
    store = '%s://%s' % (store_type, FAKE_STORE if bucket_name is None else bucket_name)
    spec_structure = {
        repo_type: {
            'categories': categories,
            'manifest': {
                'store': store
            },
            'name': artifact_name,
            'mutability': mutability,
            'version': version
        }
    }
    # write in spec file (plus an empty README) only when none exists yet
    if not file_exists:
        yaml_save(spec_structure, spec_path)
        with open(readme_path, 'w'):
            pass
        return True
    else:
        return False