def fsck(self, exclude=['log', 'metadata'], remove_corrupted=False):
    log.info(output_messages['INFO_STARTING_INTEGRITY_CHECK'] % self._path, class_name=HASH_FS_CLASS_NAME)
    corrupted_files = []
    corrupted_files_fullpaths = []
    self._check_files_integrity(corrupted_files, corrupted_files_fullpaths)
    self._remove_corrupted_files(corrupted_files_fullpaths, remove_corrupted)
    return corrupted_files
def update(self):
    log.info(output_messages['INFO_MLGIT_PULL'] % self.__path, class_name=METADATA_MANAGER_CLASS_NAME)
    repo = Repo(self.__path)
    self.validate_blank_remote_url()
    o = repo.remotes.origin
    o.pull('--tags')
def checkout(self, tag, samples, options):
    try:
        metadata_path = get_metadata_path(self.__config)
    except RootPathException as e:
        log.warn(e, class_name=REPOSITORY_CLASS_NAME)
        metadata_path = self._initialize_repository_on_the_fly()
    dt_tag, lb_tag = self._checkout(tag, samples, options)
    options['with_dataset'] = False
    options['with_labels'] = False
    if dt_tag is not None:
        try:
            self.__repo_type = 'dataset'
            m = Metadata('', metadata_path, self.__config, self.__repo_type)
            log.info('Initializing related dataset download', class_name=REPOSITORY_CLASS_NAME)
            if not m.check_exists():
                m.init()
            self._checkout(dt_tag, samples, options)
        except Exception as e:
            log.error('LocalRepository: [%s]' % e, class_name=REPOSITORY_CLASS_NAME)
    if lb_tag is not None:
        try:
            self.__repo_type = 'labels'
            m = Metadata('', metadata_path, self.__config, self.__repo_type)
            log.info('Initializing related labels download', class_name=REPOSITORY_CLASS_NAME)
            if not m.check_exists():
                m.init()
            self._checkout(lb_tag, samples, options)
        except Exception as e:
            log.error('LocalRepository: [%s]' % e, class_name=REPOSITORY_CLASS_NAME)
def key_exists(self, key_path):
    file_info = self.get_file_info_by_name(key_path)
    if file_info:
        if file_info.get('trashed'):
            log.info('File [{}] located in trash.'.format(key_path))
        return True
    return False
def status(self, spec, full_option, status_directory):
    repo_type = self.__repo_type
    try:
        objects_path = get_objects_path(self.__config, repo_type)
        repo = LocalRepository(self.__config, objects_path, repo_type)
        log.info('%s: status of ml-git index for [%s]' % (repo_type, spec), class_name=REPOSITORY_CLASS_NAME)
        new_files, deleted_files, untracked_files, corrupted_files, changed_files = repo.status(spec, status_directory)
    except Exception as e:
        log.error(e, class_name=REPOSITORY_CLASS_NAME)
        return
    if new_files is not None and deleted_files is not None and untracked_files is not None:
        print('Changes to be committed:')
        self._print_files(new_files, full_option, 'New file: ')
        self._print_files(deleted_files, full_option, 'Deleted: ')
        print('\nUntracked files:')
        self._print_files(untracked_files, full_option)
        print('\nCorrupted files:')
        self._print_files(corrupted_files, full_option)
        if changed_files and len(changed_files) > 0:
            print('\nChanges not staged for commit:')
            self._print_files(changed_files, full_option)
def init_mlgit():
    try:
        root_path = get_root_path()
        log.info('You already are in a ml-git repository (%s)' % (os.path.join(root_path, ROOT_FILE_NAME)),
                 class_name=ADMIN_CLASS_NAME)
        return
    except Exception:
        pass
    try:
        os.mkdir('.ml-git')
    except PermissionError:
        log.error('Permission denied. You need write permission to initialize ml-git in this directory.',
                  class_name=ADMIN_CLASS_NAME)
        return
    except FileExistsError:
        pass
    mlgit_config_save()
    root_path = get_root_path()
    log.info('Initialized empty ml-git repository in %s' % (os.path.join(root_path, ROOT_FILE_NAME)),
             class_name=ADMIN_CLASS_NAME)
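# Hypothetical usage sketch: calling init_mlgit() from the directory that should
# hold the project creates the .ml-git folder and saves a default configuration;
# calling it again inside an already initialized repository only logs a notice.
init_mlgit()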
def storage_del(storage_type, bucket, global_conf=False):
    if not valid_storage_type(storage_type):
        return
    try:
        config_path = get_config_path(global_conf)
        conf = yaml_load(config_path)
    except Exception as e:
        log.error(e, class_name=ADMIN_CLASS_NAME)
        return
    storage_exists = STORAGE_CONFIG_KEY in conf and storage_type in conf[STORAGE_CONFIG_KEY] \
        and bucket in conf[STORAGE_CONFIG_KEY][storage_type]
    if not storage_exists:
        log.warn(output_messages['WARN_STORAGE_NOT_IN_CONFIG'] % (storage_type, bucket), class_name=ADMIN_CLASS_NAME)
        return
    del conf[STORAGE_CONFIG_KEY][storage_type][bucket]
    log.info(output_messages['INFO_REMOVED_STORAGE'] % (storage_type, bucket), class_name=ADMIN_CLASS_NAME)
    yaml_save(conf, config_path)
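# Hypothetical usage sketch (the bucket name is made up): this is expected to drop
# the 'mlgit-bucket' entry from the 's3h' section of the configuration file,
# assuming that entry was previously created by storage_add().
storage_del(storage_type='s3h', bucket='mlgit-bucket')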
def _add_associate_entity_metadata(self, metadata, specs):
    dataset = EntityType.DATASETS.value
    labels = EntityType.LABELS.value
    model = EntityType.MODELS.value
    entity_spec_key = get_spec_key(self.__repo_type)
    if dataset in specs and self.__repo_type in [labels, model]:
        d_spec = specs[dataset]
        refs_path = get_refs_path(self.__config, dataset)
        r = Refs(refs_path, d_spec, dataset)
        tag, sha = r.head()
        if tag is not None:
            log.info(output_messages['INFO_ASSOCIATE_DATASETS'] % (d_spec, tag, self.__repo_type),
                     class_name=LOCAL_REPOSITORY_CLASS_NAME)
            metadata[entity_spec_key][DATASET_SPEC_KEY] = {}
            metadata[entity_spec_key][DATASET_SPEC_KEY]['tag'] = tag
            metadata[entity_spec_key][DATASET_SPEC_KEY]['sha'] = sha
    if labels in specs and self.__repo_type in [model]:
        l_spec = specs[labels]
        refs_path = get_refs_path(self.__config, labels)
        r = Refs(refs_path, l_spec, labels)
        tag, sha = r.head()
        if tag is not None:
            log.info('Associate labels [%s]-[%s] to the %s.' % (l_spec, tag, self.__repo_type),
                     class_name=LOCAL_REPOSITORY_CLASS_NAME)
            metadata[entity_spec_key][LABELS_SPEC_KEY] = {}
            metadata[entity_spec_key][LABELS_SPEC_KEY]['tag'] = tag
            metadata[entity_spec_key][LABELS_SPEC_KEY]['sha'] = sha
def _create_new_bucket(bucket_name):
    storage_type = click.prompt(output_messages['INFO_DEFINE_STORAGE_TYPE'], default=StorageType.S3H.value,
                                show_default=True, type=click.Choice(MultihashStorageType.to_list()),
                                show_choices=True)
    if bucket_name is None:
        bucket_name = _get_user_input(output_messages['INFO_DEFINE_WIZARD_MESSAGE'].format('storage name'),
                                      required=True)
    from ml_git.commands.storage import storage_add
    storage_add(None, wizard=True, type=storage_type, bucket_name=bucket_name, credentials=None, region=None,
                endpoint_url=None, username=None, private_key=None, port=None)
    if storage_type == StorageType.AZUREBLOBH.value:
        log.info(output_messages['INFO_CONFIGURE_AZURE'])
    return storage_type, bucket_name
def store_del(store_type, bucket, global_conf=False):
    if not valid_store_type(store_type):
        return
    try:
        config_path = get_config_path(global_conf)
        conf = yaml_load(config_path)
    except Exception as e:
        log.error(e, class_name=ADMIN_CLASS_NAME)
        return
    store_exists = 'store' in conf and store_type in conf['store'] and bucket in conf['store'][store_type]
    if not store_exists:
        log.warn('Store [%s://%s] not found in configuration file.' % (store_type, bucket), class_name=ADMIN_CLASS_NAME)
        return
    del conf['store'][store_type][bucket]
    log.info('Removed store [%s://%s] from configuration file.' % (store_type, bucket), class_name=ADMIN_CLASS_NAME)
    yaml_save(conf, config_path)
def create_relationships_csv_file(csv_header, file_name, formatted_data, dir, export_path=False):
    file_path = os.path.join(dir, file_name)
    create_csv_file(file_path, csv_header, formatted_data)
    if export_path:
        log.info('A CSV file was created with the relationships information in {}'.format(file_path))
    with open(file_path) as csv_file:
        return io.StringIO(csv_file.read())
def log(self, spec, stat=False, fullstat=False):
    try:
        repo_type = self.__repo_type
        metadata_path = get_metadata_path(self.__config, repo_type)
        metadata = Metadata(spec, metadata_path, self.__config, repo_type)
        index_path = get_index_path(self.__config, repo_type)
        log_info = metadata.get_log_info(spec, fullstat)
    except Exception as e:
        log.error(e, class_name=REPOSITORY_CLASS_NAME)
        return
    fidx = FullIndex(spec, index_path)
    if stat or fullstat:
        workspace_size = fidx.get_total_size()
        amount_message = 'Total of files: %s' % fidx.get_total_count()
        size_message = 'Workspace size: %s' % humanize.naturalsize(workspace_size)
        workspace_info = '------------------------------------------------- \n{}\t{}'.format(amount_message, size_message)
        log_info = '{}\n{}'.format(log_info, workspace_info)
    log.info(log_info, class_name=REPOSITORY_CLASS_NAME)
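# For reference, humanize.naturalsize() is what turns the raw byte count into the
# "Workspace size" text above, e.g.:
#   humanize.naturalsize(2048)       -> '2.0 kB'
#   humanize.naturalsize(123456789)  -> '123.5 MB'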
def _add_associate_entity_metadata(self, metadata, specs):
    if 'dataset' in specs and self.__repo_type in ['labels', 'model']:
        d_spec = specs['dataset']
        refs_path = get_refs_path(self.__config, 'dataset')
        r = Refs(refs_path, d_spec, 'dataset')
        tag, sha = r.head()
        if tag is not None:
            log.info('Associate dataset [%s]-[%s] to the %s.' % (d_spec, tag, self.__repo_type),
                     class_name=LOCAL_REPOSITORY_CLASS_NAME)
            metadata[self.__repo_type]['dataset'] = {}
            metadata[self.__repo_type]['dataset']['tag'] = tag
            metadata[self.__repo_type]['dataset']['sha'] = sha
    if 'labels' in specs and self.__repo_type in ['model']:
        l_spec = specs['labels']
        refs_path = get_refs_path(self.__config, 'labels')
        r = Refs(refs_path, l_spec, 'labels')
        tag, sha = r.head()
        if tag is not None:
            log.info('Associate labels [%s]-[%s] to the %s.' % (l_spec, tag, self.__repo_type),
                     class_name=LOCAL_REPOSITORY_CLASS_NAME)
            metadata[self.__repo_type]['labels'] = {}
            metadata[self.__repo_type]['labels']['tag'] = tag
            metadata[self.__repo_type]['labels']['sha'] = sha
def update(self):
    log.info('Pull [%s]' % self.__path, class_name=METADATA_MANAGER_CLASS_NAME)
    repo = Repo(self.__path)
    self.validate_blank_remote_url()
    o = repo.remotes.origin
    o.pull('--tags')
def unlock_file(self, spec, file_path):
    repo_type = self.__repo_type
    if not validate_config_spec_hash(self.__config):
        log.error('.ml-git/config.yaml invalid. It should look something like this:\n%s'
                  % get_yaml_str(get_sample_config_spec('somebucket', 'someprofile', 'someregion')),
                  class_name=REPOSITORY_CLASS_NAME)
        return None
    path, file = None, None
    try:
        refs_path = get_refs_path(self.__config, repo_type)
        objects_path = get_objects_path(self.__config, repo_type)
        index_path = get_index_path(self.__config, repo_type)
        cache_path = get_cache_path(self.__config, repo_type)
        ref = Refs(refs_path, spec, repo_type)
        tag, sha = ref.branch()
        categories_path = get_path_with_categories(tag)
        path, file = search_spec_file(self.__repo_type, spec, categories_path)
    except Exception as e:
        log.error(e, class_name=REPOSITORY_CLASS_NAME)
        return
    if path is None:
        return
    spec_path = os.path.join(path, file)
    spec_file = yaml_load(spec_path)
    try:
        mutability = spec_file[repo_type]['mutability']
        if mutability not in Mutability.list():
            log.error('Invalid mutability type.', class_name=REPOSITORY_CLASS_NAME)
            return
    except Exception:
        log.info('The spec does not have the \'mutability\' property set. Default: strict.',
                 class_name=REPOSITORY_CLASS_NAME)
        return
    if mutability != Mutability.STRICT.value:
        try:
            local = LocalRepository(self.__config, objects_path, repo_type)
            local.unlock_file(path, file_path, index_path, objects_path, spec, cache_path)
        except Exception as e:
            log.error(e, class_name=REPOSITORY_CLASS_NAME)
            return
    else:
        log.error('You cannot use this command for this entity because mutability cannot be strict.',
                  class_name=REPOSITORY_CLASS_NAME)
def get(self, objectkey, path, file):
    log.info('Getting file [%s] from local index' % file, class_name=MULTI_HASH_CLASS_NAME)
    dirs = os.path.dirname(file)
    fulldir = os.path.join(path, dirs)
    ensure_path_exists(fulldir)
    dstfile = os.path.join(path, file)
    return self._hfs.get(objectkey, dstfile)
def fsck(self, exclude=['log', 'metadata'], remove_corrupted=False):
    log.info('Starting integrity check on [%s]' % self._path, class_name=HASH_FS_CLASS_NAME)
    corrupted_files = []
    corrupted_files_fullpaths = []
    self._check_files_integrity(corrupted_files, corrupted_files_fullpaths)
    self._remove_corrupted_files(corrupted_files_fullpaths, remove_corrupted)
    return corrupted_files
def get(self, objectkey, path, file):
    log.info(output_messages['INFO_GETTING_FILE'] % file, class_name=MULTI_HASH_CLASS_NAME)
    dirs = os.path.dirname(file)
    fulldir = os.path.join(path, dirs)
    ensure_path_exists(fulldir)
    dstfile = os.path.join(path, file)
    return self._hfs.get(objectkey, dstfile)
def _remove_corrupted_files(self, corrupted_files_fullpaths, remove_corrupted):
    if remove_corrupted and len(corrupted_files_fullpaths) > 0:
        log.info(output_messages['INFO_REMOVING_CORRUPTED_FILES'] % len(corrupted_files_fullpaths),
                 class_name=HASH_FS_CLASS_NAME)
        self.__progress_bar = tqdm(total=len(corrupted_files_fullpaths), desc='files', unit='files',
                                   unit_scale=True, mininterval=1.0)
        for cor_file_fullpath in corrupted_files_fullpaths:
            log.debug(output_messages['DEBUG_REMOVING_FILE'] % cor_file_fullpath, class_name=HASH_FS_CLASS_NAME)
            os.unlink(cor_file_fullpath)
            self.__progress_bar.update(1)
        self.__progress_bar.close()
def storage_add(storage_type, bucket, credentials_profile, global_conf=False, endpoint_url=None, sftp_configs=None):
    if not valid_storage_type(storage_type):
        return
    try:
        region = get_bucket_region(bucket, credentials_profile)
    except Exception:
        region = None
    if storage_type not in (StorageType.S3H.value, StorageType.S3.value) or credentials_profile is None:
        log.info(output_messages['INFO_ADD_STORAGE_WITHOUT_PROFILE'] % (storage_type, bucket),
                 class_name=ADMIN_CLASS_NAME)
    else:
        log.info(output_messages['INFO_ADD_STORAGE'] % (storage_type, bucket, credentials_profile),
                 class_name=ADMIN_CLASS_NAME)
    try:
        file = get_config_path(global_conf)
        conf = yaml_load(file)
    except Exception as e:
        log.error(e, class_name=ADMIN_CLASS_NAME)
        return
    if STORAGE_CONFIG_KEY not in conf:
        conf[STORAGE_CONFIG_KEY] = {}
    if storage_type not in conf[STORAGE_CONFIG_KEY]:
        conf[STORAGE_CONFIG_KEY][storage_type] = {}
    conf[STORAGE_CONFIG_KEY][storage_type][bucket] = {}
    if storage_type in [StorageType.S3.value, StorageType.S3H.value]:
        conf[STORAGE_CONFIG_KEY][storage_type][bucket]['aws-credentials'] = {}
        conf[STORAGE_CONFIG_KEY][storage_type][bucket]['aws-credentials']['profile'] = credentials_profile
        conf[STORAGE_CONFIG_KEY][storage_type][bucket]['region'] = region
        conf[STORAGE_CONFIG_KEY][storage_type][bucket]['endpoint-url'] = endpoint_url
    elif storage_type in [StorageType.GDRIVEH.value]:
        conf[STORAGE_CONFIG_KEY][storage_type][bucket]['credentials-path'] = credentials_profile
    elif storage_type in [StorageType.SFTPH.value]:
        conf[STORAGE_CONFIG_KEY][storage_type][bucket]['endpoint-url'] = endpoint_url
        conf[STORAGE_CONFIG_KEY][storage_type][bucket]['username'] = sftp_configs['username']
        conf[STORAGE_CONFIG_KEY][storage_type][bucket]['private-key'] = sftp_configs['private_key']
        conf[STORAGE_CONFIG_KEY][storage_type][bucket]['port'] = sftp_configs['port']
    yaml_save(conf, file)
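# Illustrative sketch only (bucket and profile names are made up, and it assumes
# STORAGE_CONFIG_KEY resolves to 'storage'): after
# storage_add('s3h', 'mlgit-bucket', 'my-profile'), the saved config section is
# expected to contain roughly:
#
#   storage:
#     s3h:
#       mlgit-bucket:
#         aws-credentials:
#           profile: my-profile
#         region: <detected region, or None if lookup failed>
#         endpoint-url: None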
def _has_new_data(self, repo, spec):
    _, deleted, untracked_files, _, changed_files = repo.status(spec, status_directory='', log_errors=False)
    if deleted is None and untracked_files is None and changed_files is None:
        return False
    elif len(deleted) == 0 and len(untracked_files) == 0 and len(changed_files) == 0:
        log.info('There is no new data to add', class_name=REPOSITORY_CLASS_NAME)
        return False
    return True
def export_relationships_to_dot(entities, relationships, export_path):
    dot_data = __format_relationships_to_dot(entities, relationships)
    file_name = __get_file_name(entities, FileType.DOT.value)
    if export_path:
        file_path = os.path.join(export_path, file_name)
        with open(file_path, 'w') as out:
            out.write(dot_data)
        log.info('A DOT file was created with the relationship information in {}'.format(file_path))
        return
    return dot_data
def request_new_value(input_message, required=False, input_type=None):
    default = EMPTY_FOR_NONE
    if required:
        default = None
    field_value = click.prompt(input_message, default=default, show_default=False, type=input_type)
    log.debug('{}: {}'.format(input_message, field_value))
    if type(field_value) == str:
        field_value = field_value.strip()
    if not field_value and required:
        log.info(output_messages['ERROR_EMPTY_VALUE'])
        field_value = request_new_value(input_message, required, input_type)
    return field_value
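# Hypothetical usage sketch (the prompt text is made up): keeps asking until the
# user supplies a non-empty answer, letting click coerce the input to int.
port = request_new_value('Define the port to be used', required=True, input_type=int)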
def get(self, categories, model_name, file=None):
    if file is None:
        full_path = os.path.join(self.__path, os.sep.join(categories), model_name, model_name)
    else:
        full_path = os.path.join(self.__path, os.sep.join(categories), model_name, file)
    log.info('Metadata GET %s' % full_path, class_name=METADATA_MANAGER_CLASS_NAME)
    if os.path.exists(full_path):
        return yaml_load(full_path)
    return None
def commit_metadata(self, index_path, tags, commit_msg, changed_files, mutability, ws_path):
    spec_file = os.path.join(index_path, 'metadata', self._spec, self._spec + SPEC_EXTENSION)
    full_metadata_path, categories_sub_path, metadata = self._full_metadata_path(spec_file)
    log.debug('Metadata path [%s]' % full_metadata_path, class_name=METADATA_CLASS_NAME)
    if full_metadata_path is None:
        return None, None
    elif categories_sub_path is None:
        return None, None
    ensure_path_exists(full_metadata_path)
    ret = self.__commit_manifest(full_metadata_path, index_path, changed_files, mutability)
    if ret is False:
        log.info('No files to commit for [%s]' % self._spec, class_name=METADATA_CLASS_NAME)
        return None, None
    try:
        self.__commit_metadata(full_metadata_path, index_path, metadata, tags, ws_path)
    except Exception:
        return None, None
    # generates a tag to associate to the commit
    tag = self.metadata_tag(metadata)
    # check if tag already exists in the ml-git repository
    tags = self._tag_exists(tag)
    if len(tags) > 0:
        log.error('Tag [%s] already exists in the ml-git repository. '
                  'Consider using --bumpversion parameter to increment the version number for your dataset.' % tag,
                  class_name=METADATA_CLASS_NAME)
        for t in tags:
            log.error('\t%s' % t)
        return None, None
    if commit_msg is not None and len(commit_msg) > 0:
        msg = commit_msg
    else:
        # generates a commit message
        msg = self.metadata_message(metadata)
    log.debug('Commit message [%s]' % msg, class_name=METADATA_CLASS_NAME)
    sha = self.commit(categories_sub_path, msg)
    self.tag_add(tag)
    return str(tag), str(sha)
def format_storages(storages):
    """Augment existing storage information with server type, etc."""
    try:
        for bucket_type, buckets in storages.items():
            for bucket_name, bucket in buckets.items():
                bucket['name'] = bucket_name
                bucket['type'] = bucket_type
                bucket['subtype'] = ''
                if bucket_type in ('s3', 's3h'):
                    bucket['subtype'] = 'minio' if is_minio_storage(bucket) else 'aws'
    except Exception:
        log.info('Error augmenting storage information')
    return storages
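# Illustrative sketch only (the bucket data is made up): given
#   {'s3h': {'mlgit-bucket': {'endpoint-url': None}}}
# format_storages() mutates the inner dict in place so the entry becomes
#   {'endpoint-url': None, 'name': 'mlgit-bucket', 'type': 's3h', 'subtype': 'aws'}
# with 'subtype' set to 'minio' instead when is_minio_storage() detects a MinIO endpoint.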
def commit_metadata(self, index_path, tags, commit_msg, changed_files, mutability, ws_path):
    spec_file = os.path.join(index_path, 'metadata', self._spec, self._spec + SPEC_EXTENSION)
    full_metadata_path, entity_sub_path, metadata = self._full_metadata_path(spec_file)
    log.debug(output_messages['DEBUG_METADATA_PATH'] % full_metadata_path, class_name=METADATA_CLASS_NAME)
    if full_metadata_path is None:
        return None, None
    elif entity_sub_path is None:
        return None, None
    ensure_path_exists(full_metadata_path)
    ret = self.__commit_manifest(full_metadata_path, index_path, changed_files, mutability)
    if ret is False:
        log.info(output_messages['INFO_NO_FILES_COMMIT_FOR'] % self._spec, class_name=METADATA_CLASS_NAME)
        return None, None
    try:
        self.__commit_metadata(full_metadata_path, index_path, metadata, tags, ws_path)
    except Exception:
        return None, None
    # generates a tag to associate to the commit
    tag = self.metadata_tag(metadata)
    # check if tag already exists in the ml-git repository
    tags = self._tag_exists(tag)
    if len(tags) > 0:
        log.error(output_messages['ERROR_TAG_ALREADY_EXISTS_CONSIDER_USER_VERSION'] % tag,
                  class_name=METADATA_CLASS_NAME)
        for t in tags:
            log.error(output_messages['ERROR_METADATA_MESSAGE'] % t)
        return None, None
    if commit_msg is not None and len(commit_msg) > 0:
        msg = commit_msg
    else:
        # generates a commit message
        msg = self.metadata_message(metadata)
    log.debug(output_messages['DEBUG_COMMIT_MESSAGE'] % msg, class_name=METADATA_CLASS_NAME)
    sha = self.commit(entity_sub_path, msg)
    self.tag_add(tag)
    return str(tag), str(sha)
def update_project(v1_dataset_path_exists, v1_model_path_exists, root_path):
    log.info(output_messages['INFO_UPDATE_THE_PROJECT'])
    update_now = input(output_messages['INFO_AKS_IF_WANT_UPDATE_PROJECT']).lower()
    if update_now in ['yes', 'y']:
        if v1_dataset_path_exists:
            update_directories_to_plural(root_path, V1_DATASETS_KEY, EntityType.DATASETS.value)
        if v1_model_path_exists:
            update_directories_to_plural(root_path, V1_MODELS_KEY, EntityType.MODELS.value)
        change_keys_in_config(root_path)
    else:
        raise Exception(output_messages['ERROR_PROJECT_NEED_BE_UPDATED'])
def remote_del(repo_type, global_conf=False):
    file = get_config_path(global_conf)
    conf = yaml_load(file)
    if repo_type in conf:
        git_url = conf[repo_type]['git']
        if git_url is None or not len(conf[repo_type]['git']) > 0:
            log.error(output_messages['ERROR_REMOTE_UNCONFIGURED'] % repo_type, class_name=ADMIN_CLASS_NAME)
        else:
            log.info(output_messages['INFO_REMOVE_REMOTE'] % (git_url, repo_type), class_name=ADMIN_CLASS_NAME)
            conf[repo_type]['git'] = ''
            yaml_save(conf, file)
    else:
        log.error(output_messages['ERROR_ENTITY_NOT_FOUND'] % repo_type, class_name=ADMIN_CLASS_NAME)
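# Hypothetical usage sketch (the section name and URL are made up): for a config
# section such as
#   datasets:
#     git: https://github.com/example/mlgit-datasets.git
# calling remote_del('datasets') is expected to blank the 'git' entry ('') and save
# the file, while an unconfigured or missing section only produces an error log.
remote_del('datasets')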
def _initialize_repository_on_the_fly(self):
    if os.path.exists(get_global_config_path()):
        log.info('Initializing the project with global settings', class_name=REPOSITORY_CLASS_NAME)
        init_mlgit()
        save_global_config_in_local()
        metadata_path = get_metadata_path(self.__config)
        if not os.path.exists(metadata_path):
            Metadata('', metadata_path, self.__config, self.__repo_type).init()
        return metadata_path
    raise RootPathException('You are not in an initialized ml-git repository and do not have a global configuration.')