def repo_remote_del(self, global_conf=False):
    """Remove the configured git remote for this repo type once its git reference is deleted."""
    try:
        meta = Metadata('', get_metadata_path(self.__config), self.__config, self.__repo_type)
        reference_removed = meta.delete_git_reference()
        if reference_removed:
            remote_del(self.__repo_type, global_conf)
    except Exception as err:
        log.error(err, class_name=REPOSITORY_CLASS_NAME)
        return
def get(self, file_path, reference):
    """Download the drive file named *reference* into *file_path*; False when it does not exist."""
    info = self.get_file_info_by_name(reference)
    if info:
        self.download_file(file_path, info)
        return True
    log.error(output_messages['ERROR_NOT_FOUND'] % reference, class_name=GDRIVE_STORAGE)
    return False
def repo_remote_add(self, repo_type, mlgit_remote, global_conf=False):
    """Register *mlgit_remote* for *repo_type* and point the local metadata repo at it."""
    try:
        remote_add(repo_type, mlgit_remote, global_conf)
        # Reload the configuration so the freshly added remote is visible.
        self.__config = config_load()
        meta = Metadata('', get_metadata_path(self.__config), self.__config, self.__repo_type)
        meta.remote_set_url(mlgit_remote)
    except Exception as err:
        log.error(err, class_name=REPOSITORY_CLASS_NAME)
        return
def _check_integrity(self, cid, data):
    """Re-hash *data* and compare the result against the expected chunk id *cid*."""
    computed = self._digest(data)
    if computed != cid:
        log.error('Corruption detected for chunk [%s] - got [%s]' % (cid, computed), class_name=HASH_FS_CLASS_NAME)
        return False
    log.debug('Checksum verified for chunk [%s]' % cid, class_name=HASH_FS_CLASS_NAME)
    return True
def incr_version(file, repotype='dataset'):
    """Bump the spec version stored in *file* by one.

    Returns the new version number, or -1 when the spec version is invalid.
    """
    spec = utils.yaml_load(file)
    if not is_valid_version(spec, repotype):
        log.error('Invalid version, could not increment. File:\n %s' % file, class_name=ML_GIT_PROJECT_NAME)
        return -1
    spec[repotype]['version'] += 1
    utils.yaml_save(spec, file)
    log.debug('Version incremented to %s.' % spec[repotype]['version'], class_name=ML_GIT_PROJECT_NAME)
    return spec[repotype]['version']
def check_integrity(self, cid, ncid):
    """Compare the expected chunk id *cid* with the recomputed id *ncid*."""
    matches = cid == ncid
    if matches:
        log.debug('Checksum verified for chunk [%s]' % cid, class_name=MULTI_HASH_STORE_NAME)
    else:
        log.error('Corruption detected for chunk [%s] - got [%s]' % (cid, ncid), class_name=MULTI_HASH_STORE_NAME)
    return matches
def check_successfully_clone(project_dir, git_dir):
    """Enter *project_dir* and verify the clone produced a valid ml-git root; clean up on failure."""
    os.chdir(project_dir)
    try:
        get_root_path()
    except RootPathException:
        # Broken clone: remove both the workspace and the git directory.
        clear(project_dir)
        log.error(output_messages['ERROR_MINIMAL_CONFIGURATION'], class_name=ADMIN_CLASS_NAME)
        clear(git_dir)
        return False
    return True
def _copy_to_metadata_path(self, src_path, full_metadata_path, file_name):
    """Copy *src_path* into the metadata tree as *file_name*; a missing source is silently skipped."""
    if not os.path.exists(src_path):
        return
    destination = os.path.join(full_metadata_path, file_name)
    try:
        shutil.copy2(src_path, destination)
    except Exception as err:
        log.error(output_messages['ERROR_COULD_NOT_FIND_FILE'] % file_name, class_name=METADATA_CLASS_NAME)
        raise err
def list_tag(self, spec):
    """Print every metadata tag recorded for *spec*."""
    try:
        metadata_path = get_metadata_path(self.__config, self.__repo_type)
        meta = Metadata(spec, metadata_path, self.__config, self.__repo_type)
        for tag in meta.list_tags(spec):
            print(tag)
    except Exception as err:
        log.error(err, class_name=REPOSITORY_CLASS_NAME)
        return
def check_successfully_clone(project_dir, git_dir):
    """Move into *project_dir* and confirm it is a valid ml-git root; tear down on failure."""
    os.chdir(project_dir)
    try:
        get_root_path()
    except RootPathException:
        # Incomplete clone: drop both the workspace and the git directory.
        clear(project_dir)
        log.error('Wrong minimal configuration files!', class_name=ADMIN_CLASS_NAME)
        clear(git_dir)
        return False
    return True
def validate_sample(sampling):
    """Check that a sampling spec carries the required keys; log and return False otherwise."""
    if 'group' in sampling or 'random' in sampling:
        if 'seed' in sampling:
            return True
        log.error('It is necessary to pass the attribute \'seed\' in \'sampling\'. Example: {\'group\': \'1:2\', '
                  '\'seed\': \'10\'}.')
        return False
    if 'range' in sampling:
        return True
    log.error('To use the sampling option, you must pass a valid type of sampling (group, '
              'random or range).')
    return False
def incr_version(file, repo_type=DATASETS):
    """Increment the entity version inside spec *file*.

    Returns the new version number, or -1 when the spec version is invalid.
    """
    spec = utils.yaml_load(file)
    key = get_spec_key(repo_type)
    if not is_valid_version(spec, key):
        log.error(output_messages['ERROR_INVALID_VERSION_INCREMENT'] % file, class_name=ML_GIT_PROJECT_NAME)
        return -1
    spec[key]['version'] += 1
    utils.yaml_save(spec, file)
    log.debug(output_messages['DEBUG_VERSION_INCREMENTED_TO'] % spec[key]['version'], class_name=ML_GIT_PROJECT_NAME)
    return spec[key]['version']
def storage_add(storage_type, bucket, credentials_profile, global_conf=False, endpoint_url=None, sftp_configs=None):
    """Register a storage bucket in the ml-git configuration file.

    Args:
        storage_type: one of the StorageType values (s3/s3h/gdriveh/sftph/...).
        bucket: bucket (or container) name to register.
        credentials_profile: AWS profile for S3 storages, credentials file path for gdrive.
        global_conf: write to the global config instead of the local one.
        endpoint_url: custom endpoint for S3/SFTP storages.
        sftp_configs: dict with 'username', 'private_key' and 'port' for sftph.
    """
    if not valid_storage_type(storage_type):
        return
    s3_types = (StorageType.S3.value, StorageType.S3H.value)
    # FIX: only S3-backed storages have a region; previously get_bucket_region was
    # called for every storage type, triggering a pointless AWS lookup for
    # gdrive/sftp buckets.
    region = None
    if storage_type in s3_types:
        try:
            region = get_bucket_region(bucket, credentials_profile)
        except Exception:
            region = None
    if storage_type not in s3_types or credentials_profile is None:
        log.info(output_messages['INFO_ADD_STORAGE_WITHOUT_PROFILE'] % (storage_type, bucket), class_name=ADMIN_CLASS_NAME)
    else:
        log.info(output_messages['INFO_ADD_STORAGE'] % (storage_type, bucket, credentials_profile), class_name=ADMIN_CLASS_NAME)
    try:
        file = get_config_path(global_conf)
        conf = yaml_load(file)
    except Exception as e:
        log.error(e, class_name=ADMIN_CLASS_NAME)
        return
    # Ensure the nested storage sections exist, then (re)create this bucket's entry.
    storages = conf.setdefault(STORAGE_CONFIG_KEY, {}).setdefault(storage_type, {})
    bucket_conf = storages[bucket] = {}
    if storage_type in s3_types:
        bucket_conf['aws-credentials'] = {'profile': credentials_profile}
        bucket_conf['region'] = region
        bucket_conf['endpoint-url'] = endpoint_url
    elif storage_type == StorageType.GDRIVEH.value:
        bucket_conf['credentials-path'] = credentials_profile
    elif storage_type == StorageType.SFTPH.value:
        bucket_conf['endpoint-url'] = endpoint_url
        bucket_conf['username'] = sftp_configs['username']
        bucket_conf['private-key'] = sftp_configs['private_key']
        bucket_conf['port'] = sftp_configs['port']
    yaml_save(conf, file)
def _fetch(self, tag, samples, retries=2, bare=False):
    """Fetch the blobs referenced by *tag* into the local object store."""
    try:
        objects_path = get_objects_path(self.__config, self.__repo_type)
        metadata_path = get_metadata_path(self.__config, self.__repo_type)
        # check if no data left untracked/uncommitted. otherwise, stop.
        repository = LocalRepository(self.__config, objects_path, self.__repo_type)
        return repository.fetch(metadata_path, tag, samples, retries, bare)
    except Exception as err:
        log.error(err, class_name=REPOSITORY_CLASS_NAME)
        return
def _is_valid_hashpath(self, path, file):
    """Tell whether *file* sits in the directory derived from its hash (see _get_hashpath)."""
    expected = self._get_hashpath(file)
    found = os.path.join(path, file)
    if expected.lower() == found.lower():
        return True
    log.error(output_messages['ERROR_CHUNK_WRONG_DIRECTORY'] % (expected, found), class_name=HASH_FS_CLASS_NAME)
    return False
def update_store_spec(repotype, artefact_name, store_type, bucket):
    """Point the artefact's spec manifest at a new store (``store_type://bucket``).

    Loads <root>/<repotype>/<artefact_name>/<artefact_name>.spec, rewrites its
    manifest 'store' entry, and saves the file back in place.
    """
    try:
        path = get_root_path()
    except Exception as e:
        # FIX: the log call used the wrong keyword (CLASS_NAME instead of
        # class_name), and execution used to continue with path=None, which
        # crashed in os.path.join below. Bail out after logging instead.
        log.error(e, class_name=ML_GIT_PROJECT_NAME)
        return
    spec_path = os.path.join(path, repotype, artefact_name, artefact_name + '.spec')
    spec_hash = utils.yaml_load(spec_path)
    spec_hash[repotype]['manifest']['store'] = store_type + '://' + bucket
    utils.yaml_save(spec_hash, spec_path)
    return
def bucket_exists(self):
    """Probe the configured S3 bucket with a HEAD request, mapping common errors to friendly messages."""
    try:
        self._storage.meta.client.head_bucket(Bucket=self._bucket)
        return True
    except ClientError as e:
        code = e.response['Error']['Code']
        if code == '404':
            message = output_messages['ERROR_BUCKET_DOES_NOT_EXIST'] % self._bucket
        elif code == '403':
            message = output_messages['ERROR_AWS_KEY_NOT_EXIST']
        else:
            message = e.response['Error']['Message']
        log.error(message, class_name=STORAGE_FACTORY_CLASS_NAME)
        return False
def get(self, file_path, reference):
    """Download blob *reference* into *file_path* and verify its checksum."""
    try:
        blob_client = self._storage.get_blob_client(container=self._bucket, blob=reference)
        with open(file_path, 'wb') as out:
            content = blob_client.download_blob().readall()
            out.write(content)
        if not self.check_integrity(reference, self.digest(content)):
            return False
    except Exception as err:
        log.error(err, class_name=AZURE_STORAGE_NAME)
        return False
    return True
def commit_metadata(self, index_path, tags, commit_msg, changed_files, mutability, ws_path):
    """Commit the staged metadata for this spec and tag the resulting commit.

    Returns:
        (tag, sha) as strings on success; (None, None) when the metadata path
        cannot be resolved, there is nothing to commit, writing the metadata
        fails, or the tag already exists.
    """
    spec_file = os.path.join(index_path, 'metadata', self._spec, self._spec + SPEC_EXTENSION)
    full_metadata_path, categories_sub_path, metadata = self._full_metadata_path(spec_file)
    log.debug('Metadata path [%s]' % full_metadata_path, class_name=METADATA_CLASS_NAME)
    if full_metadata_path is None:
        return None, None
    elif categories_sub_path is None:
        return None, None
    ensure_path_exists(full_metadata_path)
    # __commit_manifest reports False when there are no staged files for this spec.
    ret = self.__commit_manifest(full_metadata_path, index_path, changed_files, mutability)
    if ret is False:
        log.info('No files to commit for [%s]' % self._spec, class_name=METADATA_CLASS_NAME)
        return None, None
    try:
        self.__commit_metadata(full_metadata_path, index_path, metadata, tags, ws_path)
    except Exception:
        # Any failure while writing the metadata aborts the whole commit.
        return None, None
    # generates a tag to associate to the commit
    tag = self.metadata_tag(metadata)
    # check if tag already exists in the ml-git repository
    tags = self._tag_exists(tag)
    if len(tags) > 0:
        log.error(
            'Tag [%s] already exists in the ml-git repository. '
            'Consider using --bumpversion parameter to increment the version number for your dataset.' % tag,
            class_name=METADATA_CLASS_NAME)
        for t in tags:
            log.error('\t%s' % t)
        return None, None
    if commit_msg is not None and len(commit_msg) > 0:
        msg = commit_msg
    else:
        # generates a commit message
        msg = self.metadata_message(metadata)
    log.debug('Commit message [%s]' % msg, class_name=METADATA_CLASS_NAME)
    sha = self.commit(categories_sub_path, msg)
    self.tag_add(tag)
    return str(tag), str(sha)
def _is_spec_valid(self, spec_path):
    """Validate the spec file's structure and its bucket name."""
    spec_file = yaml_load(spec_path)
    if not validate_spec_hash(spec_file, self.__repo_type):
        log.error(
            'Invalid %s spec in %s. It should look something like this:\n%s'
            % (self.__repo_type, spec_path, get_sample_spec_doc('somebucket', self.__repo_type)),
            class_name=REPOSITORY_CLASS_NAME)
        return False
    if not validate_bucket_name(spec_file[self.__repo_type], self.__config):
        return False
    return True
def get_account(self):
    """Resolve the Azure connection string from the environment or the azure-cli config file."""
    connection_string = os.getenv('AZURE_STORAGE_CONNECTION_STRING')
    if connection_string is not None:
        return connection_string
    try:
        # Fall back to the azure-cli configuration (~/.azure/config).
        config_file = os.path.join(os.path.expanduser(os.path.join('~', '.azure')), 'config')
        connection = toml.load(config_file)[STORAGE_SPEC_KEY]['connection_string']
        if connection != '':
            return connection
    except Exception:
        log.debug(output_messages['DEBUG_AZURE_CLI_NOT_FIND'], class_name=AZURE_STORAGE_NAME)
    log.error(output_messages['ERROR_AZURE_CREDENTIALS_NOT_FOUND'], class_name=AZURE_STORAGE_NAME)
def update(self):
    """Pull the latest metadata from the remote repository."""
    try:
        metadata_path = get_metadata_path(self.__config, self.__repo_type)
        Metadata('', metadata_path, self.__config, self.__repo_type).update()
    except GitError as error:
        log.error(
            'Could not update metadata. Check your remote configuration. %s' % error.stderr,
            class_name=REPOSITORY_CLASS_NAME)
    except Exception as err:
        log.error(err, class_name=REPOSITORY_CLASS_NAME)
def _check_corrupted_files(self, spec, repo):
    """Warn the user about files that cannot be added because they are corrupted."""
    try:
        corrupted = repo.get_corrupted_files(spec)
        if not corrupted:
            return
        print('\n')
        log.warn(
            'The following files cannot be added because they are corrupted:',
            class_name=REPOSITORY_CLASS_NAME)
        for name in corrupted:
            print('\t %s' % name)
    except Exception as err:
        log.error(err, class_name=REPOSITORY_CLASS_NAME)
        return
def update_storage_spec(repo_type, artifact_name, storage_type, bucket, entity_dir=''):
    """Point the artifact's spec manifest at a new storage (``storage_type://bucket``).

    Loads <root>/<repo_type>/<entity_dir>/<artifact_name>/<artifact_name>.spec,
    rewrites its manifest storage entry, and saves the file back in place.
    """
    try:
        path = get_root_path()
    except Exception as e:
        # FIX: the log call used the wrong keyword (CLASS_NAME instead of
        # class_name), and execution used to continue with path=None, which
        # crashed in os.path.join below. Bail out after logging instead.
        log.error(e, class_name=ML_GIT_PROJECT_NAME)
        return
    spec_path = os.path.join(path, repo_type, entity_dir, artifact_name, artifact_name + SPEC_EXTENSION)
    spec_hash = utils.yaml_load(spec_path)
    entity_spec_key = get_spec_key(repo_type)
    spec_hash[entity_spec_key]['manifest'][STORAGE_SPEC_KEY] = storage_type + '://' + bucket
    utils.yaml_save(spec_hash, spec_path)
    return
def commit_metadata(self, index_path, tags, commit_msg, changed_files, mutability, ws_path):
    """Commit the staged metadata for this spec and create its associated tag.

    Returns:
        (tag, sha) as strings on success; (None, None) when the metadata path
        cannot be resolved, there is nothing to commit, writing the metadata
        fails, or the tag already exists.
    """
    spec_file = os.path.join(index_path, 'metadata', self._spec, self._spec + SPEC_EXTENSION)
    full_metadata_path, entity_sub_path, metadata = self._full_metadata_path(spec_file)
    log.debug(output_messages['DEBUG_METADATA_PATH'] % full_metadata_path, class_name=METADATA_CLASS_NAME)
    if full_metadata_path is None:
        return None, None
    elif entity_sub_path is None:
        return None, None
    ensure_path_exists(full_metadata_path)
    # __commit_manifest reports False when there are no staged files for this spec.
    ret = self.__commit_manifest(full_metadata_path, index_path, changed_files, mutability)
    if ret is False:
        log.info(output_messages['INFO_NO_FILES_COMMIT_FOR'] % self._spec, class_name=METADATA_CLASS_NAME)
        return None, None
    try:
        self.__commit_metadata(full_metadata_path, index_path, metadata, tags, ws_path)
    except Exception:
        # Any failure while writing the metadata aborts the whole commit.
        return None, None
    # generates a tag to associate to the commit
    tag = self.metadata_tag(metadata)
    # check if tag already exists in the ml-git repository
    tags = self._tag_exists(tag)
    if len(tags) > 0:
        log.error(output_messages['ERROR_TAG_ALREADY_EXISTS_CONSIDER_USER_VERSION'] % tag, class_name=METADATA_CLASS_NAME)
        for t in tags:
            log.error(output_messages['ERROR_METADATA_MESSAGE'] % t)
        return None, None
    if commit_msg is not None and len(commit_msg) > 0:
        msg = commit_msg
    else:
        # generates a commit message
        msg = self.metadata_message(metadata)
    log.debug(output_messages['DEBUG_COMMIT_MESSAGE'] % msg, class_name=METADATA_CLASS_NAME)
    sha = self.commit(entity_sub_path, msg)
    self.tag_add(tag)
    return str(tag), str(sha)
def get_by_id(self, file_path, file_id):
    """Download the drive file with id *file_id* into directory *file_path*."""
    try:
        file_info = self._store.files().get(fileId=file_id).execute()
    except errors.HttpError as error:
        log.error('%s' % error, class_name=GDRIVE_STORE)
        return False
    if not file_info:
        log.error('[%s] not found.' % file_id, class_name=GDRIVE_STORE)
        return False
    target = os.path.join(file_path, file_info.get('name'))
    self.download_file(target, file_info)
    return True
def _full_metadata_path(self, spec_file):
    """Resolve the on-disk metadata directory for *spec_file* from its categories.

    Returns (full_path, categories_sub_path, metadata), or (None, None, None)
    when the spec is empty or has no categories.
    """
    log.debug('Getting subpath from categories in specfile [%s]' % spec_file, class_name=METADATA_CLASS_NAME)
    spec_data = yaml_load(spec_file)
    if spec_data == {}:
        log.error('The entity name passed it\'s wrong. Please check again', class_name=METADATA_CLASS_NAME)
        return None, None, None
    sub_path = self.metadata_subpath(spec_data)
    if sub_path is None:
        log.error('You must place at least one category in the entity .spec file', class_name=METADATA_CLASS_NAME)
        return None, None, None
    return os.path.join(self.__path, sub_path), sub_path, spec_data
def _is_valid_hashpath(self, path, file):
    """Tell whether *file* sits in the directory derived from its hash (see _get_hashpath)."""
    expected = self._get_hashpath(file)
    found = os.path.join(path, file)
    if expected.lower() == found.lower():
        return True
    log.error(
        'Chunk found in wrong directory. Expected [%s]. Found [%s]' % (expected, found),
        class_name=HASH_FS_CLASS_NAME)
    return False
def remote_del(repo_type, global_conf=False):
    """Clear the git remote configured for *repo_type* in the ml-git config file."""
    config_file = get_config_path(global_conf)
    conf = yaml_load(config_file)
    if repo_type not in conf:
        log.error(output_messages['ERROR_ENTITY_NOT_FOUND'] % repo_type, class_name=ADMIN_CLASS_NAME)
        return
    git_url = conf[repo_type]['git']
    if not git_url:
        log.error(output_messages['ERROR_REMOTE_UNCONFIGURED'] % repo_type, class_name=ADMIN_CLASS_NAME)
        return
    log.info(output_messages['INFO_REMOVE_REMOTE'] % (git_url, repo_type), class_name=ADMIN_CLASS_NAME)
    conf[repo_type]['git'] = ''
    yaml_save(conf, config_file)
def list_tags(self, spec, full_info=False):
    """Collect the repository tags that belong to *spec*.

    With *full_info* the GitPython tag objects are returned; otherwise plain
    tag names sorted by creation date.
    """
    found = []
    try:
        repo = Repo(self.__path)
        candidates = repo.tags if full_info else repo.git.tag(sort='creatordate').split('\n')
        for candidate in candidates:
            if f'__{spec}__' in str(candidate):
                found.append(candidate)
    except Exception:
        log.error('Invalid ml-git repository!', class_name=METADATA_MANAGER_CLASS_NAME)
    return found