# Constructor fragments from companion reader/report classes (their class
# bodies are not part of this section):

def __init__(self, source_dremio, config):
	self._config = config
	self._dremio_env = source_dremio
	self._logger = DremioClonerLogger(self._config.max_errors, self._config.logging_verbose)
	self._utils = DremioClonerUtils(config)
	self._filter = DremioClonerFilter(config)

def __init__(self, source_dremio, config):
	self._config = config
	self._dremio_env = source_dremio
	self._delimeter = self._config.report_csv_delimiter
	self._newline = self._config.report_csv_newline
	self._logger = DremioClonerLogger(self._config.max_errors, self._config.logging_verbose)
	self._utils = DremioClonerUtils(config)
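# Hedged usage sketch (not part of the original sources): a typical 'put' run
# wires the classes below together roughly like this. The objects providing
# the target environment API and the source data are assumed from the wider
# project and are not defined in this file:
#
#   config = DremioClonerConfig("dremio_cloner_conf.json")
#   writer = DremioWriter(target_env, source_data, config)
#   writer.write_dremio_environment()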
class DremioWriter:

	# Dremio Cloner Config, Utils, ...
	_config = None
	_utils = None
	_logger = None
	_filter = None

	# Dremio Environment to write to
	_dremio_env = None

	# Dremio Data to write
	_d = None

	# VDS list grouped by hierarchy
	_vds_hierarchy = []
	_hierarchy_depth = 0
	_unresolved_vds = []

	# Referenced Users and Groups in the target environment
	_target_dremio_users = []
	_target_dremio_groups = []

	# Resolved Datasets for Reflections
	_existing_reflections = list()

	# Dry run collections
	_dry_run_processed_vds_list = []
	_dry_run_processed_pds_list = []

	def __init__(self, target_dremio, dremio_data, config):
		self._config = config
		self._dremio_env = target_dremio
		self._d = dremio_data
		self._logger = DremioClonerLogger(self._config.max_errors, self._config.logging_verbose)
		self._filter = DremioClonerFilter(config)
		self._utils = DremioClonerUtils(config)

	def write_dremio_environment(self):
		self._retrieve_users_groups()
		if self._config.acl_transformation != {} and self._d.referenced_users == [] and self._d.referenced_groups == []:
			self._logger.warn("ACL Transformation has been defined while Referenced Users and Referenced Groups are not present in the Source Dremio Data.")
		if self._config.reflection_process_mode != 'skip':
			self._existing_reflections = self._dremio_env.list_reflections()['data']
		if self._config.source_process_mode == 'skip':
			self._logger.info("write_dremio_environment: Skipping source processing due to configuration source.process_mode=skip.")
		else:
			for source in self._d.sources:
				self._write_source(source, self._config.source_process_mode, self._config.source_ignore_missing_acl_user, self._config.source_ignore_missing_acl_group)
		if self._config.pds_process_mode == 'skip':
			self._logger.info("write_dremio_environment: Skipping source PDS processing due to configuration source.pds.process_mode=skip.")
		else:
			for pds in self._d.pds_list:
				self._write_pds(pds, self._config.pds_process_mode, self._config.pds_ignore_missing_acl_user, self._config.pds_ignore_missing_acl_group)
		if self._config.space_process_mode == 'skip':
			self._logger.info("write_dremio_environment: Skipping space processing due to configuration space.process_mode=skip.")
		else:
			for space in self._d.spaces:
				self._write_space(space, self._config.space_process_mode, self._config.space_ignore_missing_acl_user, self._config.space_ignore_missing_acl_group)
		if self._config.folder_process_mode == 'skip':
			self._logger.info("write_dremio_environment: Skipping folder processing due to configuration folder.process_mode=skip.")
		else:
			for folder in self._d.folders:
				self._write_folder(folder, self._config.folder_process_mode, self._config.folder_ignore_missing_acl_user, self._config.folder_ignore_missing_acl_group)
		if self._config.vds_process_mode == 'skip':
			self._logger.info("write_dremio_environment: Skipping VDS processing due to configuration vds.process_mode=skip.")
		else:
			self._order_vds(0)
			self._write_vds_hierarchy()
			self._write_remainder_vds()
		if self._config.reflection_process_mode == 'skip':
			self._logger.info("write_dremio_environment: Skipping reflection processing due to configuration reflection.process_mode=skip.")
		else:
			for reflection in self._d.reflections:
				self._write_reflection(reflection, self._config.reflection_process_mode)
		if self._config.reflection_refresh_mode != 'refresh':
			self._logger.info("write_dremio_environment: Skipping reflection refresh due to configuration reflection.refresh_mode=skip.")
		else:
			for pds in self._d.pds_list:
				self._dremio_env.refresh_reflections_by_pds_path(self._utils.normalize_path(pds['path']), self._config.dry_run)
		if self._config.wiki_process_mode == 'skip':
			self._logger.info("write_dremio_environment: Skipping wiki processing due to configuration wiki.process_mode=skip.")
		else:
			for wiki in self._d.wikis:
				self._write_wiki(wiki, self._config.wiki_process_mode)
		if self._config.tag_process_mode == 'skip':
			self._logger.info("write_dremio_environment: Skipping tag processing due to configuration tag.process_mode=skip.")
		else:
			for tags in self._d.tags:
				self._write_tags(tags, self._config.tag_process_mode)

	def _write_space(self, entity, process_mode, ignore_missing_acl_user_flag, ignore_missing_acl_group_flag):
		if self._filter.match_space_filter(entity):
			self._logger.debug("_write_space: processing entity: " + self._utils.get_entity_desc(entity))
			return self._write_entity(entity, process_mode, ignore_missing_acl_user_flag, ignore_missing_acl_group_flag)
		else:
			self._logger.debug("_write_space: skipping entity: " + self._utils.get_entity_desc(entity))
			return None

	def _write_source(self, entity, process_mode, ignore_missing_acl_user_flag, ignore_missing_acl_group_flag):
		if self._filter.match_source_filter(entity):
			self._logger.debug("_write_source: processing entity: " + self._utils.get_entity_desc(entity))
			return self._write_entity(entity, process_mode, ignore_missing_acl_user_flag, ignore_missing_acl_group_flag)
		else:
			self._logger.debug("_write_source: skipping entity: " + self._utils.get_entity_desc(entity))
			return None

	def _write_folder(self, entity, process_mode, ignore_missing_acl_user_flag, ignore_missing_acl_group_flag):
		# Drop ACL for HOME folders
		if entity['path'][0][:1] == '@' and 'accessControlList' in entity:
			entity.pop("accessControlList")
		# Do not apply space.folder.filter to Home folders
		if entity['path'][0][:1] == '@' or self._filter.match_space_folder_filter(entity):
			self._logger.debug("_write_folder: processing entity: " + self._utils.get_entity_desc(entity))
			return self._write_entity(entity, process_mode, ignore_missing_acl_user_flag, ignore_missing_acl_group_flag)
		else:
			self._logger.debug("_write_folder: skipping entity: " + self._utils.get_entity_desc(entity))
			return None

	def _retrieve_users_groups(self):
		for user in self._d.referenced_users:
			target_user = self._dremio_env.get_user_by_name(user['name'])
			if target_user is not None:
				self._target_dremio_users.append(target_user)
			else:
				self._logger.error("_retrieve_users_groups: Unable to resolve user in target Dremio environment: " + str(user['name']))
		for group in self._d.referenced_groups:
			target_group = self._dremio_env.get_group_by_name(group['name'])
			if target_group is not None:
				self._target_dremio_groups.append(target_group)
			else:
				self._logger.error("_retrieve_users_groups: Unable to resolve group in target Dremio environment: " + str(group['name']))
		# Retrieve acl transformation target users and groups
		for item in self._config.acl_transformation:
			if 'user' in item['target']:
				user = self._dremio_env.get_user_by_name(item['target']['user'])
				if user is not None:
					# don't worry about dups
					self._target_dremio_users.append(user)
				else:
					self._logger.error("_retrieve_users_groups: Unable to resolve ACL_TRANSFORMATION user in target Dremio environment: " + str(item['target']['user']))
			if 'group' in item['target']:
				group = self._dremio_env.get_group_by_name(item['target']['group'])
				if group is not None:
					# don't worry about dups
					self._target_dremio_groups.append(group)
				else:
					self._logger.error("_retrieve_users_groups: Unable to resolve ACL_TRANSFORMATION group in target Dremio environment: " + str(item['target']['group']))
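	# Hedged sketch of the acl_transformation structure consumed above and in
	# _find_acl_transformation_by_username / _find_acl_transformation_by_groupname.
	# This is the list stored under the "acl-transformation" key of the mapping
	# file; principal names are illustrative:
	#
	#   [{"source": {"user": "etl_svc"},
	#     "target": {"group": "etl_admins"},
	#     "permission-mapping": [{"READ": "READ"}, {"WRITE": "WRITE"}]},
	#    {"source": {"group": "temp_users"}, "target": {"REMOVE": true}}]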
	def _write_vds_hierarchy(self):
		for level in range(0, self._hierarchy_depth):
			for item in self._vds_hierarchy:
				if item[0] == level:
					vds = item[1]
					if self._filter.match_vds_filter(vds):
						self._logger.debug("_write_vds_hierarchy: writing vds: " + self._utils.get_entity_desc(vds))
						self._write_entity(vds, self._config.vds_process_mode, self._config.vds_ignore_missing_acl_user, self._config.vds_ignore_missing_acl_group)

	def _write_remainder_vds(self):
		if not self._d.vds_list and not self._unresolved_vds:
			return
		else:
			self._logger.info("_write_remainder_vds: Attempt processing VDSs that failed ordering.")
		# Attempt to process max_hierarchy_depth
		for h in range(1, self._config.vds_max_hierarchy_depth):
			# These are VDSs that have all dependencies validated but could not be placed in the hierarchy
			# Go with decreasing index so we can remove VDS from the list
			for i in range(len(self._d.vds_list) - 1, -1, -1):
				vds = self._d.vds_list[i]
				if self._filter.match_vds_filter(vds):
					self._logger.debug("_write_remainder_vds: writing vds: " + self._utils.get_entity_desc(vds))
					if self._write_entity(vds, self._config.vds_process_mode, self._config.vds_ignore_missing_acl_user, self._config.vds_ignore_missing_acl_group, False):
						self._d.vds_list.remove(vds)
				else:
					self._d.vds_list.remove(vds)
			# Iterate through the remainder of unresolved VDS in the list
			# Go with decreasing index so we can remove VDS from the list
			for i in range(len(self._unresolved_vds) - 1, -1, -1):
				vds = self._unresolved_vds[i]
				if self._filter.match_vds_filter(vds):
					self._logger.debug("_write_remainder_vds: writing vds: " + self._utils.get_entity_desc(vds))
					if self._write_entity(vds, self._config.vds_process_mode, self._config.vds_ignore_missing_acl_user, self._config.vds_ignore_missing_acl_group, False):
						self._unresolved_vds.remove(vds)
				else:
					self._unresolved_vds.remove(vds)
		if self._d.vds_list != [] or self._unresolved_vds != []:
			self._logger.warn('_write_remainder_vds: After attempting to process VDSs that failed ordering, the following VDSs still failed. Set log level to DEBUG and see prior error messages for more information.')
			for vds in self._d.vds_list:
				self._logger.error("Failed VDS: " + str(vds['path']))
			for vds in self._unresolved_vds:
				self._logger.error("Failed VDS: " + str(vds['path']))
		else:
			self._logger.warn("_write_remainder_vds: Finished processing VDSs that failed ordering. All VDSs have been successfully processed.")

	def _write_user(self):
		if self._config.user_process_mode == 'skip':
			self._logger.info("_write_user: Skipping user processing due to configuration user.process_mode=skip.")
			return True
		self._logger.error("_write_user: Cannot create users. API is not implemented.")
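	# Hedged sketch of the catalog entity shape handled by _write_entity below
	# (key names are the ones the method touches; values are illustrative):
	#
	#   {"entityType": "space", "name": "Analytics",
	#    "accessControlList": {"version": "0",
	#        "users": [{"id": "user-id", "permissions": ["READ", "WRITE"]}],
	#        "groups": [{"id": "group-id", "permissions": ["READ"]}]}}
	#
	# "id", "tag", "children" and "createdAt" are stripped before create/update;
	# "tag" and the ACL "version" are re-copied from the target for concurrency.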
	def _write_entity(self, entity, process_mode, ignore_missing_acl_user_flag, ignore_missing_acl_group_flag, report_error=True):
		self._logger.debug("_write_entity: processing entity: " + self._utils.get_entity_desc(entity))
		# Clean up the definition
		if 'id' in entity:
			entity.pop("id")
		if 'tag' in entity:
			entity.pop("tag")
		if 'children' in entity:
			entity.pop("children")
		if 'createdAt' in entity:
			entity.pop("createdAt")
		# Process ACL as needed
		if not self._process_acl(entity, ignore_missing_acl_user_flag, ignore_missing_acl_group_flag):
			# Skip this entity due to ACL processing errors
			self._logger.info("_write_entity: Skipping entity due to ignore_missing_acl_user_flag, ignore_missing_acl_group_flag: " + self._utils.get_entity_desc(entity))
			return False
		# Check if the entity already exists
		existing_entity = self._read_entity_definition(entity)
		# Ensure we have not received FOLDER instead of DATASET. See DX-16666
		if existing_entity is not None and 'entityType' in entity and \
				'entityType' in existing_entity and entity['entityType'] != existing_entity['entityType']:
			existing_entity = None
		if existing_entity is None:  # Need to create new entity
			if process_mode == 'update_only':
				self._logger.info("_write_entity: Skipping entity creation due to configuration process_mode=update_only. " + self._utils.get_entity_desc(entity))
				return True
			# Reset version for proper concurrency
			if 'accessControlList' in entity:
				entity['accessControlList']['version'] = "0"
			if self._config.dry_run:
				self._logger.warn("_write_entity: Dry Run, NOT Creating entity: " + self._utils.get_entity_desc(entity))
				# For dry run, keep it in a separate collection to suppress errors
				if self._utils.is_vds(entity):
					self._dry_run_processed_vds_list.append(entity)
				return False
			# Note for the CE target env, the ACL should have been popped out by _process_acl
			new_entity = self._dremio_env.create_catalog_entity(entity, self._config.dry_run)
			if new_entity is None:
				if report_error:
					self._logger.error("_write_entity: could not create entity: " + self._utils.get_entity_desc(entity))
				else:
					self._logger.debug("_write_entity: could not create entity: " + self._utils.get_entity_desc(entity))
				return False
		else:  # Entity already exists in the target environment
			if process_mode == 'create_only':
				self._logger.info("_write_entity: Found existing entity and process_mode is set to create_only. Skipping entity: " + self._utils.get_entity_desc(entity))
				return True
			self._logger.debug("_write_entity: Overwriting entity definition as per process_mode configuration: " + self._utils.get_entity_desc(entity))
			# Update entity definition with data from entity existing in the target environment
			entity['id'] = existing_entity['id']
			# Tag from the entity existing in the target environment required for proper concurrency control
			entity['tag'] = existing_entity['tag']
			# Update ACL version for proper concurrency control, but do not use ACL if not really needed as HOME folders are not allowed to have ACL
			if ('path' in entity and entity['path'][0][:1] == '@') or ('name' in entity and entity['name'][:1] == '@'):
				if 'accessControlList' in entity:
					entity.pop('accessControlList')
			else:
				# Note for the CE target env, the ACL should have been popped out by _process_acl
				if not self._config.target_ce:
					if 'accessControlList' not in entity:
						entity['accessControlList'] = {"version": "0"}
					# API changed behavior around version 4 and may not return version attribute for ACL.
					if 'accessControlList' in existing_entity and 'version' in existing_entity['accessControlList']:
						entity['accessControlList']['version'] = existing_entity['accessControlList']['version']
			if self._config.dry_run:
				self._logger.warn("_write_entity: Dry Run, NOT Updating entity: " + self._utils.get_entity_desc(entity))
				return False
			updated_entity = self._dremio_env.update_catalog_entity(entity['id'], entity, self._config.dry_run, report_error)
			if updated_entity is None:
				if report_error:
					self._logger.error("_write_entity: Error updating entity: " + self._utils.get_entity_desc(entity))
				else:
					self._logger.debug("_write_entity: Error updating entity: " + self._utils.get_entity_desc(entity))
				return False
		return True

	def _write_pds(self, entity, process_mode, ignore_missing_acl_user_flag, ignore_missing_acl_group_flag):
		self._logger.debug("_write_pds: processing entity: " + self._utils.get_entity_desc(entity))
		if self._filter.match_pds_filter(entity):
			existing_entity = self._read_entity_definition(entity)
			if existing_entity is None:
				self._logger.error("_write_pds: Cannot find existing entity for PDS Entity. Either Folder, File, or PDS must exist prior to promoting or updating PDS. Source PDS: " + self._utils.get_entity_desc(entity))
				return False
			# Check if PDS needs to be promoted first
			if 'type' not in existing_entity or existing_entity['type'] != 'PHYSICAL_DATASET':
				self._promote_pds(entity, ignore_missing_acl_user_flag, ignore_missing_acl_group_flag)
			# Update PDS now
			self._logger.debug("_write_pds: writing pds: " + self._utils.get_entity_desc(entity))
			self._write_entity(entity, process_mode, ignore_missing_acl_user_flag, ignore_missing_acl_group_flag)
		else:
			return None

	def _promote_pds(self, entity, ignore_missing_acl_user_flag, ignore_missing_acl_group_flag):
		self._logger.debug("_promote_pds: processing entity: " + self._utils.get_entity_desc(entity))
		# Clean up the definition
		if 'id' in entity:
			entity.pop("id")
		if 'tag' in entity:
			entity.pop("tag")
		if 'children' in entity:
			entity.pop("children")
		if 'createdAt' in entity:
			entity.pop("createdAt")
		# Process ACL as needed
		if not self._process_acl(entity, ignore_missing_acl_user_flag, ignore_missing_acl_group_flag):
			# Skip this entity due to ACL processing errors
			self._logger.error("_promote_pds: Skipping PDS due to an error in ACL processing: " + self._utils.get_entity_desc(entity))
			return False
		# Read existing folder or file entity
		fs_entity = self._read_entity_definition(entity)
		if fs_entity is None:
			self._logger.error("_promote_pds: Skipping PDS. Cannot find folder or file for PDS Entity: " + self._utils.get_entity_desc(entity))
			return False
		# Add Folder ID to PDS Entity
		entity['id'] = fs_entity['id']
		if 'accessControlList' in entity:
			entity.pop('accessControlList')
		if self._config.dry_run:
			self._logger.warn("_promote_pds: Dry Run, NOT promoting pds: " + self._utils.get_entity_desc(entity))
			return True
		self._logger.debug("_promote_pds: promoting pds: " + self._utils.get_entity_desc(entity))
		new_pds_entity = self._dremio_env.promote_pds(entity, self._config.dry_run)
		if new_pds_entity is None:
			self._logger.error("_promote_pds: Error promoting PDS: " + self._utils.get_entity_desc(entity))
			return False
		return True
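	# Hedged illustration of the promotion flow above: given a source PDS entity
	# (names are made up), _promote_pds looks up the matching folder/file in the
	# target, grafts its id onto the entity, and posts it via promote_pds():
	#
	#   pds = {"entityType": "dataset", "type": "PHYSICAL_DATASET",
	#          "path": ["s3_source", "sales", "orders.parquet"],
	#          "format": {"type": "Parquet"}}
	#
	# "format" is part of the Dremio catalog dataset shape; whether it is present
	# in a given source dump depends on how the data was exported.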
Skipping " + self._utils.get_entity_desc(reflection)) return None # make sure there are changes to update as it will invalidate existing reflection data if reflection['type'] == existing_reflection['type'] and \ reflection['name'] == existing_reflection['name'] and \ ('partitionDistributionStrategy' in reflection and reflection['partitionDistributionStrategy'] == existing_reflection['partitionDistributionStrategy']) and \ ('measureFields' in reflection and reflection['measureFields'] == existing_reflection['measureFields']) and \ ('dimensionFields' in reflection and reflection['dimensionFields'] == existing_reflection['dimensionFields']) and \ ('displayFields' in reflection and reflection['displayFields'] == existing_reflection['displayFields']) and \ ('sortFields' in reflection and reflection['sortFields'] == existing_reflection['sortFields']) and \ ('partitionFields' in reflection and reflection['partitionFields'] == existing_reflection['partitionFields']) and \ ('distributionFields' in reflection and reflection['distributionFields'] == existing_reflection['distributionFields']): # Nothing to do self._logger.debug("_write_reflection: No pending changes. Skipping " + self._utils.get_entity_desc(reflection)) return None if self._config.dry_run: self._logger.warn("_write_entity: Dry Run, NOT Updating " + self._utils.get_entity_desc(reflection)) return False self._logger.debug("_write_reflection: Overwriting " + self._utils.get_entity_desc(reflection)) reflection['tag'] = existing_reflection['tag'] updated_reflection = self._dremio_env.update_reflection(existing_reflection['id'], reflection, self._config.dry_run) if updated_reflection is None: self._logger.error("_write_reflection: Error updating " + self._utils.get_entity_desc(reflection)) return False return True def _find_existing_reflection(self, reflection, dataset): for existing_reflection in self._existing_reflections: # Match reflections by name if reflection['name'] == existing_reflection['name']: existing_dataset = self._dremio_env.get_catalog_entity_by_id(existing_reflection['datasetId']) # Match reflections by respective dataset's path if existing_dataset is not None and existing_dataset['path'] == dataset['path']: return existing_reflection return None def _find_existing_dataset_by_path(self, path): return self._dremio_env.get_catalog_entity_by_path(path) # Searches for Users from entity's ACL in the target environment and either: # - removes the user from ACL if not found and ignore_missing_acl_user_flag is set # - returns False if if not found and ignore_missing_acl_user_flag is not set # - updates the ACL with userid from the new environment if User found there def _process_acl(self, entity, ignore_missing_acl_user_flag, ignore_missing_acl_group_flag): self._logger.debug("_process_acl: processing entity: " + self._utils.get_entity_desc(entity)) if 'accessControlList' not in entity: return True if self._config.target_ce: entity.pop('accessControlList') return True acl = entity['accessControlList'] transformed_acl = {"users": [], "groups": []} if 'version' in entity: acl.pop('version') if acl == {} or ('users' not in acl and 'groups' not in acl): pass else: if 'users' in acl: # Note, taking a copy of the list for proper removal of items for user_def in acl['users'][:]: new_acl_principal = self._find_matching_principal_for_userid(user_def['id'], user_def['permissions']) if new_acl_principal == "REMOVE": self._logger.info("_process_acl: Source User " + user_def['id'] + " is removed from ACL definition. 
" + self._utils.get_entity_desc(entity)) elif new_acl_principal is None: if ignore_missing_acl_user_flag: self._logger.warn("_process_acl: Source User " + user_def['id'] + " not found in the target Dremio Environment. User is removed from ACL definition as per ignore_missing_acl_user configuration. " + self._utils.get_entity_desc(entity)) else: self._logger.error("_process_acl: Source User " + user_def['id'] + " not found in the target Dremio Environment. ACL Entry cannot be processed as per ignore_missing_acl_user configuration. " + self._utils.get_entity_desc(entity)) elif "user" in new_acl_principal: transformed_acl['users'].append({"id":new_acl_principal["user"],"permissions":new_acl_principal['permissions'] if "permissions" in new_acl_principal else user_def['permissions']}) elif "group" in new_acl_principal: transformed_acl['groups'].append({"id":new_acl_principal["group"],"permissions":new_acl_principal['permissions'] if "permissions" in new_acl_principal else user_def['permissions']}) if 'groups' in acl: # Note, taking a copy of the list for proper removal of items for group_def in acl['groups'][:]: new_acl_principal = self._find_matching_principal_for_groupid(group_def['id'], group_def['permissions']) if new_acl_principal == "REMOVE": self._logger.info("_process_acl: Source Group " + group_def['id'] + " is removed from ACL definition. " + self._utils.get_entity_desc(entity)) elif new_acl_principal is None: if ignore_missing_acl_group_flag: self._logger.warn("_process_acl: Source Group " + group_def['id'] + " not found in the target Dremio Environment. Group is removed from ACL definition as per ignore_missing_acl_group configuration. " + self._utils.get_entity_desc(entity)) else: # Flag is not set - return error status self._logger.error("_process_acl: Source Group " + group_def['id'] + " not found in the target Dremio Environment. ACL Entry cannot be processed as per ignore_missing_acl_group configuration. 
" + self._utils.get_entity_desc(entity)) elif "user" in new_acl_principal: transformed_acl['users'].append({"id":new_acl_principal["user"],"permissions":new_acl_principal['permissions'] if "permissions" in new_acl_principal else group_def['permissions']}) elif "group" in new_acl_principal: transformed_acl['groups'].append({"id":new_acl_principal["group"],"permissions":new_acl_principal['permissions'] if "permissions" in new_acl_principal else group_def['permissions']}) entity['accessControlList'] = transformed_acl return True def _transform_permissions(self, source_permissions, acl_mapping): # if permission mapping not explicitely defined, use source permissions as is if 'permission-mapping' not in acl_mapping: return source_permissions permissions_mapping = acl_mapping['permission-mapping'] # READ is required for WRITE, so READ is always present in the list of permissions permissions = ["READ"] for permission in source_permissions: for mapping in permissions_mapping: # add only once if permission in mapping and mapping[permission] not in permissions: permissions.append(mapping[permission]) return permissions def _find_matching_principal_for_userid(self, userid, permissions): self._logger.debug("_find_matching_principal_for_userid: processing user_id: " + str(userid)) for user in self._d.referenced_users: if user['id'] == userid: transformed_principal = self._find_acl_transformation_by_username(user['name'], permissions) if transformed_principal == "REMOVE": self._logger.info("_find_matching_principal_for_userid: Source User " + user['name'] + " [" + user['id'] + "] is mapped as NONE.") return "REMOVE" # If no tranformation is defined for this user elif transformed_principal is None: for target_user in self._target_dremio_users: if target_user['name'] == user['name']: return {"user":target_user['id']} elif "error" in transformed_principal: # Something went wrong self._logger.error("_find_matching_principal_for_userid: error " + transformed_principal['error']) return None else: return transformed_principal # If the username is already in the target list (i.e. 
	def _find_matching_principal_for_userid(self, userid, permissions):
		self._logger.debug("_find_matching_principal_for_userid: processing user_id: " + str(userid))
		for user in self._d.referenced_users:
			if user['id'] == userid:
				transformed_principal = self._find_acl_transformation_by_username(user['name'], permissions)
				if transformed_principal == "REMOVE":
					self._logger.info("_find_matching_principal_for_userid: Source User " + user['name'] + " [" + user['id'] + "] is mapped as NONE.")
					return "REMOVE"
				# If no transformation is defined for this user
				elif transformed_principal is None:
					for target_user in self._target_dremio_users:
						if target_user['name'] == user['name']:
							return {"user": target_user['id']}
				elif "error" in transformed_principal:
					# Something went wrong
					self._logger.error("_find_matching_principal_for_userid: error " + transformed_principal['error'])
					return None
				else:
					return transformed_principal
		# If the username is already in the target list (i.e. the mapping already happened
		# but the write_entity failed because parent objects were not yet created) then take username straight from target
		for user in self._target_dremio_users:
			if user['id'] == userid:
				transformed_principal = self._find_acl_transformation_by_username(user['name'], permissions)
				if transformed_principal is None:
					return {"user": user['id']}
				elif "error" in transformed_principal:
					# Something went wrong
					self._logger.error("_find_matching_principal_for_userid: error " + transformed_principal['error'])
					return None
				else:
					return transformed_principal
		return None

	def _find_acl_transformation_by_username(self, username, permissions):
		for item in self._config.acl_transformation:
			if 'user' in item['source'] and item['source']['user'] == username:
				if "REMOVE" in item['target']:
					return "REMOVE"
				elif "user" in item['target']:
					for target_user in self._target_dremio_users:
						if target_user['name'] == item['target']['user']:
							new_permissions = self._transform_permissions(permissions, item)
							return {"user": target_user['id'], "permissions": new_permissions}
				elif "group" in item['target']:
					for target_group in self._target_dremio_groups:
						if target_group['name'] == item['target']['group']:
							new_permissions = self._transform_permissions(permissions, item)
							return {"group": target_group['id'], "permissions": new_permissions}
				# The transformation is defined for this user, however, the target principal is not in the target Dremio Environment
				return {"error": "user_transformation_found_but_target_principal_is_not_in_target_dremio_environment"}
		# If the username is already in the target list (i.e. the mapping already happened
		# but the write_entity failed because parent objects were not yet created) then take username straight from target
		for item in self._config.acl_transformation:
			if 'user' in item['target'] and item['target']['user'] == username:
				for target_user in self._target_dremio_users:
					if target_user['name'] == username:
						new_permissions = self._transform_permissions(permissions, item)
						return {"user": target_user['id'], "permissions": new_permissions}
			if 'group' in item['target'] and item['target']['group'] == username:
				for target_group in self._target_dremio_groups:
					if target_group['name'] == item['target']['group']:
						new_permissions = self._transform_permissions(permissions, item)
						return {"group": target_group['id'], "permissions": new_permissions}
		return None

	def _find_matching_principal_for_groupid(self, groupid, permissions):
		self._logger.debug("_find_matching_principal_for_groupid: processing: " + str(groupid))
		for group in self._d.referenced_groups:
			if group['id'] == groupid:
				transformed_principal = self._find_acl_transformation_by_groupname(group['name'], permissions)
				if transformed_principal == "REMOVE":
					self._logger.info("_find_matching_principal_for_groupid: Source Group " + group['name'] + " [" + group['id'] + "] is mapped as NONE.")
					return "REMOVE"
				# If no transformation is defined for this group
				elif transformed_principal is None:
					for target_group in self._target_dremio_groups:
						if target_group['name'] == group['name']:
							return {"group": target_group['id']}
				elif "error" in transformed_principal:
					# Something went wrong
					self._logger.error("_find_matching_principal_for_groupid: error " + transformed_principal['error'])
					return None
				else:
					return transformed_principal
		# If the group name is already in the target list (i.e. the mapping already happened
		# but the write_entity failed because parent objects were not yet created) then take group name straight from target
		for group in self._target_dremio_groups:
			if group['id'] == groupid:
				transformed_principal = self._find_acl_transformation_by_groupname(group['name'], permissions)
				if transformed_principal is None:
					return {"group": group['id']}
				elif "error" in transformed_principal:
					# Something went wrong
					self._logger.error("_find_matching_principal_for_groupid: error " + transformed_principal['error'])
					return None
				else:
					return transformed_principal
		return None

	def _find_acl_transformation_by_groupname(self, groupname, permissions):
		for item in self._config.acl_transformation:
			if 'group' in item['source'] and item['source']['group'] == groupname:
				if "REMOVE" in item['target']:
					return "REMOVE"
				elif "user" in item['target']:
					for target_user in self._target_dremio_users:
						if target_user['name'] == item['target']['user']:
							new_permissions = self._transform_permissions(permissions, item)
							return {"user": target_user['id'], "permissions": new_permissions}
				elif "group" in item['target']:
					for target_group in self._target_dremio_groups:
						if target_group['name'] == item['target']['group']:
							new_permissions = self._transform_permissions(permissions, item)
							return {"group": target_group['id'], "permissions": new_permissions}
				# The transformation is defined for this group, however, the target principal is not in the target Dremio Environment
				return {"error": "group_transformation_found_but_target_principal_is_not_in_target_dremio_environment"}
		# If the group name is already in the target list (i.e. the mapping already happened
		# but the write_entity failed because parent objects were not yet created) then take group name straight from target
		for item in self._config.acl_transformation:
			if 'user' in item['target'] and item['target']['user'] == groupname:
				for target_user in self._target_dremio_users:
					if target_user['name'] == groupname:
						new_permissions = self._transform_permissions(permissions, item)
						return {"user": target_user['id'], "permissions": new_permissions}
			if 'group' in item['target'] and item['target']['group'] == groupname:
				for target_group in self._target_dremio_groups:
					if target_group['name'] == item['target']['group']:
						new_permissions = self._transform_permissions(permissions, item)
						return {"group": target_group['id'], "permissions": new_permissions}
		return None

	def _read_entity_definition(self, entity):
		self._logger.debug("_read_entity_definition: processing entity: " + self._utils.get_entity_desc(entity))
		if 'name' in entity:
			return self._dremio_env.get_catalog_entity_by_path(entity['name'])
		elif 'path' in entity:
			return self._dremio_env.get_catalog_entity_by_path(self._utils.normalize_path(entity['path']))
		else:
			self._logger.error("_read_entity_definition: bad data: " + self._utils.get_entity_desc(entity))
			return None
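	# Hedged illustration of the ordering produced by _order_vds below (VDS names
	# are made up): a VDS whose SQL reads only PDSs lands at level 0; a VDS
	# selecting from a level-0 VDS lands at level 1, and so on. _vds_hierarchy
	# then looks like:
	#   [[0, vds_raw_orders], [0, vds_raw_customers], [1, vds_orders_enriched]]
	# and _write_vds_hierarchy replays levels 0..(_hierarchy_depth - 1) in order,
	# so every dependency is created before its dependents.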
	# Process vds_list and save ordered list of VDSs into _vds_hierarchy. Recursive method.
	def _order_vds(self, processing_level=0):
		# Verify for the Max Hierarchy Depth
		if processing_level >= self._config.vds_max_hierarchy_depth:
			self._logger.debug("_order_vds: Finished processing with VDSs left to process: " + str(self._d.vds_list))
			return
		any_vds_leveled = False
		# Iterate through the remainder VDS in the list
		# Go with decreasing index so we can remove VDS from the list
		for i in range(len(self._d.vds_list) - 1, -1, -1):
			vds = self._d.vds_list[i]
			self._logger.debug("_order_vds: processing vds " + self._utils.get_entity_desc(vds))
			vds_hierarchy_level = processing_level
			any_dependency_unresolved = False
			sql_dependency_paths = self._get_vds_dependency_paths(vds)
			# Iterate through SQL dependencies to determine level of hierarchy for each dependency and the VDS
			for path in sql_dependency_paths:
				self._logger.debug("_order_vds: processing sql dependency " + path)
				# Validate the dependency against VDS and PDS
				sql_context = self._utils.get_sql_context(vds)
				dependency_vds = self._find_vds_by_path(self._utils.get_absolute_path(path, sql_context))
				if dependency_vds is None:
					dependency_pds = self._find_pds_by_path(self._utils.get_absolute_path(path, sql_context))
					if dependency_pds is None:
						# Dependency could not be resolved.
						self._logger.warn("_order_vds: giving up on ordering VDS '" + self._utils.normalize_path(vds['path']) + "'. Could not resolve dependency '" + self._utils.get_absolute_path(path, sql_context) + "'. Will try to process without ordering.")
						# Move VDS into unresolved list
						self._unresolved_vds.append(vds)
						self._d.vds_list.remove(vds)
						# Mark as do-not-process
						any_dependency_unresolved = True
						break
					else:
						# The dependency has been resolved as PDS, continue to the next dependency
						continue
				else:
					# Dependency was found as VDS
					dependency_hierarchy_level = self._find_vds_level_in_hierarchy(dependency_vds['id'])
					if dependency_hierarchy_level is None:
						# Dependency has not been processed yet, push this VDS to the next processing level
						vds_hierarchy_level = None
						break
					# Find the highest level of hierarchy among dependencies
					elif vds_hierarchy_level < dependency_hierarchy_level + 1:
						vds_hierarchy_level = dependency_hierarchy_level + 1
			if any_dependency_unresolved or vds_hierarchy_level is None:
				# Do not process this VDS at this recursion
				self._logger.debug("_order_vds: some dependencies cannot be validated for entity " + vds['id'] + " at processing level " + str(processing_level))
			else:
				# Add the current VDS to the vds_hierarchy_level
				self._vds_hierarchy.append([vds_hierarchy_level, vds])
				# Remove the current VDS from further processing
				self._d.vds_list.remove(vds)
				# Mark this hierarchy level as successful
				any_vds_leveled = True
				self._logger.debug("_order_vds: dependencies have been validated for entity " + vds['id'] + " for hierarchy level " + str(vds_hierarchy_level))
		# Are we done yet with recursion
		if not any_vds_leveled or len(self._d.vds_list) == 0:
			self._hierarchy_depth = processing_level + 1
			self._logger.debug("_order_vds: finished processing all VDS with hierarchy depth of: " + str(self._hierarchy_depth))
			return
		# Process the next Hierarchy Level recursively
		self._order_vds(processing_level + 1)

	def _get_vds_dependency_paths(self, vds):
		if self._is_source_ce() or not self._d.vds_parents:
			# CE does not support graph
			return parse_sql.tables_in_query(vds['sql'])
		else:
			for vds_entry in self._d.vds_parents:
				if vds_entry['path'] == vds['path']:
					return vds_entry['parents']

	def _is_source_ce(self):
		for item in self._d.dremio_get_config:
			if 'source' in item:
				for param in item['source']:
					if 'is_community_edition' in param:
						return eval(param['is_community_edition'])
		return False

	def _find_vds_by_path(self, path):
		# First, try finding in the VDS list from the source file
		for vds in self._d.vds_list:
			if path == self._utils.normalize_path(vds['path']):
				return vds
		# For dry run, check processed vds
		if self._config.dry_run:
			for vds in self._dry_run_processed_vds_list:
				if path == self._utils.normalize_path(vds['path']):
					return vds
		# Finally, try finding in the target environment
		entity = self._dremio_env.get_catalog_entity_by_path(path)
		# Make sure we get VDS and not folder/file
		if entity is not None and self._utils.is_vds(entity):
			return entity
		return None

	def _find_pds_by_path(self, path):
		# First, try finding in the PDS list from the source file
		for pds in self._d.pds_list:
			if path == self._utils.normalize_path(pds['path']):
				return pds
		# For dry run, check processed pds
		if self._config.dry_run:
			for pds in self._dry_run_processed_pds_list:
				if path == self._utils.normalize_path(pds['path']):
					return pds
		# Finally, try finding in the target environment
		entity = self._dremio_env.get_catalog_entity_by_path(path)
		# Make sure we get promoted PDS and not folder/file
		if entity is not None and self._utils.is_pds(entity):
			return entity
		return None

	def _find_vds_level_in_hierarchy(self, vds_id):
		for item in self._vds_hierarchy:
			if item[1]['id'] == vds_id:
				return item[0]
		return None

	def get_errors_count(self):
		return self._logger.errors_encountered

	def _write_wiki(self, wiki, process_mode):
		self._logger.debug("_write_wiki: processing wiki: " + str(wiki))
		new_wiki_text = wiki['text']
		wiki_path = wiki['path']
		# Check if the wiki already exists
		existing_wiki_entity = self._find_existing_dataset_by_path(self._utils.normalize_path(wiki_path))
		if existing_wiki_entity is None:
			self._logger.error("_write_wiki: Unable to resolve wiki's dataset for " + str(wiki))
			return None
		existing_wiki = self._dremio_env.get_catalog_wiki(existing_wiki_entity['id'])
		if existing_wiki is None:  # Need to create new entity
			if process_mode == 'update_only':
				self._logger.info("_write_wiki: Skipping wiki creation due to configuration wiki_process_mode. " + str(wiki))
				return None
			if self._config.dry_run:
				self._logger.warn("_write_wiki: Dry Run, NOT Creating wiki: " + str(wiki))
				return None
			new_wiki = {"text": new_wiki_text}
			new_wiki = self._dremio_env.update_wiki(existing_wiki_entity['id'], new_wiki, self._config.dry_run)
			if new_wiki is None:
				self._logger.error("_write_wiki: could not create " + str(wiki))
				return None
		else:  # Wiki already exists in the target environment
			if process_mode == 'create_only':
				self._logger.info("_write_wiki: Found existing wiki and wiki_process_mode is set to create_only. Skipping " + str(wiki))
				return None
			# Make sure there are changes to update as it will invalidate existing wiki data
			if new_wiki_text == existing_wiki['text']:
				# Nothing to do
				self._logger.debug("_write_wiki: No pending changes. Skipping " + str(wiki))
				return None
			if self._config.dry_run:
				self._logger.warn("_write_wiki: Dry Run, NOT Updating " + str(wiki))
				return False
			self._logger.debug("_write_wiki: Overwriting " + str(wiki))
			existing_wiki['text'] = new_wiki_text
			updated_wiki = self._dremio_env.update_wiki(existing_wiki_entity['id'], existing_wiki, self._config.dry_run)
			if updated_wiki is None:
				self._logger.error("_write_wiki: Error updating " + str(wiki))
				return False
		return True

	def _write_tags(self, tags, process_mode):
		self._logger.debug("_write_tags: processing tags: " + str(tags))
		new_tags = tags['tags']
		tags_path = tags['path']
		# Check if the tags already exist
		existing_tags_entity = self._find_existing_dataset_by_path(self._utils.normalize_path(tags_path))
		if existing_tags_entity is None:
			self._logger.error("_write_tags: Unable to resolve tags' dataset for " + str(tags))
			return None
		existing_tags = self._dremio_env.get_catalog_tags(existing_tags_entity['id'])
		if existing_tags is None:  # Need to create new entity
			if process_mode == 'update_only':
				self._logger.info("_write_tags: Skipping tags creation due to configuration tag_process_mode. " + str(tags))
				return None
			if self._config.dry_run:
				self._logger.warn("_write_tags: Dry Run, NOT Creating tags: " + str(tags))
				return None
			new_tags = {"tags": new_tags}
			new_tags = self._dremio_env.update_tag(existing_tags_entity['id'], new_tags, self._config.dry_run)
			if new_tags is None:
				self._logger.error("_write_tags: could not create " + str(tags))
				return None
		else:  # Tags already exist in the target environment
			if process_mode == 'create_only':
				self._logger.info("_write_tags: Found existing tags and tag_process_mode is set to create_only. Skipping " + str(tags))
				return None
			# Make sure there are changes to update as it will invalidate existing tags data
			if new_tags == existing_tags['tags']:
				# Nothing to do
				self._logger.debug("_write_tags: No pending changes. Skipping " + str(tags))
				return None
			if self._config.dry_run:
				self._logger.warn("_write_tags: Dry Run, NOT Updating " + str(tags))
				return False
			self._logger.debug("_write_tags: Overwriting " + str(tags))
			existing_tags['tags'] = new_tags
			updated_tags = self._dremio_env.update_tag(existing_tags_entity['id'], existing_tags, self._config.dry_run)
			if updated_tags is None:
				self._logger.error("_write_tags: Error updating " + str(tags))
				return False
		return True
class DremioClonerConfig():

	# Dremio Utils
	_utils = None
	_logger = None

	CMD_GET = 'get'
	CMD_PUT = 'put'
	CMD_CASCADE_ACL = 'cascade-acl'
	CMD_DESCRIBE_JOB = 'describe-job'
	CMD_REPORT_ACL = 'report-acl'
	CMD_REPORT_REFLECTIONS = 'report-reflections'
	CMD_DELETE = 'delete-beta'

	# Config json code
	cloner_conf_json = None
	# Command to execute: put, get, cp, report-acl, cascade-acl
	command = None
	dry_run = True
	# Source Dremio Environment definition
	source_endpoint = None
	source_verify_ssl = True
	source_username = None
	source_password = None
	source_filename = None
	source_directory = None
	source_ce = False
	source_graph_support = False
	target_ce = False
	job_sql = None
	# Target Dremio Environment definition
	target_endpoint = None
	target_verify_ssl = True
	target_username = None
	target_password = None
	target_filename = None
	target_directory = None
	target_file_or_dir_overwrite = False
	target_type = None
	container_filename = "___container.json"
	dremio_conf_filename = "___dremio_cloner_conf.json"
	# Options
	max_errors = 9999
	http_timeout = 10  # seconds
	# Logging options
	logging_level = logging.INFO
	logging_format = "%(levelname)s:%(asctime)s:%(message)s"
	logging_filename = None
	logging_verbose = False
	# Processing
	user_process_mode = None  # Flag to process User: process, skip
	group_process_mode = None  # Flag to process Group: process, skip
	space_filter = None  # Filter for Space entity type
	space_filter_names = []  # List of Spaces to process if not empty
	space_exclude_filter = None  # Exclusion Filter for Space entity type
	space_cascade_acl_origin_override_object = None  # An ACL from this object will be utilized instead of the Space ACL as an ACL to set inside all Folders and VDSs in the Space
	space_folder_filter = None  # Filter for Space Folder entity type
	space_folder_exclude_filter = None  # Exclusion Filter for Space Folder entity type
	space_folder_cascade_acl_origin_filter = None  # Filter for folders that will be used as ACL origins if specified
	space_process_mode = None  # Flag to process Space: process, skip, create_only, update_only, create_overwrite
	space_ignore_missing_acl_user = False  # Flag to write a Space if an ACL user is missing in the target Dremio environment
	space_ignore_missing_acl_group = False  # Flag to write a Space if an ACL group is missing in the target Dremio environment
	source_filter = None  # Filter for Source entity type
	source_filter_names = []  # List of Sources to process if not empty
	source_filter_types = []  # List of Source Types to process if not empty
	source_exclude_filter = None  # Exclusion Filter for Source entity type
	source_cascade_acl_origin_override_object = None  # An ACL from this object will be utilized instead of the Source ACL as an ACL to set inside all PDS in the Source
	source_folder_filter = None  # Filter for Source Folder entity type
	source_folder_exclude_filter = None  # Exclusion Filter for Source Folder entity type
	source_process_mode = None  # Flag to process Sources: process, skip, create_only, update_only, create_overwrite
	source_ignore_missing_acl_user = False  # Flag to write a Source if an ACL user is missing in the target Dremio environment
	source_ignore_missing_acl_group = False  # Flag to write a Source if an ACL group is missing in the target Dremio environment
	source_retry_timedout = False  # Flag to retry Sources that timed out
	folder_process_mode = None  # Flag to process Folder: process, skip, create_only, update_only, create_overwrite
	folder_ignore_missing_acl_user = False  # Flag to write a Folder if an ACL user is missing in the target Dremio environment
	folder_ignore_missing_acl_group = False  # Flag to write a Folder if an ACL group is missing in the target Dremio environment
	pds_list_useapi = False  # Using API for listing PDS may cause issues when the source is not available at the runtime
	pds_filter = None  # Filter for PDS
	pds_exclude_filter = None  # Exclusion Filter for PDS
	pds_process_mode = None  # Flag to process Source PDS: process, skip, promote
	pds_ignore_missing_acl_user = False  # Flag to write a Source PDS if an ACL user is missing in the target Dremio environment
	pds_ignore_missing_acl_group = False  # Flag to write a Source PDS if an ACL group is missing in the target Dremio environment
	vds_filter = None  # Filter for VDS
	vds_filter_tag = None  # Tag Filter for VDS
	vds_exclude_filter = None  # Exclusion Filter for VDS
	vds_process_mode = None  # Flag to process VDS: process, skip, create_only, update_only, create_overwrite
	vds_dependencies_process_mode = 'ignore'  # Flag to process VDS dependencies (VDS and PDS): ignore, get
	vds_ignore_missing_acl_user = False  # Flag to write a VDS if an ACL user is missing in the target Dremio environment
	vds_ignore_missing_acl_group = False  # Flag to write a VDS if an ACL group is missing in the target Dremio environment
	vds_max_hierarchy_depth = 100  # The max hierarchy depth to process
	reflection_process_mode = None  # Flag to process reflection: process, skip, create_only, update_only, create_overwrite
	reflection_filter_mode = None  # Flag to filter reflection: apply_vds_pds_filter
	reflection_refresh_mode = 'skip'  # Flag to refresh reflections: refresh, skip
	wlm_queue_process_mode = 'process'  # Flag to process WLM Queues: process, skip
	wlm_rule_process_mode = 'process'  # Flag to process WLM Rules: process, skip
	wiki_process_mode = 'process'  # Flag to process Wikis: process, skip, create_only, update_only, create_overwrite
	tag_process_mode = 'process'  # Flag to process Tags: process, skip
	home_process_mode = 'process'  # Flag to process Homes: process, skip
	vote_process_mode = 'process'  # Flag to process Votes: process, skip
	acl_transformation = {}  # Contains all ACL transformation definitions
	# Delete VDS List
	delete_vds = []  # List of VDS to delete from the target environment
	delete_folders = []  # List of Folders to delete from the target environment
	# Report options
	report_csv_delimiter = "\t"
	report_csv_newline = "\n"
	# Compiled filters
	_space_filter_re = None
	_space_exclude_filter_re = None
	_space_folder_filter_re = None
	_space_folder_exclude_filter_re = None
	_space_folder_cascade_acl_origin_filter_re = None
	_source_filter_re = None
	_source_exclude_filter_re = None
	_source_folder_filter_re = None
	_source_folder_exclude_filter_re = None
	_pds_filter_re = None
	_pds_exclude_filter_re = None
	_vds_filter_re = None
	_vds_exclude_filter_re = None

	def __init__(self, config_file_name):
		# Read configuration file
		f = open(config_file_name, "r", encoding="utf-8")
		self.cloner_conf_json = json.load(f)['dremio_cloner']
		f.close()
		for element in self.cloner_conf_json:
			if 'command' in element:
				self._process_command(element)
			elif 'source' in element:
				self._process_source(element)
			elif 'target' in element:
				self._process_target(element)
			elif 'options' in element:
				self._process_options(element)
		logging.basicConfig(format=self.logging_format, level=self.logging_level, filename=self.logging_filename)
		self._logger = DremioClonerLogger(self.max_errors, self.logging_verbose)
		self._validate_configuration()

	def _process_command(self, json_conf):
		self.command = json_conf['command']
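	# Hedged sketch of the configuration file shape parsed by __init__ above.
	# Keys mirror the handlers in _process_command/_process_source/
	# _process_target/_process_options; each item dict carries a single
	# parameter because the handlers use if/elif chains. Values are illustrative:
	#
	#   {"dremio_cloner": [
	#       {"command": "put"},
	#       {"source": [{"filename": "source_dump.json"}]},
	#       {"target": [{"endpoint": "http://localhost:9047"},
	#                   {"username": "admin"}, {"password": "..."},
	#                   {"verify_ssl": "False"}]},
	#       {"options": [{"dry_run": "True"},
	#                    {"space.process_mode": "create_overwrite"}]}
	#   ]}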
	def _process_target(self, json_conf):
		for item in json_conf['target']:
			if 'endpoint' in item:
				self.target_endpoint = item['endpoint']
			elif 'username' in item:
				self.target_username = item['username']
			elif 'password' in item:
				self.target_password = item['password']
			elif 'filename' in item:
				self.target_filename = item['filename']
			elif 'directory' in item:
				self.target_directory = item['directory']
			elif 'overwrite' in item:
				self.target_file_or_dir_overwrite = self._bool(item, 'overwrite')
			elif 'verify_ssl' in item:
				self.target_verify_ssl = self._bool(item, 'verify_ssl')
			elif 'is_community_edition' in item:
				self.target_ce = self._bool(item, 'is_community_edition')
			elif 'target.type' in item:
				self.target_type = self._str(item, 'target.type')

	def _process_source(self, json_conf):
		for item in json_conf['source']:
			if 'endpoint' in item:
				self.source_endpoint = item['endpoint']
			elif 'username' in item:
				self.source_username = item['username']
			elif 'password' in item:
				self.source_password = item['password']
			elif 'filename' in item:
				self.source_filename = item['filename']
			elif 'directory' in item:
				self.source_directory = item['directory']
			elif 'verify_ssl' in item:
				self.source_verify_ssl = self._bool(item, 'verify_ssl')
			elif 'is_community_edition' in item:
				self.source_ce = self._bool(item, 'is_community_edition')
			elif 'graph_api_support' in item:
				self.source_graph_support = self._bool(item, 'graph_api_support')
			elif 'job-sql' in item:
				self.job_sql = self._str(item, 'job-sql')

	def _process_options(self, json_conf):
		for item in json_conf['options']:
			if 'dry_run' in item:
				self.dry_run = self._bool(item, 'dry_run')
			elif 'max_errors' in item:
				self.max_errors = self._eval(item, 'max_errors')
			elif 'logging.level' in item:
				self.logging_level = self._eval(item, 'logging.level')
			elif 'logging.format' in item:
				self.logging_format = self._str(item, 'logging.format')
			elif 'logging.filename' in item:
				self.logging_filename = self._str(item, 'logging.filename')
			elif 'logging.verbose' in item:
				self.logging_verbose = self._bool(item, 'logging.verbose')
			elif 'http_timeout' in item:
				self.http_timeout = self._int(item, 'http_timeout')
			elif 'user.process_mode' in item:
				self.user_process_mode = self._str(item, 'user.process_mode')
			elif 'group.process_mode' in item:
				self.group_process_mode = self._str(item, 'group.process_mode')
			elif 'space.process_mode' in item:
				self.space_process_mode = self._str(item, 'space.process_mode')
			elif 'space.filter' in item:
				self.space_filter = self._str(item, 'space.filter')
				self._space_filter_re = self._compile_pattern(self.space_filter)
			elif 'space.filter.names' in item:
				self.space_filter_names = self._array(item, 'space.filter.names')
			elif 'space.exclude.filter' in item:
				self.space_exclude_filter = self._str(item, 'space.exclude.filter')
				self._space_exclude_filter_re = self._compile_pattern(self.space_exclude_filter)
			elif 'space.cascade-acl-origin.override-object' in item:
				self.space_cascade_acl_origin_override_object = self._str(item, 'space.cascade-acl-origin.override-object')
			elif 'space.folder.filter' in item:
				self.space_folder_filter = self._str(item, 'space.folder.filter')
				self._space_folder_filter_re = self._compile_pattern(self.space_folder_filter)
			elif 'space.folder.exclude.filter' in item:
				self.space_folder_exclude_filter = self._str(item, 'space.folder.exclude.filter')
				self._space_folder_exclude_filter_re = self._compile_pattern(self.space_folder_exclude_filter)
			elif 'space.folder.cascade-acl-origin.filter' in item:
				self.space_folder_cascade_acl_origin_filter = self._str(item, 'space.folder.cascade-acl-origin.filter')
				self._space_folder_cascade_acl_origin_filter_re = self._compile_pattern(self.space_folder_cascade_acl_origin_filter)
			elif 'space.ignore_missing_acl_user' in item:
				self.space_ignore_missing_acl_user = self._bool(item, 'space.ignore_missing_acl_user')
			elif 'space.ignore_missing_acl_group' in item:
				self.space_ignore_missing_acl_group = self._bool(item, 'space.ignore_missing_acl_group')
			elif 'source.process_mode' in item:
				self.source_process_mode = self._str(item, 'source.process_mode')
			elif 'source.filter.names' in item:
				self.source_filter_names = self._array(item, 'source.filter.names')
			elif 'source.filter.types' in item:
				self.source_filter_types = self._array(item, 'source.filter.types')
			elif 'source.filter' in item:
				self.source_filter = self._str(item, 'source.filter')
				self._source_filter_re = self._compile_pattern(self.source_filter)
			elif 'source.exclude.filter' in item:
				self.source_exclude_filter = self._str(item, 'source.exclude.filter')
				self._source_exclude_filter_re = self._compile_pattern(self.source_exclude_filter)
			elif 'source.folder.filter' in item:
				self.source_folder_filter = self._str(item, 'source.folder.filter')
				self._source_folder_filter_re = self._compile_pattern(self.source_folder_filter)
			elif 'source.cascade-acl-origin.override-object' in item:
				self.source_cascade_acl_origin_override_object = self._str(item, 'source.cascade-acl-origin.override-object')
			elif 'source.folder.exclude.filter' in item:
				self.source_folder_exclude_filter = self._str(item, 'source.folder.exclude.filter')
				self._source_folder_exclude_filter_re = self._compile_pattern(self.source_folder_exclude_filter)
			elif 'source.ignore_missing_acl_user' in item:
				self.source_ignore_missing_acl_user = self._bool(item, 'source.ignore_missing_acl_user')
			elif 'source.ignore_missing_acl_group' in item:
				self.source_ignore_missing_acl_group = self._bool(item, 'source.ignore_missing_acl_group')
			elif 'source.retry_timedout' in item:
				self.source_retry_timedout = self._bool(item, 'source.retry_timedout')
			elif 'folder.process_mode' in item:
				self.folder_process_mode = self._str(item, 'folder.process_mode')
			elif 'folder.ignore_missing_acl_user' in item:
				self.folder_ignore_missing_acl_user = self._bool(item, 'folder.ignore_missing_acl_user')
			elif 'folder.ignore_missing_acl_group' in item:
				self.folder_ignore_missing_acl_group = self._bool(item, 'folder.ignore_missing_acl_group')
			elif 'pds.process_mode' in item:
				self.pds_process_mode = self._str(item, 'pds.process_mode')
			elif 'pds.list.useapi' in item:
				self.pds_list_useapi = self._bool(item, 'pds.list.useapi')
			elif 'pds.filter' in item:
				self.pds_filter = self._str(item, 'pds.filter')
				self._pds_filter_re = self._compile_pattern(self.pds_filter)
			elif 'pds.exclude.filter' in item:
				self.pds_exclude_filter = self._str(item, 'pds.exclude.filter')
				self._pds_exclude_filter_re = self._compile_pattern(self.pds_exclude_filter)
			elif 'pds.ignore_missing_acl_user' in item:
				self.pds_ignore_missing_acl_user = self._bool(item, 'pds.ignore_missing_acl_user')
			elif 'pds.ignore_missing_acl_group' in item:
				self.pds_ignore_missing_acl_group = self._bool(item, 'pds.ignore_missing_acl_group')
			elif 'vds.process_mode' in item:
				self.vds_process_mode = self._str(item, 'vds.process_mode')
			elif 'vds.dependencies.process_mode' in item:
				self.vds_dependencies_process_mode = self._str(item, 'vds.dependencies.process_mode')
			elif 'vds.filter' in item:
				self.vds_filter = self._str(item, 'vds.filter')
				self._vds_filter_re = self._compile_pattern(self.vds_filter)
elif 'vds.filter.tag' in item: self.vds_filter_tag = self._str(item, 'vds.filter.tag') elif 'vds.exclude.filter' in item: self.vds_exclude_filter = self._str(item, 'vds.exclude.filter') self._vds_exclude_filter_re = self._compile_pattern( self.vds_exclude_filter) elif 'vds.ignore_missing_acl_user' in item: self.vds_ignore_missing_acl_user = self._bool( item, 'vds.ignore_missing_acl_user') elif 'vds.ignore_missing_acl_group' in item: self.vds_ignore_missing_acl_group = self._bool( item, 'vds.ignore_missing_acl_group') elif 'vds.max_hierarchy_depth' in item: self.vds_max_hierarchy_depth = self._int( item, 'vds.max_hierarchy_depth') # Reflection options elif 'reflection.process_mode' in item: self.reflection_process_mode = self._str( item, 'reflection.process_mode') elif 'reflection.filter_mode' in item: self.reflection_filter_mode = self._str( item, 'reflection.filter_mode') elif 'pds.reflection_refresh_mode' in item: self.reflection_refresh_mode = self._str( item, 'pds.reflection_refresh_mode') # Report Options elif 'report.csv.delimiter' in item: self.report_csv_delimiter = self._str(item, 'report.csv.delimiter') elif 'report.csv.newline' in item: self.report_csv_newline = self._str(item, 'report.csv.newline') # Misc options elif 'wlm.queue.process_mode' in item: self.wlm_queue_process_mode = self._str( item, 'wlm.queue.process_mode') elif 'wlm.rule.process_mode' in item: self.wlm_rule_process_mode = self._str( item, 'wlm.rule.process_mode') elif 'wiki.process_mode' in item: self.wiki_process_mode = self._str(item, 'wiki.process_mode') elif 'tag.process_mode' in item: self.tag_process_mode = self._str(item, 'tag.process_mode') elif 'home.process_mode' in item: self.home_process_mode = self._str(item, 'home.process_mode') elif 'vote.process_mode' in item: self.vote_process_mode = self._str(item, 'vote.process_mode') elif 'transformation' in item: acl_transformation_filename = self._str( item['transformation']['acl'], 'file') f = open(acl_transformation_filename, "r") self.acl_transformation = json.load(f)['acl-transformation'] f.close() elif 'vds.delete_list' in item: self.delete_vds = self._str_array(item, 'vds.delete_list') elif 'folder.delete_list' in item: self.delete_folders = self._str_array(item, 'folder.delete_list') def _validate_configuration(self): if (self.command is None): self._logger.fatal("missing 'command' entry.") elif self.command == self.CMD_GET and ( self.source_endpoint is None or self.source_username is None or self.source_password is None or (self.target_filename is None and self.target_directory is None)): self._logger.fatal("Invalid configuration for command 'get'.") elif self.command == self.CMD_PUT and ( (self.source_filename is None and self.source_directory is None) or self.target_endpoint is None or self.target_username is None or self.target_password is None): self._logger.fatal("Invalid configuration for command 'put'.") elif self.command == self.CMD_REPORT_ACL and ( self.source_endpoint is None or self.source_username is None or self.source_password is None or self.target_filename is None): self._logger.fatal( "Invalid configuration for command 'report-acl'.") if (self.command == self.CMD_PUT and (self.space_process_mode is None or (self.space_process_mode != 'skip' and self.space_process_mode != 'update_only' and self.space_process_mode != 'create_only' and self.space_process_mode != 'create_overwrite'))): self._logger.fatal("Invalid configuration for space.process_mode.") if (self.command == self.CMD_PUT and (self.source_process_mode is None or
(self.source_process_mode != 'skip' and self.source_process_mode != 'update_only' and self.source_process_mode != 'create_only' and self.source_process_mode != 'create_overwrite'))): self._logger.fatal( "Invalid configuration for source.process_mode.") if (self.command == self.CMD_PUT and (self.pds_process_mode is None or (self.pds_process_mode != 'skip' and self.pds_process_mode != 'promote'))): self._logger.fatal("Invalid configuration for pds.process_mode.") if (self.command == self.CMD_PUT and (self.vds_process_mode is None or (self.vds_process_mode != 'skip' and self.vds_process_mode != 'update_only' and self.vds_process_mode != 'create_only' and self.vds_process_mode != 'create_overwrite'))): self._logger.fatal("Invalid configuration for vds.process_mode.") # Make sure we do not overwrite JSON environment file if (self.command == self.CMD_GET and self.target_filename is not None and not self.target_file_or_dir_overwrite and os.path.isfile(self.target_filename)): self._logger.fatal("File " + str(self.target_filename) + " already exists. Cannot overwrite.") if (self.command == self.CMD_GET and self.target_directory is not None and not self.target_file_or_dir_overwrite and os.path.isdir(self.target_directory)): self._logger.fatal("Directory " + str(self.target_directory) + " already exists. Cannot overwrite.") if (self.command == self.CMD_REPORT_ACL and os.path.isfile(self.target_filename)): self._logger.fatal("File " + str(self.target_filename) + " already exists. Cannot overwrite.") def _bool(self, conf, param_name): if (param_name in conf): try: return eval(conf[param_name].title()) except NameError: self._logger.fatal("Invalid boolean value for parameter " + param_name) else: return None def _array(self, conf, param_name): if (param_name in conf): try: return conf[param_name] except: self._logger.fatal("Invalid array value for parameter " + param_name) else: return None def _int(self, conf, param_name): if (param_name in conf): try: return int(conf[param_name]) except: self._logger.fatal("Invalid integer value for parameter " + param_name) else: return None def _str(self, conf, param_name): if (param_name in conf and not conf[param_name] == ""): return conf[param_name] return None def _str_array(self, conf, param_name): if (param_name in conf and not conf[param_name] == ""): return conf[param_name] return None def _eval(self, conf, param_name): if (param_name in conf): try: return eval(conf[param_name]) except: self._logger.fatal("Invalid value for parameter " + param_name) else: return None def _compile_pattern(self, pattern): if pattern is None: return None return re.compile(fnmatch.translate(pattern))
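# --- Editor's illustration (not part of Dremio Cloner) ---
# A minimal sketch of how the filter options parsed above behave. _compile_pattern
# turns an fnmatch-style glob into an anchored regular expression, so filters match
# whole names, not substrings. The pattern and names below are hypothetical.
import fnmatch
import re

def compile_pattern(pattern):
    if pattern is None:
        return None
    return re.compile(fnmatch.translate(pattern))

space_filter_re = compile_pattern("Analytics*")
assert space_filter_re.match("Analytics") is not None
assert space_filter_re.match("AnalyticsSandbox") is not None
# The translated pattern is anchored, so a mid-string match is not enough:
assert space_filter_re.match("TeamAnalytics") is None

# A hypothetical 'options' element as the parser above would consume it;
# the keys are real option names from the parser, the values are examples only.
sample_options = [
    {"vds.process_mode": "create_overwrite"},
    {"vds.filter": "Analytics*"},
    {"vds.exclude.filter": "*_tmp"},
    {"vds.max_hierarchy_depth": "100"},
]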
class DremioReader: # Dremio Cloner Configuration, Utils, ... _config = None _utils = None _logger = None _filter = None # Dremio object pointing to the source Dremio environment _dremio_env = None # DremioData object containing data from Dremio source environment _d = DremioData() # Current top-level hierarchy context: Home, Space, Source _top_level_hierarchy_context = None def __init__(self, source_dremio, config): self._config = config self._dremio_env = source_dremio self._logger = DremioClonerLogger(self._config.max_errors, self._config.logging_verbose) self._utils = DremioClonerUtils(config) self._filter = DremioClonerFilter(config) # Read all data from the source Dremio environment # Return DremioData def read_dremio_environment(self): self._read_catalog() if not self._config.pds_list_useapi and self._filter.is_pds_in_scope(): self._read_all_pds() self._read_reflections() self._read_rules() self._read_queues() self._read_votes() # Make sure that all VDS dependencies are included as per configuration self._process_vds_dependencies() return self._d def _read_all_pds(self): if self._config.pds_list_useapi or not self._filter.is_pds_in_scope(): self._logger.info( "_read_all_pds: skipping PDS reading as per pds.filter configuration." ) else: pds_list = self._dremio_env.list_pds( self._d.sources, self._config.source_folder_filter, self._config.source_folder_exclude_filter, self._config.pds_filter, self._config.pds_exclude_filter, pds_error_list=self._d.pds_error_list) for pds in pds_list: if self._filter.match_pds_filter(pds): self._d.pds_list.append(pds) # Read Dremio catalog from source environment recursively going to containers and their children objects def _read_catalog(self): containers = self._dremio_env.list_catalog()['data'] for container in containers: self._logger.debug("_read_catalog: processing container " + self._utils.get_entity_desc(container)) self._process_container(container) # Identify a container and delegate processing def _process_container(self, container): self._logger.debug("_process_container: " + self._utils.get_entity_desc(container)) if container['containerType'] == "HOME": self._read_home(container) elif container['containerType'] == "SPACE": self._read_space(container) elif container['containerType'] == "SOURCE": self._read_source(container) else: self._logger.fatal("_process_container: unexpected entity type " + self._utils.get_entity_desc(container)) def _read_home(self, container): self._logger.debug("_read_home: processing container: " + self._utils.get_entity_desc(container)) if self._config.home_process_mode == 'process': self._top_level_hierarchy_context = "HOME" self._d.containers.append(container) entity = self._get_entity_definition_by_id(container) if entity is not None: self._logger.info("_read_home: " + self._utils.get_entity_desc(entity)) self._d.homes.append(entity) self._read_acl(entity) self._read_wiki(entity) self._read_space_children(entity) else: self._logger.error( "_read_home: error reading entity for container: " + self._utils.get_entity_desc(container)) else: self._logger.debug("_read_home: skipping due to job configuration") def _read_space(self, container): self._logger.debug("_read_space: processing container: " + self._utils.get_entity_desc(container)) self._top_level_hierarchy_context = "SPACE" if self._filter.match_space_filter(container): self._d.containers.append(container) entity = self._get_entity_definition_by_id(container) if entity is not None: self._logger.debug("_read_space: " + self._utils.get_entity_desc(container))
self._d.spaces.append(entity) self._read_acl(entity) self._read_wiki(entity) self._read_space_children(entity) else: self._logger.error( "_read_space: error reading entity for container: " + self._utils.get_entity_desc(container)) def _read_source(self, container): self._logger.debug("_read_source: processing container: " + self._utils.get_entity_desc(container)) if self._config.source_process_mode == 'process' or ( self._config.pds_process_mode == 'process' and self._config.pds_list_useapi): self._top_level_hierarchy_context = "SOURCE" if self._filter.match_source_filter(container): self._d.containers.append(container) entity = self._get_entity_definition_by_id(container) if entity is not None: # Re-validate the filter with entity since there is more details in entity if self._filter.match_source_filter(entity): self._logger.debug("_read_source: " + self._utils.get_entity_desc(entity)) self._d.sources.append(entity) self._read_acl(entity) self._read_wiki(entity) # Depending on the useapi flag, PDSs can be collected via INFORMATION_SCHEMA. See also DX16597 if self._config.pds_list_useapi: self._read_source_children(entity) else: self._logger.error( "_read_source: error reading entity for container: " + self._utils.get_entity_desc(container)) else: self._logger.debug( "_read_source: skipping due to job configuration") def _read_space_folder(self, folder): self._logger.debug("_read_space_folder: processing folder: " + self._utils.get_entity_desc(folder)) if self._top_level_hierarchy_context not in ["SPACE", "HOME"]: return entity = self._get_entity_definition_by_id(folder) if entity is None: self._logger.error( "_read_space_folder: error reading entity for folder: " + self._utils.get_entity_desc(folder)) return if self._top_level_hierarchy_context == "HOME" or self._filter.match_space_folder_filter( folder): self._logger.debug("_read_space_folder: " + self._utils.get_entity_desc(folder)) self._d.folders.append(entity) self._read_acl(entity) self._read_wiki(entity) # Validate all parent folders in the path have been saved already folder_path = entity['path'] for i in range(1, len(folder_path) - 1): folderSaved = False for item in self._d.folders: if item['path'][-1] == folder_path[i]: folderSaved = True if not folderSaved: parent_entity = self._get_entity_definition_by_path( folder_path[0:i + 1]) self._d.folders.append(parent_entity) self._read_space_children(entity) def _read_space_children(self, parent_entity): self._logger.debug("_read_space_children: processing parent_entity: " + self._utils.get_entity_desc(parent_entity)) if 'entityType' not in parent_entity: self._logger.error( "_read_space_children: bad data, skipping entity: " + self._utils.get_entity_desc(parent_entity)) return for child in parent_entity['children']: if child['type'] == "DATASET": self._read_dataset(child) elif child['type'] == "FILE": self._read_file(child) elif child['containerType'] == "FOLDER": self._read_space_folder(child) else: self._logger.error( "_read_space_children: not supported entity type " + child['type']) def _read_source_folder(self, folder): self._logger.debug("_read_source_folder: processing folder: " + self._utils.get_entity_desc(folder)) if self._top_level_hierarchy_context == "SOURCE" and self._filter.match_source_folder_filter( folder): entity = self._get_entity_definition_by_id(folder) if entity is not None: self._logger.debug("_read_source_folder: " + self._utils.get_entity_desc(folder)) self._read_source_children(entity) else: self._logger.error( "_read_source_folder: error reading entity for folder: " + self._utils.get_entity_desc(folder))
def _read_source_children(self, parent_entity): self._logger.debug( "_read_source_children: processing parent entity '" + self._utils.get_entity_desc(parent_entity) + "'") if 'entityType' not in parent_entity: self._logger.error( "_read_source_children: bad data, skipping entity: " + self._utils.get_entity_desc(parent_entity)) return for child in parent_entity['children']: if child['type'] == "DATASET": self._read_dataset(child) elif child['type'] == "FILE": self._read_file(child) elif child['containerType'] == "FOLDER": self._read_source_folder(child) else: self._logger.error( "_read_source_children: not supported entity type " + child['type']) def _read_dataset(self, dataset): self._logger.debug("_read_dataset: processing dataset: " + self._utils.get_entity_desc(dataset)) entity = self._get_entity_definition_by_id(dataset) if entity is not None: self._logger.debug("_read_dataset: " + dataset['datasetType'] + " : " + self._utils.get_entity_desc(dataset)) if dataset['datasetType'] == "PROMOTED" or dataset[ 'datasetType'] == "DIRECT": self._d.pds_list.append(entity) elif dataset['datasetType'] == "VIRTUAL": tags = self._dremio_env.get_catalog_tags(entity['id']) if self._filter.match_vds_filter(dataset, tags=tags): self._d.vds_list.append(entity) else: self._logger.error("_read_dataset: Unexpected dataset type " + dataset['datasetType'] + " for " + self._utils.get_entity_desc(dataset) + ".") self._read_acl(entity) self._read_wiki(entity) self._read_tags(entity) def _read_file(self, file_name): # do nothing return def _read_reflections(self): self._logger.debug("_read_reflections: starting") if self._config.reflection_process_mode == 'process' and not self._config.source_ce: reflections = self._dremio_env.list_reflections()['data'] for reflection in reflections: reflection_dataset = self._dremio_env.get_catalog_entity_by_id( reflection['datasetId']) if reflection_dataset is None: self._logger.debug( "_read_reflections: error processing reflection, cannot get path for dataset: " + reflection['datasetId']) continue reflection_path = reflection_dataset['path'] self._logger.debug( "_read_reflections: processing reflection " + reflection['datasetId'] + " path: " + str(reflection_path)) reflection["path"] = reflection_path self._d.reflections.append(reflection) # self._read_acl(reflection) # self._read_wiki(reflection) else: self._logger.debug( "_read_reflections: skipping reflections processing as per job configuration" ) # Note, tags are only available for datasets def _read_tags(self, entity): self._logger.debug("_read_tags: for entity " + self._utils.get_entity_desc(entity)) if self._config.tag_process_mode == 'process': tag = self._dremio_env.get_catalog_tags(entity['id']) if tag is not None: tag['entity_id'] = entity['id'] if entity['entityType'] == 'space' or entity[ 'entityType'] == 'source': tag['path'] = [entity['name']] else: tag['path'] = entity['path'] if tag not in self._d.tags: self._d.tags.append(tag) else: self._logger.debug( "_read_tags: skipping tags processing as per job configuration" ) def _read_wiki(self, entity): self._logger.debug("_read_wiki: for entity " + self._utils.get_entity_desc(entity)) if self._config.wiki_process_mode == 'process': wiki = self._dremio_env.get_catalog_wiki(entity['id']) if wiki is not None: wiki["entity_id"] = entity['id'] if entity['entityType'] == 'space' or entity[ 'entityType'] == 'source' or entity[ 'entityType'] == 'home': wiki['path'] = [entity['name']] else: wiki['path'] = entity['path']
if wiki not in self._d.wikis: self._d.wikis.append(wiki) else: self._logger.debug( "_read_wiki: skipping wiki processing as per job configuration" ) def _read_acl(self, entity): self._logger.debug("_read_acl: for entity " + self._utils.get_entity_desc(entity)) if 'accessControlList' in entity: acl = entity['accessControlList'] if 'users' in acl: for user in acl['users']: user_entity = self._dremio_env.get_user(user['id']) if user_entity is not None: if user_entity not in self._d.referenced_users: self._d.referenced_users.append(user_entity) if 'groups' in acl: for group in acl['groups']: group_entity = self._dremio_env.get_group(group['id']) if group_entity is not None: if group_entity not in self._d.referenced_groups: self._d.referenced_groups.append(group_entity) def _process_vds_dependencies(self): if self._config.vds_dependencies_process_mode == 'get': for vds in self._d.vds_list: self._discover_dependencies(vds) for vds in self._d.vds_list: self._populate_dependencies_graph(vds) # Discovers dependencies for the passed dataset and adds them to the self._d.vds_list def _discover_dependencies(self, dataset): self._logger.debug("_discover_dependencies: processing dataset: " + self._utils.get_entity_desc(dataset)) if dataset is not None: if 'type' not in dataset: self._logger.error( "_discover_dependencies: Expected Dataset Entity but got: " + self._utils.get_entity_desc(dataset)) return if dataset['type'] == 'PHYSICAL_DATASET': if dataset not in self._d.pds_list: self._d.pds_list.append(dataset) return elif dataset['type'] == 'VIRTUAL_DATASET': if dataset not in self._d.vds_list: self._d.vds_list.append(dataset) # Process VDS dependencies sql_dependency_paths = self._get_vds_dependency_paths(dataset) for dependency_path in sql_dependency_paths: dependency_path = self._utils.get_absolute_path( dependency_path, self._utils.get_sql_context(dataset)) entity = self._find_entity(dependency_path) if entity is not None: # Entity has already been read; move on to the next dependency continue dependency_dataset = self._dremio_env.get_catalog_entity_by_path( dependency_path) if dependency_dataset is None: self._logger.warn( "_discover_dependencies: unable to resolve dataset likely due to datasource availability: " + dependency_path) else: self._discover_dependencies(dependency_dataset) else: self._logger.error( "_discover_dependencies: Unknown Entity Type: " + dataset['type']) else: self._logger.error( "_discover_dependencies: Could not resolve dependency: None") def _populate_dependencies_graph(self, vds): self._logger.debug("_populate_dependencies_graph: processing vds: " + self._utils.get_entity_desc(vds)) vds_parent_list = self._get_vds_dependency_paths(vds) vds_parent_json = { 'id': vds['id'], 'path': vds['path'], 'parents': vds_parent_list } if not self._config.source_ce and self._config.source_graph_support: self._d.vds_parents.append(vds_parent_json) def _get_vds_dependency_paths(self, vds): self._logger.debug("_get_vds_dependency_paths: processing vds: " + self._utils.get_entity_desc(vds)) if self._config.source_ce or not self._config.source_graph_support: return parse_sql.tables_in_query(vds['sql']) else: graph = self._dremio_env.get_catalog_entity_graph_by_id(vds['id']) if graph is None: self._logger.warn( "Could not receive Graph via API. Try to set graph_api_support to False in the job configuration."
) return parse_sql.tables_in_query(vds['sql']) vds_parent_list = [] for parent in graph['parents']: vds_parent_list.append( self._utils.normalize_path(parent['path'])) return vds_parent_list def _find_entity(self, path): self._logger.debug("_find_entity: processing path: " + str(path)) for vds in self._d.vds_list: if self._utils.normalize_path(vds['path']) == path: return vds for pds in self._d.pds_list: if self._utils.normalize_path(pds['path']) == path: return pds # Helper method, used by most read* methods def _get_entity_definition_by_id(self, src): self._logger.debug("_get_entity_definition_by_id: processing src: " + self._utils.get_entity_desc(src)) if 'id' not in src: self._logger.error( "_read_entity_definition: bad data, skipping entity: " + self._utils.get_entity_desc(src)) return None else: entity = self._dremio_env.get_catalog_entity_by_id(src['id']) if entity is None: self._logger.error( "_read_entity_definition: cannot retrieve entity for id: " + src['id']) return entity def _get_entity_definition_by_path(self, path): self._logger.debug( "_get_entity_definition_by_path: processing path: " + str(path)) path = self._utils.normalize_path(path) entity = self._dremio_env.get_catalog_entity_by_path(path) if entity is None: self._logger.error( "_read_entity_definition: cannot retrieve entity for path: " + str(path)) return entity def _read_queues(self): self._logger.debug("read_queues: started") if self._config.wlm_queue_process_mode == 'process' and not self._config.source_ce: self._d.queues = self._dremio_env.list_queues()['data'] else: self._logger.debug( "_read_queues: skipping as per job configuration") def _read_rules(self): self._logger.debug("read_rules: started") if self._config.wlm_rule_process_mode == 'process' and not self._config.source_ce: self._d.rules = self._dremio_env.list_rules()['rules'] else: self._logger.debug("read_rules: skipping as per job configuration") def _read_votes(self): self._logger.debug("read_votes: started") if self._config.vote_process_mode == 'process' and not self._config.source_ce: self._d.votes = self._dremio_env.list_votes()['data'] else: self._logger.debug("read_votes: skipping as per job configuration") def get_errors_count(self): return self._logger.errors_encountered
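# --- Editor's illustration (not part of Dremio Cloner) ---
# A standalone sketch of the recursive dependency discovery performed by
# _discover_dependencies above: walk a VDS's parents, collecting PDSs and VDSs
# exactly once. The dict-based catalog and its 'parents' field stand in for the
# Dremio API calls and SQL parsing; all names are hypothetical.
CATALOG = {
    "space1/vds_a": {"type": "VIRTUAL_DATASET", "parents": ["space1/vds_b"]},
    "space1/vds_b": {"type": "VIRTUAL_DATASET", "parents": ["source1/pds_1"]},
    "source1/pds_1": {"type": "PHYSICAL_DATASET", "parents": []},
}

def discover(path, vds_list, pds_list):
    dataset = CATALOG.get(path)
    if dataset is None:
        return  # unresolved dependency; the reader above logs a warning here
    if dataset["type"] == "PHYSICAL_DATASET":
        if path not in pds_list:
            pds_list.append(path)
        return
    if path not in vds_list:
        vds_list.append(path)
        for parent in dataset["parents"]:
            discover(parent, vds_list, pds_list)

vds_list, pds_list = [], []
discover("space1/vds_a", vds_list, pds_list)
assert vds_list == ["space1/vds_a", "space1/vds_b"]
assert pds_list == ["source1/pds_1"]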
class DremioCascadeAcl: # Dremio Cloner Config, Logger, Utils _config = None _logger = None _utils = None _filter = None # Dremio Environment to write to _dremio_env = None # List of PDS for processing _pds_list = None def __init__(self, dremio, config): self._config = config self._dremio_env = dremio self._logger = DremioClonerLogger(self._config.max_errors, self._config.logging_verbose) self._utils = DremioClonerUtils(config) self._filter = DremioClonerFilter(config) def cascade_acl(self): if not self._config.pds_list_useapi: self._pds_list = self._dremio_env.list_pds( self._config.source_filter, self._config.source_exclude_filter, self._config.source_folder_filter, self._config.source_folder_exclude_filter, self._config.pds_filter, self._config.pds_exclude_filter) self._logger.info( "cascade_acl: Not using API for PDS retrieval. Filtered PDS are NOT reported in the log." ) containers = self._dremio_env.list_catalog()['data'] for container in containers: self._logger.debug("cascade_acl: processing container " + self._utils.get_entity_desc(container)) if container[ 'containerType'] == "SPACE" and self._filter.match_space_filter( container): self._process_space(container) elif container[ 'containerType'] == "SOURCE" and self._filter.match_source_filter( container): self._process_source(container) def _process_space(self, space): entity = self._get_entity_definition(space) if entity is None: self._logger.error( "_process_space: error reading entity for container: " + self._utils.get_entity_desc(space)) else: if self._config.space_cascade_acl_origin_override_object is None: # Use Space ACL as an 'origin' self._logger.info( "_process_space: SPACE: '" + str(space['path']) + "' will be used as an ACL Origin for its children FOLDERs and VDSs." ) acl = self._get_acl(entity) else: # Use ACL from a configured object acl_entity = self._dremio_env.get_catalog_entity_by_path( self._config.space_cascade_acl_origin_override_object) if acl_entity is None: self._logger.error( "_process_space: error reading origin entity for path: " + str(self._config. space_cascade_acl_origin_override_object)) return self._logger.info( "_process_space: SPACE: '" + str(space['path']) + "' Using override origin instead as an ACL Origin for its children FOLDERs and VDSs." ) acl = self._get_acl(acl_entity) self._process_space_children(entity, acl) def _process_source(self, source): entity = self._get_entity_definition(source) if entity is None: self._logger.error( "_process_source: error reading entity for container: " + self._utils.get_entity_desc(source)) else: if self._config.source_cascade_acl_origin_override_object is None: # Use Source ACL as an 'origin' self._logger.info( "_process_source: SOURCE: '" + str(source['path']) + "' will be used as an ACL Origin for its children PDSs.") acl = self._get_acl(entity) else: # Use ACL from a configured object acl_entity = self._dremio_env.get_catalog_entity_by_path( self._config.source_cascade_acl_origin_override_object) if acl_entity is None: self._logger.error( "_process_source: error reading origin entity for path: " + str(self._config. source_cascade_acl_origin_override_object)) return self._logger.info( "_process_source: SOURCE: '" + str(source['path']) + "' Using override origin instead as an ACL Origin for its children PDSs." 
) acl = self._get_acl(acl_entity) # Process PDSs if self._config.pds_list_useapi: self._process_source_children(entity, acl) else: for pds in self._pds_list: # Does the PDS belong to the current Source if pds['path'][0] == source['path'][0]: self._logger.debug("_process_source: pds: " + self._utils.get_entity_desc(pds)) if self._filter.match_pds_filter(pds): self._logger.debug( "_process_source_children: applying ACL to PDS: " + self._utils.get_entity_desc(pds)) self._apply_acl(pds, acl) def _process_source_children(self, parent_entity, acl): # This is a recursive function if 'children' not in parent_entity: return if 'entityType' not in parent_entity: self._logger.error( "_process_source_children: bad data, skipping entity: " + self._utils.get_entity_desc(parent_entity)) return self._logger.debug( "_process_source_children: processing parent entity '" + self._utils.get_entity_desc(parent_entity) + "'") for child in parent_entity['children']: child_entity = self._get_entity_definition(child) if child_entity is None: self._logger.error( "_process_source_children: error reading entity for: " + self._utils.get_entity_desc(child)) continue if child['type'] == "DATASET": if self._filter.match_pds_filter(child_entity): self._logger.debug( "_process_source_children: applying ACL to PDS: " + self._utils.get_entity_desc(child_entity)) self._apply_acl(child_entity, acl) else: self._logger.info( "_process_source_children: skipping PDS: " + str(child_entity['path']) + " as per filter configuration") elif child['type'] == "FILE": self._logger.info("_process_source_children: skipping FILE: " + self._utils.get_entity_desc(child_entity)) elif 'containerType' in child and child[ 'containerType'] == "FOLDER": if self._filter.match_source_folder_filter(child_entity): self._process_source_children(child_entity, acl) else: self._logger.info( "_process_source_children: skipping FOLDER: " + str(child_entity['path']) + " as per filter configuration") def _process_space_children(self, parent_entity, acl): # This is a recursive function if 'children' not in parent_entity: return if 'entityType' not in parent_entity: self._logger.error( "_process_space_children: bad data, skipping entity: " + self._utils.get_entity_desc(parent_entity)) return self._logger.debug( "_process_space_children: processing parent entity '" + self._utils.get_entity_desc(parent_entity) + "'") for child in parent_entity['children']: child_entity = self._get_entity_definition(child) if child_entity is None: self._logger.error( "_process_space_children: error reading entity for: " + self._utils.get_entity_desc(child)) continue if child['type'] == "DATASET": if self._filter.match_vds_filter(child_entity): self._logger.debug( "_process_space_children: applying ACL to VDS: " + self._utils.get_entity_desc(child_entity)) self._apply_acl(child_entity, acl) else: self._logger.info( "_process_space_children: skipping VDS: " + self._utils.get_entity_desc(child_entity)) elif child['containerType'] == "FOLDER": if self._filter.match_space_folder_filter(child_entity): if self._filter.match_space_folder_cascade_acl_origin_filter( child_entity): self._logger.info( "_process_space_children: FOLDER: " + str(child_entity['path']) + " will be used as an ACL Origin for its children.") self._process_space_children( child_entity, self._get_acl(child_entity)) else: self._logger.info( "_process_space_children: applying ACL to FOLDER: " + self._utils.get_entity_desc(child_entity)) self._apply_acl(child_entity, acl) self._process_space_children(child_entity, acl) else: self._logger.info(
"_process_space_children: skipping FOLDER: " + self._utils.get_entity_desc(child_entity)) self._process_space_children(child_entity, acl) def _get_entity_definition(self, src): if 'id' not in src: self._logger.error( "_read_entity_definition: bad data, skipping entity: " + self._utils.get_entity_desc(src)) return None else: entity = self._dremio_env.get_catalog_entity_by_id(src['id']) if entity is None: self._logger.error( "_read_entity_definition: cannot retrieve entity for id: " + src['id']) return entity def _get_acl(self, entity): if 'accessControlList' in entity: return entity['accessControlList'] else: self._logger.fatal("ACL is not defined for " + self._utils.get_entity_desc(entity)) return None def _apply_acl(self, entity, acl): # Clear the current ACL definition if 'accessControlList' not in entity: entity['accessControlList'] = {"version": "0"} if 'users' in entity['accessControlList']: entity['accessControlList'].pop('users') if 'groups' in entity['accessControlList']: entity['accessControlList'].pop('groups') # Apply ACL to entity if 'users' in acl: entity['accessControlList']['users'] = acl['users'] if 'groups' in acl: entity['accessControlList']['groups'] = acl['groups'] if self._config.dry_run: self._logger.warn("_apply_acl: Dry Run, NOT Updating entity: " + self._utils.get_entity_desc(entity)) return False self._logger.info("_apply_acl: updating entity: " + self._utils.get_entity_desc(entity)) updated_entity = self._dremio_env.update_catalog_entity( entity['id'], entity, self._config.dry_run) if updated_entity is None: self._logger.error("_apply_acl: Error updating entity: " + self._utils.get_entity_desc(entity)) return False return True def get_errors_count(self): return self._logger.errors_encountered
class DremioDescribeJob: # Dremio Cloner Configuration, Utils, ... _config = None _utils = None _logger = None # Dremio Environment to read from _dremio_env = None # Working lists _pds_list = [] _vds_list = [] _final_sql = "" def __init__(self, source_dremio, config): self._config = config self._dremio_env = source_dremio self._logger = DremioClonerLogger(self._config.max_errors, self._config.logging_verbose) self._utils = DremioClonerUtils(config) def describe_job_sql_dependencies(self): sql = self._config.job_sql self._process_sql(sql) # recursive function def _process_sql(self, sql, sql_context=None): if sql_context is not None: schema = self._utils.normalize_path(sql_context) + "/" else: schema = "" paths = parse_sql.tables_in_query(sql) # Collect all PDS and VDS with the entire dependency hierarchy for path in paths: self._discover_dependencies(schema + path) # Create SQL statements for all dependencies for pds in self._pds_list: self._process_pds(pds) for vds in self._vds_list: self._process_vds(vds) # Write file self._write_file() def _discover_dependencies(self, path): dataset = self._dremio_env.get_catalog_entity_by_path(path) if dataset is not None: if dataset['type'] == 'VIRTUAL_DATASET': self._vds_list.append(dataset) elif dataset['type'] == 'PHYSICAL_DATASET': self._pds_list.append(dataset) return else: self._logger.fatal( "_discover_dependencies: Unknown Entity Type: " + dataset['type']) else: self._logger.fatal( "_discover_dependencies: Could not resolve dependency: " + path) # Process recursive dependencies sql_dependency_paths = parse_sql.tables_in_query(dataset['sql']) for dataset_dependency_path in sql_dependency_paths: sql_context = self._utils.get_sql_context(dataset) self._discover_dependencies( self._utils.get_absolute_path(dataset_dependency_path, sql_context)) def _process_pds(self, pds): fields = pds['fields'] sql_context = self._utils.get_sql_context(pds) name = pds['path'][-1:][0] stmt = 'CREATE TABLE ' + name + ' (' for field in fields: stmt = stmt + field['name'] + ' ' + field['type']['name'] + ', ' stmt = stmt[:-2] + ')' comment = '-- PDS: ' + self._utils.get_absolute_path( pds['path'], sql_context) self._final_sql = self._final_sql + comment + "\n" + stmt + ";\n\n" def _process_vds(self, vds): fields = vds['fields'] sql_context = self._utils.get_sql_context(vds) name = vds['path'][-1:][0] vds_sql = vds['sql'] stmt = 'CREATE VIEW ' + name + ' AS ' + vds_sql comment = '-- VDS: ' + self._utils.get_absolute_path( vds['path'], sql_context) self._final_sql = self._final_sql + comment + "\n" + stmt + ";\n\n" def _write_file(self): f = open(self._config.target_filename, "w") f.write(self._final_sql) f.close() def get_errors_count(self): return self._logger.errors_encountered
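# --- Editor's illustration (not part of Dremio Cloner) ---
# What _process_pds above builds from a dataset's 'fields': one CREATE TABLE
# statement per PDS (and, analogously, CREATE VIEW ... AS <sql> per VDS).
# The table name, column names, and type names are examples only.
fields = [{"name": "txn_id", "type": {"name": "BIGINT"}},
          {"name": "amount", "type": {"name": "DOUBLE"}}]
stmt = "CREATE TABLE " + "transactions" + " ("
for field in fields:
    stmt = stmt + field["name"] + " " + field["type"]["name"] + ", "
stmt = stmt[:-2] + ")"  # trim the trailing ", " and close the column list
assert stmt == "CREATE TABLE transactions (txn_id BIGINT, amount DOUBLE)"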
class DremioReportReflections: # Dremio Cloner Configuration, Utils, ... _config = None _utils = None _logger = None # Dremio object pointing to the source Dremio environment _dremio_env = None # Misc _delimeter = None _newline = None _report_reflections = [] def __init__(self, source_dremio, config): self._config = config self._dremio_env = source_dremio self._delimeter = self._config.report_csv_delimiter self._newline = self._config.report_csv_newline self._logger = DremioClonerLogger(self._config.max_errors, self._config.logging_verbose) self._utils = DremioClonerUtils(config) def process_dremio_reflections(self): _query_reflections = self._retrieve_reflections() for query_reflection in _query_reflections: api_reflection = self._dremio_env.get_reflection( query_reflection['REFLECTION_ID']) dataset_entity = self._dremio_env.get_catalog_entity_by_path( self._normalize_dataset_path(query_reflection['DATASET'])) if dataset_entity is None: self._logger.error( "process_dremio_reflections: unable to retrieve dataset from API: " + query_reflection['DATASET']) source_pds_list = [] else: graph = self._dremio_env.get_catalog_entity_graph_by_id( dataset_entity['id']) if graph is None: self._logger.error( "process_dremio_reflections: unable to retrieve Graph for dataset from API: " + query_reflection['DATASET']) source_pds_list = [] elif len(graph['parents']) == 0: source_pds_list = [ self._utils.normalize_path(dataset_entity['path']) ] else: source_pds_list = list( self._get_dependency_pds_list(graph['parents'])) self._report_reflections.append({ 'ID': query_reflection['REFLECTION_ID'], 'NAME': query_reflection['NAME'], 'STATUS': query_reflection['STATUS'], 'TYPE': query_reflection['TYPE'], 'DATASET_PATH': query_reflection['DATASET'], 'MEASURES': query_reflection['measures'], 'DIMENSIONS': query_reflection['dimensions'], 'DISPLAY_COLUMNS': query_reflection['displayColumns'], 'SORT_COLUMNS': query_reflection['sortColumns'], 'PARTITION_COLUMNS': query_reflection['partitionColumns'], 'DISTRIBUTION_COLUMNS': query_reflection['distributionColumns'], 'EXTERNAL_REFLECTION': query_reflection['externalReflection'], 'NUM_FAILURES': query_reflection['NUM_FAILURES'], 'STATUS_EXTENDED': '' if api_reflection is None else api_reflection['status'], 'TOTAL_SIZE_BYTES': '' if api_reflection is None else api_reflection['totalSizeBytes'], 'ENABLED': '' if api_reflection is None else api_reflection['enabled'], 'PARTITION_DISTRIBUTION_STRATEGY': '' if api_reflection is None else api_reflection['partitionDistributionStrategy'], 'CREATED_AT': '' if api_reflection is None else api_reflection['createdAt'], 'UPDATED_AT': '' if api_reflection is None else api_reflection['updatedAt'], 'SOURCE_PDS_LIST': source_pds_list }) self.save_dremio_report_reflections() def _retrieve_reflections(self): sql = 'SELECT REFLECTION_ID, NAME, TYPE, STATUS, NUM_FAILURES, DATASET, sortColumns, partitionColumns, distributionColumns, dimensions, measures, displayColumns, externalReflection FROM SYS.REFLECTIONS ' jobid = self._dremio_env.submit_sql(sql) # Wait for the job to complete. Should only take a moment while True: job_info = self._dremio_env.get_job_info(jobid) if job_info is None: self._logger.fatal( "_retrieve_reflections: unexpected error. Cannot get a list of Reflections." ) self._logger.debug( "_retrieve_reflections: waiting for SQL query to finish. Job status: " + job_info["jobState"]) if job_info["jobState"] in ['CANCELED', 'FAILED']: self._logger.fatal( "_retrieve_reflections: unexpected error, SQL job failed. Cannot get a list of Reflections." )
if job_info["jobState"] == 'COMPLETED': break time.sleep(1) # Retrieve list of Reflections job_result = self._dremio_env.get_job_result(jobid) num_rows = int(job_result['rowCount']) if num_rows == 0: self._logger.warn("_retrieve_reflections: no Reflections found.") return [] self._logger.debug("_retrieve_reflections: processing " + str(num_rows) + " Reflections in batches of 100.") # Page through the results, 100 rows per page limit = 100 reflections = [] for i in range(0, int(num_rows / limit) + 1): self._logger.debug("_retrieve_reflections: processing batch " + str(i + 1)) job_result = self._dremio_env.get_job_result( jobid, limit * i, limit) for row in job_result['rows']: reflections.append(row) return reflections def _get_dependency_pds_list(self, parents): pds_set = set() for dataset in parents: if dataset['datasetType'] == 'PROMOTED' or dataset[ 'datasetType'] == 'DIRECT': pds_set.add(self._utils.normalize_path(dataset['path'])) elif dataset['datasetType'] == 'VIRTUAL': graph = self._dremio_env.get_catalog_entity_graph_by_id( dataset['id']) pds_set |= self._get_dependency_pds_list(graph['parents']) else: self._logger.fatal( "_get_dependency_pds_list: unexpected entity type " + dataset['datasetType']) return pds_set def _get_optimization_confidence_pct(self, reflection): if len(reflection['SOURCE_PDS_LIST']) == 0: return 0 max_match_count = 0 for r in self._report_reflections: # Match only with another reflection of the same TYPE (RAW/AGGREGATION) if r == reflection or r['TYPE'] != reflection['TYPE']: continue match_count = 0 for s in r['SOURCE_PDS_LIST']: if s in reflection['SOURCE_PDS_LIST']: match_count = match_count + 1 if match_count > max_match_count: max_match_count = match_count return max_match_count * 100 / len(reflection['SOURCE_PDS_LIST']) def save_dremio_report_reflections(self): self._f = open(self._config.target_filename, "w") self._f.write('REFLECTION_ID' + self._delimeter + 'NAME' + self._delimeter + 'STATUS' + self._delimeter + 'TYPE' + self._delimeter + 'OPTIMIZATION_CONFIDENCE_PCT' + self._delimeter + 'DATASET_PATH' + self._delimeter + 'MEASURES' + self._delimeter + 'DIMENSIONS' + self._delimeter + 'DISPLAY_COLUMNS' + self._delimeter + 'SORT_COLUMNS' + self._delimeter + 'PARTITION_COLUMNS' + self._delimeter + 'DISTRIBUTION_COLUMNS' + self._delimeter + 'EXTERNAL_REFLECTION' + self._delimeter + 'NUM_FAILURES' + self._delimeter + 'STATUS_EXTENDED' + self._delimeter + 'TOTAL_SIZE_BYTES' + self._delimeter + 'ENABLED' + self._delimeter + 'PARTITION_DISTRIBUTION_STRATEGY' + self._delimeter + 'CREATED_AT' + self._delimeter + 'UPDATED_AT' + self._delimeter + 'SOURCE_PDS_LIST' + self._newline) for reflection in self._report_reflections: line = str(reflection['ID']) + self._delimeter + \ str(reflection['NAME']) + self._delimeter + \ str(reflection['STATUS']) + self._delimeter + \ str(reflection['TYPE']) + self._delimeter + \ str(self._get_optimization_confidence_pct(reflection)) + self._delimeter + \ str(reflection['DATASET_PATH']) + self._delimeter + \ str(reflection['MEASURES']) + self._delimeter + \ str(reflection['DIMENSIONS']) + self._delimeter + \ str(reflection['DISPLAY_COLUMNS']) + self._delimeter + \ str(reflection['SORT_COLUMNS']) + self._delimeter + \ str(reflection['PARTITION_COLUMNS']) + self._delimeter + \ str(reflection['DISTRIBUTION_COLUMNS']) + self._delimeter + \ str(reflection['EXTERNAL_REFLECTION']) + self._delimeter + \ str(reflection['NUM_FAILURES']) + self._delimeter + \ str(reflection['STATUS_EXTENDED']) + self._delimeter + \
str(reflection['TOTAL_SIZE_BYTES']) + self._delimeter + \ str(reflection['ENABLED']) + self._delimeter + \ str(reflection['PARTITION_DISTRIBUTION_STRATEGY']) + self._delimeter + \ str(reflection['CREATED_AT']) + self._delimeter + \ str(reflection['UPDATED_AT']) + self._delimeter + \ str(reflection['SOURCE_PDS_LIST']) + self._newline self._f.write(line) self._f.close() def _normalize_dataset_path(self, path): path = path.split('.') normalized_path = "" for i in range(0, len(path)): if path[i].startswith('"') and path[i].endswith('"'): normalized_path = normalized_path + path[i][1:-1] else: normalized_path = normalized_path + path[i] if normalized_path.startswith('"') and normalized_path.endswith( '"'): normalized_path = normalized_path[1:-1] entity = self._dremio_env.get_catalog_entity_by_path( normalized_path, report_error=False) if entity is not None: normalized_path = normalized_path + '/' else: normalized_path = normalized_path + '.' return normalized_path[:-1]
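# --- Editor's illustration (not part of Dremio Cloner) ---
# The job-result paging used by _retrieve_reflections above: rows are pulled
# in (offset, limit) windows of 100 until rowCount is covered. The row count
# here is an example only.
num_rows = 250
limit = 100
offsets = [limit * i for i in range(0, int(num_rows / limit) + 1)]
assert offsets == [0, 100, 200]
# Each offset corresponds to one get_job_result(jobid, offset, limit) call;
# the final window simply returns fewer than 'limit' rows.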
class DremioDelete: # Dremio Cloner Config, Logger, Utils _config = None _logger = None _utils = None _filter = None # Dremio Environment to write to _dremio_env = None # List of PDS for processing _pds_list = None def __init__(self, dremio, config): self._config = config self._dremio_env = dremio self._logger = DremioClonerLogger(self._config.max_errors, self._config.logging_verbose) self._utils = DremioClonerUtils(config) self._filter = DremioClonerFilter(config) def delete(self): # Delete VDSs if (self._config.vds_process_mode != "delete"): self._logger.info( "delete: Not deleting VDS as per 'vds.process_mode' configuration" ) else: for vds_path in self._config.delete_vds: vds_json = self._dremio_env.get_catalog_entity_by_path( vds_path, report_error=True) if (vds_json is None): self._logger.error( "delete: unable to find VDS for path: '" + vds_path + "'") else: self._dremio_env.delete_catalog_entity( vds_json["id"], dry_run=self._config.dry_run, report_error=True) # Delete Folders if (self._config.folder_process_mode != "delete"): self._logger.info( "delete: Not deleting Folders as per 'folder.process_mode' configuration" ) else: for folder_path in self._config.delete_folders: folder_json = self._dremio_env.get_catalog_entity_by_path( folder_path, report_error=True) if (folder_json is None): self._logger.error( "delete: unable to find Folder for path: '" + folder_path + "'") else: self._dremio_env.delete_catalog_entity( folder_json["id"], dry_run=self._config.dry_run, report_error=True) def get_errors_count(self): return self._logger.errors_encountered
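# --- Editor's illustration (not part of Dremio Cloner) ---
# A hypothetical set of option elements that would drive DremioDelete above.
# The option keys (vds.process_mode, vds.delete_list, folder.process_mode,
# folder.delete_list) come from the config parser earlier in this document;
# the paths are examples only. Combined with dry_run, deletions are only
# previewed rather than executed.
delete_options = [
    {"vds.process_mode": "delete"},
    {"vds.delete_list": ["Analytics/obsolete_vds", "Analytics/tmp/scratch_vds"]},
    {"folder.process_mode": "delete"},
    {"folder.delete_list": ["Analytics/tmp"]},
]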
def __init__(self, config): self._config = config self._logger = DremioClonerLogger(self._config.max_errors, self._config.logging_verbose) self._utils = DremioClonerUtils(config)
class DremioClonerFilter(): _config = None _utils = None _logger = None def __init__(self, config): self._config = config self._logger = DremioClonerLogger(self._config.max_errors, self._config.logging_verbose) self._utils = DremioClonerUtils(config) def is_pds_in_scope(self): return self._config._source_filter_re is not None and \ self._config._pds_filter_re is not None and \ self._config.source_folder_exclude_filter != '*' and \ self._config.pds_exclude_filter != '*' and \ self._config.pds_process_mode == 'process' def _match_listed_space_names(self, container): if self._config.space_filter_names != [] and ( \ ('path' in container and container['path'][0] not in self._config.space_filter_names) \ or ('name' in container and container['name'] not in self._config.space_filter_names) ): return False return True def match_space_filter(self, container, loginfo=False): if not self._match_listed_space_names(container): return False # Filter by space name pattern if self._match_path(self._config._space_filter_re, self._config._space_exclude_filter_re, None, None, None, None, container): return True if loginfo: self._logger.info("match_space_filter: skipping SPACE " + (container['path'][0] if 'path' in container else container['name']) + " as per job configuration") return False def match_space_folder_filter(self, container, loginfo=True): if not self._match_listed_space_names(container): return False if self._match_path(self._config._space_filter_re, self._config._space_exclude_filter_re, self._config._space_folder_filter_re, self._config._space_folder_exclude_filter_re, None, None, container): return True if loginfo: self._logger.debug( "match_space_folder_filter: skipping SPACE FOLDER " + (container['path'][0] if 'path' in container else container['name']) + " as per job configuration") return False def match_space_folder_cascade_acl_origin_filter(self, container): if self._config.space_folder_cascade_acl_origin_filter is None: return False elif ( # Do not filter out folders in HOME hierarchies (container['path'][0][:1] == '@') or # Match both Folder filter and Space filter ((self._config._space_folder_cascade_acl_origin_filter_re.match( self._utils.normalize_path(container['path'][1:])) is not None) and self.match_space_filter(container))): return True else: return False def match_source_filter(self, container, loginfo=True): # First filter by source types if container['type'] != 'CONTAINER' and self._config.source_filter_types != [] and (container['entityType'] != 'source' or container['type'] not in self._config.source_filter_types): return False # Also filter by source names if container['type'] != 'CONTAINER' and self._config.source_filter_names != [] and (container['entityType'] != 'source' or container['name'] not in self._config.source_filter_names): return False # Finally filter by filter pattern if self._match_path(self._config._source_filter_re, self._config._source_exclude_filter_re, None, None, None, None, container): return True if loginfo: self._logger.debug("match_source_filter: skipping SOURCE " + (container['path'][0] if 'path' in container else container['name']) + " as per job configuration") return False def match_source_folder_filter(self, container, loginfo=True): if self._match_path(self._config._source_filter_re, self._config._source_exclude_filter_re, self._config._source_folder_filter_re, self._config._source_folder_exclude_filter_re, None, None, container): return True if loginfo: self._logger.debug( "match_source_folder_filter: skipping SOURCE FOLDER " +
(container['path'][0] if 'path' in container else container['name']) + " as per job configuration") return False def match_pds_filter(self, pds, loginfo=True): if self._match_path(self._config._source_filter_re, self._config._source_exclude_filter_re, self._config._source_folder_filter_re, self._config._source_folder_exclude_filter_re, self._config._pds_filter_re, self._config._pds_exclude_filter_re, pds): return True if loginfo: self._logger.debug("match_pds_filter: skipping PDS " + (pds['path'][-1] if 'path' in pds else pds['name']) + " as per job configuration") return False def match_vds_filter(self, vds, tags=None, loginfo=True): if not self._match_listed_space_names(vds): return False if self._match_path(self._config._space_filter_re, self._config._space_exclude_filter_re, self._config._space_folder_filter_re, self._config._space_folder_exclude_filter_re, self._config._vds_filter_re, self._config._vds_exclude_filter_re, vds): if self._config.vds_filter_tag is None or self._config.vds_filter_tag == "*": return True elif tags is not None and self._match_tag(tags): return True if loginfo: self._logger.debug("match_vds_filter: skipping VDS " + (vds['path'][-1] if 'path' in vds else vds['name']) + " as per job configuration") return False def _match_tag(self, tags): if 'tags' not in tags: return False for tag in tags['tags']: if tag == self._config.vds_filter_tag: return True return False def match_reflection_path(self, reflection_path, reflection_dataset): if 'type' in reflection_dataset and reflection_dataset[ 'type'] == 'VIRTUAL_DATASET': if self._match_hierarchy_path( self._config._space_filter_re, self._config._space_exclude_filter_re, self._config._space_folder_filter_re, self._config._space_folder_exclude_filter_re, self._config._vds_filter_re, self._config._vds_exclude_filter_re, reflection_path): return True else: if self._match_hierarchy_path( self._config._source_filter_re, self._config._source_exclude_filter_re, self._config._source_folder_filter_re, self._config._source_folder_exclude_filter_re, self._config._pds_filter_re, self._config._pds_exclude_filter_re, reflection_path): return True return False def _match_hierarchy_path(self, root_re, root_exclusion_re, folder_re, folder_exclusion_re, object_re, object_exclusion_re, hierarchy_path): if root_re is None: return False # Match root object (Space or Source) if root_re.match(hierarchy_path[0]) is None: return False if root_exclusion_re is not None and root_exclusion_re.match( hierarchy_path[0]) is not None: return False # Match object if object_re is not None and object_re.match( self._utils.normalize_path(hierarchy_path[-1])) is None: return False if object_exclusion_re is not None and object_exclusion_re.match( self._utils.normalize_path(hierarchy_path[1:])) is not None: return False # Match folders. Note: child folders do not need to be matched if their parent matches.
if folder_re is not None: folder_matched = False for i in range(len(hierarchy_path)): if folder_re.match( self._utils.normalize_path( hierarchy_path[1:len(hierarchy_path) - i])) is not None: folder_matched = True break if not folder_matched: return False if folder_exclusion_re is not None: folder_exclusion_matched = False for i in range(len(hierarchy_path)): if folder_exclusion_re.match( self._utils.normalize_path( hierarchy_path[1:len(hierarchy_path) - i])) is not None: folder_exclusion_matched = True break if folder_exclusion_matched: return False return True def _match_path(self, root_re, root_exclusion_re, folder_re, folder_exclusion_re, object_re, object_exclusion_re, entity): # If inclusion filter is not specified, nothing to process if root_re is None: return False # Validate parameters if ('containerType' in entity and entity['containerType'] == 'SPACE') or \ ('entityType' in entity and entity['entityType'] == 'space') or \ ('containerType' in entity and entity['containerType'] == 'SOURCE') or \ ('entityType' in entity and entity['entityType'] == 'source') : pass elif ('entityType' in entity and entity['entityType'] == 'folder') or \ ('containerType' in entity and entity['containerType'] == 'FOLDER'): if root_re is None: # Not validating folder_re as the call might be to validate if the folder is from the unfiltered space return False elif ('entityType' in entity and entity['entityType'] == 'dataset') or \ ('type' in entity and entity['type'] == 'DATASET'): if root_re is None: # Not validating folder_re, object_re as the call might be to validate if the folder is from the unfiltered space return False else: self._logger.fatal("_match_path: Unexpected Entity Type " + str(entity)) if 'path' not in entity: return root_exclusion_re is None or root_exclusion_re.match( entity['name']) is None else: path = entity['path'] # Match root object (Space or Source) if root_re.match(path[0]) is None: return False if root_exclusion_re is not None and root_exclusion_re.match( path[0]) is not None: return False # Match object if object_re is not None and object_re.match( self._utils.normalize_path(path[-1])) is None: return False if object_exclusion_re is not None and object_exclusion_re.match( self._utils.normalize_path(path[1:])) is not None: return False # Match folders. Note: child folders do not need to be matched if their parent matches. if folder_re is not None: folder_matched = False for i in range(len(path)): if folder_re.match( self._utils.normalize_path(path[1:len(path) - i])) is not None: folder_matched = True break if not folder_matched: return False if folder_exclusion_re is not None: folder_exclusion_matched = False for i in range(len(path)): if folder_exclusion_re.match( self._utils.normalize_path( path[1:len(path) - i])) is not None: folder_exclusion_matched = True break if folder_exclusion_matched: return False return True
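# --- Editor's illustration (not part of Dremio Cloner) ---
# The layered matching performed by _match_path/_match_hierarchy_path above:
# the root component must satisfy the Space/Source pattern, the leaf the
# object pattern, and some folder prefix the folder pattern. Assumes
# normalize_path joins components with '/'; the patterns and paths below are
# examples only.
import fnmatch
import re

def compile_pattern(pattern):
    return re.compile(fnmatch.translate(pattern)) if pattern is not None else None

def normalize_path(path):
    return "/".join(path)

space_re = compile_pattern("Analytics")
folder_re = compile_pattern("reports*")
object_re = compile_pattern("*")

def match(path):
    if space_re.match(path[0]) is None:      # root: the Space or Source name
        return False
    if object_re.match(path[-1]) is None:    # leaf: the dataset name
        return False
    # any folder prefix under the root may satisfy the folder pattern
    return any(folder_re.match(normalize_path(path[1:len(path) - i])) is not None
               for i in range(len(path)))

assert match(["Analytics", "reports", "daily", "sales_vds"]) is True
assert match(["Finance", "reports", "sales_vds"]) is False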