Example #1
	def __init__(self, target_dremio, dremio_data, config):
		self._config = config
		self._dremio_env = target_dremio
		self._d = dremio_data
		self._logger = DremioClonerLogger(self._config.max_errors, self._config.logging_verbose)
		self._filter = DremioClonerFilter(config)
		self._utils = DremioClonerUtils(config)
Example #2
    def __init__(self, config_file_name):
        # Read configuration file
        if sys.version_info.major > 2:
            f_open = lambda filename: open(filename, "r", encoding='utf-8')
        else:
            f_open = lambda filename: open(filename, "r")

        with f_open(config_file_name) as f:
            self.cloner_conf_json = json.load(f)['dremio_cloner']
        for element in self.cloner_conf_json:
            if 'command' in element:
                self._process_command(element)
            elif 'source' in element:
                self._process_source(element)
            elif 'target' in element:
                self._process_target(element)
            elif 'options' in element:
                self._process_options(element)
        logging.basicConfig(format=self.logging_format,
                            level=self.logging_level,
                            filename=self.logging_filename)
        self._logger = DremioClonerLogger(self.max_errors,
                                          self.logging_verbose)
        self._validate_configuration()
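
A minimal usage sketch, assuming this constructor belongs to the DremioClonerConfig class shown in Example #7 and that dremio_cloner.conf (a hypothetical file name) is a JSON file with a top-level "dremio_cloner" element:

    config = DremioClonerConfig("dremio_cloner.conf")
    print(config.command, config.dry_run)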
Example #3
 def __init__(self, source_dremio, config):
     self._config = config
     self._dremio_env = source_dremio
     self._logger = DremioClonerLogger(self._config.max_errors,
                                       self._config.logging_verbose)
     self._utils = DremioClonerUtils(config)
     self._filter = DremioClonerFilter(config)
Example #4
 def __init__(self, source_dremio, config):
     self._config = config
     self._dremio_env = source_dremio
     self._delimeter = self._config.report_csv_delimiter
     self._newline = self._config.report_csv_newline
     self._logger = DremioClonerLogger(self._config.max_errors,
                                       self._config.logging_verbose)
     self._utils = DremioClonerUtils(config)
Example #5
	def __init__(self, config_file_name):
		# Read configuration file
		with open(config_file_name, "r") as f:
			self.cloner_conf_json = json.load(f)['dremio_cloner']
		for element in self.cloner_conf_json:
			if 'command' in element:
				self._process_command(element)
			elif 'source' in element:
				self._process_source(element)
			elif 'target' in element:
				self._process_target(element)
			elif 'options' in element:
				self._process_options(element)
		logging.basicConfig(format=self.logging_format, level=self.logging_level, filename=self.logging_filename)
		self._logger = DremioClonerLogger(self.max_errors, self.logging_verbose)
		self._validate_configuration()
Example #6
class DremioWriter:

	# Dremio Cloner Config, Utils, ...
	_config = None
	_utils = None
	_logger = None
	_filter = None

	# Dremio Environment to write to
	_dremio_env = None

	# Dremio Data to write
	_d = None

	# VDS list grouped by hierarchy
	_vds_hierarchy = []
	_hierarchy_depth = 0
	_unresolved_vds = []

	# Referenced Users and Groups in the target environment
	_target_dremio_users = []
	_target_dremio_groups = []

	# Resolved Datasets for Reflections
	_existing_reflections = list()

	# Dry run collections
	_dry_run_processed_vds_list = []
	_dry_run_processed_pds_list = []

	def __init__(self, target_dremio, dremio_data, config):
		self._config = config
		self._dremio_env = target_dremio
		self._d = dremio_data
		self._logger = DremioClonerLogger(self._config.max_errors, self._config.logging_verbose)
		self._filter = DremioClonerFilter(config)
		self._utils = DremioClonerUtils(config)

	def write_dremio_environment(self):
		self._retrieve_users_groups()
		if self._config.acl_transformation != {} and self._d.referenced_users == [] and self._d.referenced_groups == []:
			self._logger.warn("ACL Transformation has been defined while Referenced Users and Referenced Groups are not present in the Source Dremio Data.")

		if self._config.reflection_process_mode != 'skip':
			self._existing_reflections = self._dremio_env.list_reflections()['data']
		if self._config.source_process_mode == 'skip':
			self._logger.info("write_dremio_environment: Skipping source processing due to configuration source.process_mode=skip.")
		else:
			for source in self._d.sources:
				self._write_source(source, self._config.source_process_mode, self._config.source_ignore_missing_acl_user, self._config.source_ignore_missing_acl_group)
		if self._config.pds_process_mode == 'skip':
			self._logger.info("write_dremio_environment: Skipping source PDS processing due to configuration source.pds.process_mode=skip.")
		else:
			for pds in self._d.pds_list:
				self._write_pds(pds, self._config.pds_process_mode, self._config.pds_ignore_missing_acl_user, self._config.pds_ignore_missing_acl_group)
		if self._config.space_process_mode == 'skip':
			self._logger.info("write_dremio_environment: Skipping space processing due to configuration space.process_mode=skip.")
		else:
			for space in self._d.spaces:
				self._write_space(space, self._config.space_process_mode, self._config.space_ignore_missing_acl_user, self._config.space_ignore_missing_acl_group)
		if self._config.folder_process_mode == 'skip':
			self._logger.info("write_dremio_environment: Skipping folder processing due to configuration folder.process_mode=skip.")
		else:
			for folder in self._d.folders:
				self._write_folder(folder, self._config.folder_process_mode, self._config.folder_ignore_missing_acl_user, self._config.folder_ignore_missing_acl_group)
		if self._config.vds_process_mode == 'skip':
			self._logger.info("write_dremio_environment: Skipping VDS processing due to configuration vds.process_mode=skip.")
		else:
			self._order_vds(0)
			self._write_vds_hierarchy()
			self._write_remainder_vds()
		if self._config.reflection_process_mode == 'skip':
			self._logger.info("write_dremio_environment: Skipping reflection processing due to configuration reflection.process_mode=skip.")
		else:
			for reflection in self._d.reflections:
				self._write_reflection(reflection, self._config.reflection_process_mode)
		if self._config.reflection_refresh_mode != 'refresh':
			self._logger.info("write_dremio_environment: Skipping reflection refresh due to configuration reflection.refresh_mode.")
		else:
			for pds in self._d.pds_list:
				self._dremio_env.refresh_reflections_by_pds_path(self._utils.normalize_path(pds['path']), self._config.dry_run)
		if self._config.wiki_process_mode == 'skip':
			self._logger.info("write_dremio_environment: Skipping wiki processing due to configuration wiki.process_mode=skip.")
		else:
			for wiki in self._d.wikis:
				self._write_wiki(wiki, self._config.wiki_process_mode)
		if self._config.tag_process_mode == 'skip':
			self._logger.info("write_dremio_environment: Skipping tag processing due to configuration tag.process_mode=skip.")
		else:
			for tags in self._d.tags:
				self._write_tags(tags, self._config.tag_process_mode)

	def _write_space(self, entity, process_mode, ignore_missing_acl_user_flag, ignore_missing_acl_group_flag):
		if self._filter.match_space_filter(entity):
			self._logger.debug("_write_space: processing entity: " + self._utils.get_entity_desc(entity))
			return self._write_entity(entity, process_mode, ignore_missing_acl_user_flag, ignore_missing_acl_group_flag)
		else:
			self._logger.debug("_write_space: skipping entity: " + self._utils.get_entity_desc(entity))
			return None

	def _write_source(self, entity, process_mode, ignore_missing_acl_user_flag, ignore_missing_acl_group_flag):
		if self._filter.match_source_filter(entity):
			self._logger.debug("_write_source: processing entity: " + self._utils.get_entity_desc(entity))
			return self._write_entity(entity, process_mode, ignore_missing_acl_user_flag, ignore_missing_acl_group_flag)
		else:
			self._logger.debug("_write_source: skipping entity: " + self._utils.get_entity_desc(entity))
			return None

	def _write_folder(self, entity, process_mode, ignore_missing_acl_user_flag, ignore_missing_acl_group_flag):
		# Drop ACL for HOME folders
		if entity['path'][0][:1] == '@' and 'accessControlList' in entity:
			entity.pop("accessControlList")
		# Do not apply space.folder.filter to Home folders
		if entity['path'][0][:1] == '@' or self._filter.match_space_folder_filter(entity):
			self._logger.debug("_write_folder: processing entity: " + self._utils.get_entity_desc(entity))
			return self._write_entity(entity, process_mode, ignore_missing_acl_user_flag, ignore_missing_acl_group_flag)
		else:
			self._logger.debug("_write_folder: skipping entity: " + self._utils.get_entity_desc(entity))
			return None

	def _retrieve_users_groups(self):
		for user in self._d.referenced_users:
			target_user = self._dremio_env.get_user_by_name(user['name'])
			if target_user is not None:
				self._target_dremio_users.append(target_user)
			else:
				self._logger.error("_retrieve_users_groups: Unable to resolve user in target Dremio environment: " + str(user['name']))
		for group in self._d.referenced_groups:
			target_group = self._dremio_env.get_group_by_name(group['name'])
			if target_group is not None:
				self._target_dremio_groups.append(target_group)
			else:
				self._logger.error("_retrieve_users_groups: Unable to resolve group in target Dremio environment: " + str(group['name']))
		# Retrieve acl transformation target users and groups
		for item in self._config.acl_transformation:
			if 'user' in item['target']:
				user = self._dremio_env.get_user_by_name(item['target']['user'])
				if user is not None:
					# don't worry about duplicates
					self._target_dremio_users.append(user)
				else:
					self._logger.error("_retrieve_users_groups: Unable to resolve ACL_TRANSFORMATION user in target Dremio environment: " + str(item['target']['user']))
			if 'group' in item['target']:
				group = self._dremio_env.get_group_by_name(item['target']['group'])
				if group is not None:
					# don't worry about duplicates
					self._target_dremio_groups.append(group)
				else:
					self._logger.error("_retrieve_users_groups: Unable to resolve ACL_TRANSFORMATION group in target Dremio environment: " + str(item['target']['group']))

	def _write_vds_hierarchy(self):
		for level in range(0, self._hierarchy_depth):
			for item in self._vds_hierarchy:
				if item[0] == level:
					vds = item[1]
					if self._filter.match_vds_filter(vds):
						self._logger.debug("_write_vds_hierarchy: writing vds: " + self._utils.get_entity_desc(vds))
						self._write_entity(vds, self._config.vds_process_mode, self._config.vds_ignore_missing_acl_user, self._config.vds_ignore_missing_acl_group)

	def _write_remainder_vds(self):
		if not self._d.vds_list and not self._unresolved_vds:
			return
		else:
			self._logger.info("_write_remainder_vds: Attempt processing VDSs that failed ordering.")
		# Attempt up to vds_max_hierarchy_depth passes over the remaining VDSs
		for h in range(1, self._config.vds_max_hierarchy_depth):
			# These are VDSs that have all dependencies validated but could not be placed in the hierarchy
			# Go with decreasing index so we can remove VDS from the list
			for i in range(len(self._d.vds_list) - 1, -1, -1):
				vds = self._d.vds_list[i]
				if self._filter.match_vds_filter(vds):
					self._logger.debug("_write_remainder_vds: writing vds: " + self._utils.get_entity_desc(vds))
					if self._write_entity(vds, self._config.vds_process_mode, self._config.vds_ignore_missing_acl_user, self._config.vds_ignore_missing_acl_group, False):
						self._d.vds_list.remove(vds)
				else:
					self._d.vds_list.remove(vds)
			# Iterate through the remainder of unresolved VDS in the list
			# Go with decreasing index so we can remove VDS from the list
			for i in range(len(self._unresolved_vds) - 1, -1, -1):
				vds = self._unresolved_vds[i]
				if self._filter.match_vds_filter(vds):
					self._logger.debug("_write_remainder_vds: writing vds: " + self._utils.get_entity_desc(vds))
					if self._write_entity(vds, self._config.vds_process_mode, self._config.vds_ignore_missing_acl_user, self._config.vds_ignore_missing_acl_group, False):
						self._unresolved_vds.remove(vds)
				else:
					self._unresolved_vds.remove(vds)
		if self._d.vds_list != [] or self._unresolved_vds != []:
			self._logger.warn('_write_remainder_vds: After attempting to process VDSs that failed ordering, the following VDSs still failed. Set log level to DEBUG and see prior error messages for more information.')
			for vds in self._d.vds_list:
				self._logger.error("Failed VDS: " + str(vds['path']))
			for vds in self._unresolved_vds:
				self._logger.error("Failed VDS: " + str(vds['path']))
		else:
			self._logger.warn("_write_remainder_vds: Finished processing VDSs that failed ordering. All VDSs have been successfuly processed.")


	def _write_user(self):
		if self._config.user_process_mode == 'skip':
			self._logger.info("_write_user: Skipping user processing due to configuration user.process_mode=skip.")
			return True
		self._logger.error("_write_user: Cannot create users. API is not implemented.")

	def _write_entity(self, entity, process_mode, ignore_missing_acl_user_flag, ignore_missing_acl_group_flag, report_error = True):
		self._logger.debug("_write_entity: processing entity: " + self._utils.get_entity_desc(entity))
		# Clean up the definition
		if 'id' in entity:
			entity.pop("id")
		if 'tag' in entity:
			entity.pop("tag")
		if 'children' in entity:
			entity.pop("children")
		if 'createdAt' in entity:
			entity.pop("createdAt")
		# Process ACL as needed
		if not self._process_acl(entity, ignore_missing_acl_user_flag, ignore_missing_acl_group_flag):
			# Skip this entity due to ACL processing errors
			self._logger.info("_write_entity: Skipping entity due to ignore_missing_acl_user_flag, ignore_missing_acl_group_flag: " + self._utils.get_entity_desc(entity))
			return False
		# Check if the entity already exists
		existing_entity = self._read_entity_definition(entity)
		# Ensure we have not received FOLDER instead of DATASET. See DX-16666
		if existing_entity is not None and 'entityType' in entity and \
				'entityType' in existing_entity and entity['entityType'] != existing_entity['entityType']:
			existing_entity = None
		if existing_entity is None:  # Need to create new entity
			if process_mode == 'update_only':
				self._logger.info("_write_entity: Skipping entity creation due to configuration process_mode=update_only. " + self._utils.get_entity_desc(entity))
				return True
			# Reset version for proper concurrency
			if 'accessControlList' in entity:
				entity['accessControlList']['version'] = "0"
			if self._config.dry_run:
				self._logger.warn("_write_entity: Dry Run, NOT Creating entity: " + self._utils.get_entity_desc(entity))
				# For dry run, keep it in a separate collection to suppress errors
				if self._utils.is_vds(entity):
					self._dry_run_processed_vds_list.append(entity)
				return False
			# Note for the CE target env, the ACL should have been popped out by _process_acl
			new_entity = self._dremio_env.create_catalog_entity(entity, self._config.dry_run)
			if new_entity is None:
				if report_error:
					self._logger.error("_write_entity: could not create entity: " + self._utils.get_entity_desc(entity))
				else:
					self._logger.debug("_write_entity: could not create entity: " + self._utils.get_entity_desc(entity))
				return False
		else:  # Entity already exists in the target environment
			if process_mode == 'create_only':
				self._logger.info("_write_entity: Found existing entity and process_mode is set to create_only. Skipping entity: " + self._utils.get_entity_desc(entity))
				return True
			self._logger.debug("_write_entity: Overwriting entity definition as per process_mode configuration : " + self._utils.get_entity_desc(entity))
			# Update entity definition with data from entity existing in the target environment
			entity['id'] = existing_entity['id']
			entity['tag'] = existing_entity['tag']  # Tag from the entity existing in the target environment required for proper concurrency control
			# Update ACL version for proper concurrency control, but do not use ACL if not really needed as HOME folders are not allowed to have ACL
			if ('path' in entity and entity['path'][0][:1] == '@') or ('name' in entity and entity['name'][:1] == '@'): 
				if 'accessControlList' in entity:
					entity.pop('accessControlList')
			else:
				# Note for the CE target env, the ACL should have been popped out by _process_acl
				if not self._config.target_ce:
					if 'accessControlList' not in entity:
						entity['accessControlList'] = {"version": "0"}
					# API changed behavior around version 4 and may not return version attribute for ACL.
					if 'accessControlList' in existing_entity and 'version' in existing_entity['accessControlList']:
						entity['accessControlList']['version'] = existing_entity['accessControlList']['version']
			if self._config.dry_run:
				self._logger.warn("_write_entity: Dry Run, NOT Updating entity: " + self._utils.get_entity_desc(entity))
				return False
			updated_entity = self._dremio_env.update_catalog_entity(entity['id'], entity, self._config.dry_run, report_error)
			if updated_entity is None:
				if report_error:
					self._logger.error("_write_entity: Error updating entity: " + self._utils.get_entity_desc(entity))
				else:
					self._logger.debug("_write_entity: Error updating entity: " + self._utils.get_entity_desc(entity))
				return False
		return True

	def _write_pds(self, entity, process_mode, ignore_missing_acl_user_flag, ignore_missing_acl_group_flag):
		self._logger.debug("_write_pds: processing entity: " + self._utils.get_entity_desc(entity))
		if self._filter.match_pds_filter(entity):
			existing_entity = self._read_entity_definition(entity)
			if existing_entity is None:
				self._logger.error("_write_pds: Cannot find existing entity for PDS Entity. Either Folder, File, or PDS must exist prior to promoting or updating PDS. Source PDS: " + self._utils.get_entity_desc(entity))
				return False	
			# Check if PDS needs to be promoted first
			if 'type' not in existing_entity or existing_entity['type'] != 'PHYSICAL_DATASET':
				self._promote_pds(entity, ignore_missing_acl_user_flag, ignore_missing_acl_group_flag)
			# Update PDS now
			self._logger.debug("_write_pds: writing pds: " + self._utils.get_entity_desc(entity))
			self._write_entity(entity, process_mode, ignore_missing_acl_user_flag, ignore_missing_acl_group_flag)
		else:
			return None

	def _promote_pds(self, entity, ignore_missing_acl_user_flag, ignore_missing_acl_group_flag):
		self._logger.debug("_promote_pds: processing entity: " + self._utils.get_entity_desc(entity))
		# Clean up the definition
		if 'id' in entity:
			entity.pop("id")
		if 'tag' in entity:
			entity.pop("tag")
		if 'children' in entity:
			entity.pop("children")
		if 'createdAt' in entity:
			entity.pop("createdAt")
		# Process ACL as needed
		if not self._process_acl(entity, ignore_missing_acl_user_flag, ignore_missing_acl_group_flag):
			# Skip this entity due to ACL processing errors
			self._logger.error("_promote_pds: Skipping PDS due to an error in ACL processing: " + self._utils.get_entity_desc(entity))
			return False
		# Read existing folder or file entity
		fs_entity = self._read_entity_definition(entity)
		if fs_entity is None:
			self._logger.error("_promote_pds: Skipping PDS. Cannot find folder or file for PDS Entity: " + self._utils.get_entity_desc(entity))
			return False
		# Add Folder ID to PDS Entity	
		entity['id'] = fs_entity['id']
		if 'accessControlList' in entity: 
			entity.pop('accessControlList')
		if self._config.dry_run:
			self._logger.warn("_promote_pds: Dry Run, NOT promoting pds: " + self._utils.get_entity_desc(entity))
			return True
		self._logger.debug("_promote_pds: promoting pds: " + self._utils.get_entity_desc(entity))
		new_pds_entity = self._dremio_env.promote_pds(entity, self._config.dry_run)
		if new_pds_entity is None:
			self._logger.error("_promote_pds: Error promoting PDS: " + self._utils.get_entity_desc(entity))
			return False
		return True


	def _write_reflection(self, reflection, process_mode):
		self._logger.debug("_write_reflection: processing reflection: " + self._utils.get_entity_desc(reflection))
		# Clean up the definition
		if 'id' in reflection:
			reflection.pop("id")
		if 'tag' in reflection:
			reflection.pop("tag")
		if 'createdAt' in reflection:
			reflection.pop("createdAt")
		if 'updatedAt' in reflection:
			reflection.pop("updatedAt")
		if 'currentSizeBytes' in reflection:
			reflection.pop("currentSizeBytes")
		if 'totalSizeBytes' in reflection:
			reflection.pop("totalSizeBytes")
		if 'status' in reflection:
			reflection.pop("status")
		reflection_path = reflection['path']
		# Write Reflection
		reflection.pop("path")
		reflected_dataset = self._dremio_env.get_catalog_entity_by_path(self._utils.normalize_path(reflection_path))
		if reflected_dataset is None:
			self._logger.error("_write_reflection: Could not resolve dataset for " + self._utils.get_entity_desc(reflection))
			return None
		# Match filters if requested
		if self._config.reflection_filter_mode == "apply_vds_pds_filter":
			if not self._filter.match_reflection_path(reflection_path, reflected_dataset):
				return False
		reflection['datasetId'] = reflected_dataset['id']
		# Check if the reflection already exists
		existing_reflection = self._find_existing_reflection(reflection, reflected_dataset)
		if existing_reflection is None:  # Need to create new entity
			if process_mode == 'update_only':
				self._logger.info("_write_reflection: Skipping reflection creation due to configuration reflection_process_mode. " + self._utils.get_entity_desc(reflection))
				return None
			if self._config.dry_run:
				self._logger.warn("_write_reflection: Dry Run, NOT Creating reflection: " + self._utils.get_entity_desc(reflection))
				return None
			new_reflection = self._dremio_env.create_reflection(reflection, self._config.dry_run)
			if new_reflection is None:
				self._logger.error("_write_reflection: could not create " + self._utils.get_entity_desc(reflection))
				return None
		else:  # Reflection already exists in the target environment
			if process_mode == 'create_only':
				self._logger.info("_write_reflection: Found existing refleciton and reflection_process_mode is set to create_only. Skipping " + self._utils.get_entity_desc(reflection))
				return None
			# make sure there are changes to update as it will invalidate existing reflection data
			if reflection['type'] == existing_reflection['type'] and \
				reflection['name'] == existing_reflection['name'] and \
				('partitionDistributionStrategy' in reflection and reflection['partitionDistributionStrategy'] == existing_reflection['partitionDistributionStrategy']) and \
				('measureFields' in reflection and reflection['measureFields'] == existing_reflection['measureFields']) and \
				('dimensionFields' in reflection and reflection['dimensionFields'] == existing_reflection['dimensionFields']) and \
				('displayFields' in reflection and reflection['displayFields'] == existing_reflection['displayFields']) and \
				('sortFields' in reflection and reflection['sortFields'] == existing_reflection['sortFields']) and \
				('partitionFields' in reflection and reflection['partitionFields'] == existing_reflection['partitionFields']) and \
				('distributionFields' in reflection and reflection['distributionFields'] == existing_reflection['distributionFields']):
				# Nothing to do
				self._logger.debug("_write_reflection: No pending changes. Skipping " + self._utils.get_entity_desc(reflection))
				return None
			if self._config.dry_run:
				self._logger.warn("_write_entity: Dry Run, NOT Updating " + self._utils.get_entity_desc(reflection))
				return False
			self._logger.debug("_write_reflection: Overwriting " + self._utils.get_entity_desc(reflection))
			reflection['tag'] = existing_reflection['tag']
			updated_reflection = self._dremio_env.update_reflection(existing_reflection['id'], reflection, self._config.dry_run)
			if updated_reflection is None:
				self._logger.error("_write_reflection: Error updating " + self._utils.get_entity_desc(reflection))
				return False
		return True


	def _find_existing_reflection(self, reflection, dataset):
		for existing_reflection in self._existing_reflections:
			# Match reflections by name
			if reflection['name'] == existing_reflection['name']:
				existing_dataset = self._dremio_env.get_catalog_entity_by_id(existing_reflection['datasetId'])
				# Match reflections by respective dataset's path
				if existing_dataset is not None and existing_dataset['path'] == dataset['path']:
					return existing_reflection
		return None


	def _find_existing_dataset_by_path(self, path):
		return self._dremio_env.get_catalog_entity_by_path(path)


	# Searches for Users from entity's ACL in the target environment and either:
	# - removes the user from ACL if not found and ignore_missing_acl_user_flag is set
	# - returns False if not found and ignore_missing_acl_user_flag is not set
	# - updates the ACL with the userid from the new environment if the User is found there
	def _process_acl(self, entity, ignore_missing_acl_user_flag, ignore_missing_acl_group_flag):
		self._logger.debug("_process_acl: processing entity: " + self._utils.get_entity_desc(entity))
		if 'accessControlList' not in entity:
			return True
		if self._config.target_ce:
			entity.pop('accessControlList')
			return True
		acl = entity['accessControlList']
		transformed_acl = {"users": [], "groups": []}
		if 'version' in acl:
			acl.pop('version')
		if acl == {} or ('users' not in acl and 'groups' not in acl):
			pass
		else:
			if 'users' in acl:
				# Note, taking a copy of the list for proper removal of items
				for user_def in acl['users'][:]:
					new_acl_principal = self._find_matching_principal_for_userid(user_def['id'], user_def['permissions'])
					if new_acl_principal == "REMOVE":
						self._logger.info("_process_acl: Source User " + user_def['id'] + " is removed from ACL definition. " + self._utils.get_entity_desc(entity))
					elif new_acl_principal is None:
						if ignore_missing_acl_user_flag:
							self._logger.warn("_process_acl: Source User " + user_def['id'] + " not found in the target Dremio Environment. User is removed from ACL definition as per ignore_missing_acl_user configuration. " + self._utils.get_entity_desc(entity))
						else:
							self._logger.error("_process_acl: Source User " + user_def['id'] + " not found in the target Dremio Environment. ACL Entry cannot be processed as per ignore_missing_acl_user configuration. " + self._utils.get_entity_desc(entity))
					elif "user" in new_acl_principal:
						transformed_acl['users'].append({"id":new_acl_principal["user"],"permissions":new_acl_principal['permissions'] if "permissions" in new_acl_principal else user_def['permissions']})
					elif "group" in new_acl_principal:
						transformed_acl['groups'].append({"id":new_acl_principal["group"],"permissions":new_acl_principal['permissions'] if "permissions" in new_acl_principal else user_def['permissions']})
			if 'groups' in acl:
				# Note, taking a copy of the list for proper removal of items
				for group_def in acl['groups'][:]:
					new_acl_principal = self._find_matching_principal_for_groupid(group_def['id'], group_def['permissions'])
					if new_acl_principal == "REMOVE":
						self._logger.info("_process_acl: Source Group " + group_def['id'] + " is removed from ACL definition. " + self._utils.get_entity_desc(entity))
					elif new_acl_principal is None:
						if ignore_missing_acl_group_flag:
							self._logger.warn("_process_acl: Source Group " + group_def['id'] + " not found in the target Dremio Environment. Group is removed from ACL definition as per ignore_missing_acl_group configuration. " + self._utils.get_entity_desc(entity))
						else:
							# Flag is not set - return error status
							self._logger.error("_process_acl: Source Group " + group_def['id'] + " not found in the target Dremio Environment. ACL Entry cannot be processed as per ignore_missing_acl_group configuration. " + self._utils.get_entity_desc(entity))
					elif "user" in new_acl_principal:
						transformed_acl['users'].append({"id":new_acl_principal["user"],"permissions":new_acl_principal['permissions'] if "permissions" in new_acl_principal else group_def['permissions']})
					elif "group" in new_acl_principal:
						transformed_acl['groups'].append({"id":new_acl_principal["group"],"permissions":new_acl_principal['permissions'] if "permissions" in new_acl_principal else group_def['permissions']})
			entity['accessControlList'] = transformed_acl
		return True

	def _transform_permissions(self, source_permissions, acl_mapping):
		# if permission mapping is not explicitly defined, use source permissions as is
		if 'permission-mapping' not in acl_mapping:
			return source_permissions
		permissions_mapping = acl_mapping['permission-mapping']
		# READ is required for WRITE, so READ is always present in the list of permissions
		permissions = ["READ"]
		for permission in source_permissions:
			for mapping in permissions_mapping:
				# add only once
				if permission in mapping and mapping[permission] not in permissions:
					permissions.append(mapping[permission])
		return permissions
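
	# Worked example (input shapes taken from the code above, values assumed):
	#   source_permissions = ["READ", "WRITE"]
	#   acl_mapping = {"permission-mapping": [{"WRITE": "ALTER"}]}
	# returns ["READ", "ALTER"]: READ is always included, and WRITE is mapped
	# through the matching permission-mapping entry.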

	def _find_matching_principal_for_userid(self, userid, permissions):
		self._logger.debug("_find_matching_principal_for_userid: processing user_id: " + str(userid))
		for user in self._d.referenced_users:
			if user['id'] == userid:
				transformed_principal = self._find_acl_transformation_by_username(user['name'], permissions)
				if transformed_principal == "REMOVE":
					self._logger.info("_find_matching_principal_for_userid: Source User " + user['name'] + " [" + user['id'] + "] is mapped as NONE.")
					return "REMOVE"
				# If no transformation is defined for this user
				elif transformed_principal is None:
					for target_user in self._target_dremio_users:
						if target_user['name'] == user['name']:
							return {"user":target_user['id']}
				elif "error" in transformed_principal:
					# Something went wrong
					self._logger.error("_find_matching_principal_for_userid: error " + transformed_principal['error'])
					return None
				else:
					return transformed_principal
		# If the username is already in the target list (i.e. the mapping already happened
		# but the write_entity failed because parent objects were not yet created) then take username straight from target
		for user in self._target_dremio_users:
			if user['id'] == userid:
				transformed_principal = self._find_acl_transformation_by_username(user['name'], permissions)
				if transformed_principal is None:
					return {"user": user['id']}
				elif "error" in transformed_principal:
					# Something went wrong
					self._logger.error("_find_matching_principal_for_userid: error " + transformed_principal['error'])
					return None
				else:
					return transformed_principal
		return None

	def _find_acl_transformation_by_username(self, username, permissions):
		for item in self._config.acl_transformation:
			if 'user' in item['source'] and item['source']['user'] == username:
				if "REMOVE" in item['target']:
					return "REMOVE"
				elif "user" in item['target']:
					for target_user in self._target_dremio_users:
						if target_user['name'] == item['target']['user']:
							new_permissions = self._transform_permissions(permissions, item)
							return {"user":target_user['id'],"permissions":new_permissions}
				elif "group" in item['target']:
					for target_group in self._target_dremio_groups:
						if target_group['name'] == item['target']['group']:
							new_permissions = self._transform_permissions(permissions, item)
							return {"group":target_group['id'],"permissions":new_permissions}
				# The transformation is defined for this user, however, the target principal is not in the target Dremio Environment
				return {"error": "user_transformation_found_but_target_principle_is_not_in_target_dremio_environment"}
		# If the username is already in the target list (i.e. the mapping already happened
		# but the write_entity failed because parent objects were not yet created) then take username straight from target
		for item in self._config.acl_transformation:
			if 'user' in item['target'] and item['target']['user'] == username:
				for target_user in self._target_dremio_users:
					if target_user['name'] == username:
						new_permissions = self._transform_permissions(permissions, item)
						return {"user": target_user['id'], "permissions": new_permissions}
			if 'group' in item['target'] and item['target']['group'] == username:
				for target_group in self._target_dremio_groups:
					if target_group['name'] == item['target']['group']:
						new_permissions = self._transform_permissions(permissions, item)
						return {"group": target_group['id'], "permissions": new_permissions}
		return None
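
	# Shape of an acl_transformation entry assumed by the lookups above
	# (hypothetical values): a source principal, a target principal, and an
	# optional permission mapping; a target containing "REMOVE" drops the
	# principal from the ACL entirely.
	#   {"source": {"user": "analyst"},
	#    "target": {"group": "analysts"},
	#    "permission-mapping": [{"WRITE": "READ"}]}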

	def _find_matching_principal_for_groupid(self, groupid, permissions):
		self._logger.debug("_find_matching_groupid: processing: " + str(groupid))
		for group in self._d.referenced_groups:
			if group['id'] == groupid:
				transformed_principal = self._find_acl_transformation_by_groupname(group['name'], permissions)
				if transformed_principal == "REMOVE":
					self._logger.info("_find_matching_principal_for_groupid: Source Group " + group['name'] + " [" + group['id'] + "] is mapped as NONE.")
					return "REMOVE"
				# If no transformation is defined for this group
				elif transformed_principal is None:
					for target_group in self._target_dremio_groups:
						if target_group['name'] == group['name']:
							return {"group":target_group['id']}
				elif "error" in transformed_principal:
					# Something went wrong
					self._logger.error("_find_matching_principal_for_groupid: error " + transformed_principal['error'])
					return None
				else:
					return transformed_principal
		# If the group name is already in the target list (i.e. the mapping already happened
		# but the write_entity failed because parent objects were not yet created) then take group name straight from target
		for group in self._target_dremio_groups:
			if group['id'] == groupid:
				transformed_principal = self._find_acl_transformation_by_groupname(group['name'], permissions)
				if transformed_principal is None:
					return {"user": group['id']}
				elif "error" in transformed_principal:
					# Something went wrong
					self._logger.error("_find_matching_principal_for_userid: error " + transformed_principal['error'])
					return None
				else:
					return transformed_principal
		return None


	def _find_acl_transformation_by_groupname(self, groupname, permissions):
		for item in self._config.acl_transformation:
			if 'group' in item['source'] and item['source']['group'] == groupname:
				if "REMOVE" in item['target']:
					return "REMOVE"
				elif "user" in item['target']:
					for target_user in self._target_dremio_users:
						if target_user['name'] == item['target']['user']:
							new_permissions = self._transform_permissions(permissions, item)
							return {"user":target_user['id'],"permissions":new_permissions}
				elif "group" in item['target']:
					for target_group in self._target_dremio_groups:
						if target_group['name'] == item['target']['group']:
							new_permissions = self._transform_permissions(permissions, item)
							return {"group":target_group['id'],"permissions":new_permissions}
				# The transformation is defined for this group, however, the target principal is not in the target Dremio Environment
				return {"error": "group_transformation_found_but_target_principle_is_not_in_target_dremio_environment"}
		# If the group name is already in the target list (i.e. the mapping already happened
		# but the write_entity failed because parent objects were not yet created) then take group name straight from target
		for item in self._config.acl_transformation:
			if 'user' in item['target'] and item['target']['user'] == groupname:
				for target_user in self._target_dremio_users:
					if target_user['name'] == groupname:
						new_permissions = self._transform_permissions(permissions, item)
						return {"user": target_user['id'], "permissions": new_permissions}
			if 'group' in item['target'] and item['target']['group'] == groupname:
				for target_group in self._target_dremio_groups:
					if target_group['name'] == item['target']['group']:
						new_permissions = self._transform_permissions(permissions, item)
						return {"group": target_group['id'], "permissions": new_permissions}
		return None

	def _read_entity_definition(self, entity):
		self._logger.debug("_read_entity_definition: processing entity: " + self._utils.get_entity_desc(entity))
		if 'name' in entity:
			return self._dremio_env.get_catalog_entity_by_path(entity['name'])
		elif 'path' in entity:
			return self._dremio_env.get_catalog_entity_by_path(self._utils.normalize_path(entity['path']))
		else:
			self._logger.error("_read_entity_definition: bad data: " + self._utils.get_entity_desc(entity))
			return None

	# Process vds_list and save ordered list of VDSs into _vds_hierarchy. Recursive method.
	def _order_vds(self, processing_level=0):
		# Verify for the Max Hierarchy Depth
		if processing_level >= self._config.vds_max_hierarchy_depth:
			self._logger.debug("_order_vds: Finished processing with VDSs left to process:" + str(self._d.vds_list))
			return
		any_vds_leveled = False
		# Iterate through the remainder VDS in the list
		# Go with decreasing index so we can remove VDS from the list
		for i in range(len(self._d.vds_list) - 1, -1, -1):
			vds = self._d.vds_list[i]
			self._logger.debug("_order_vds: processing vds " + self._utils.get_entity_desc(vds))
			vds_hierarchy_level = processing_level
			any_dependency_unresolved = False
			sql_dependency_paths = self._get_vds_dependency_paths(vds)
			# Iterate through SQL dependencies to determine level of hierarchy for each dependency and the VDS
			for path in sql_dependency_paths:
				self._logger.debug("_order_vds: processing sql dependency " + path)
				# Validate the dependency against VDS and PDS
				sql_context = self._utils.get_sql_context(vds)
				dependency_vds = self._find_vds_by_path(self._utils.get_absolute_path(path, sql_context))
				if dependency_vds is None:
					dependency_pds = self._find_pds_by_path(self._utils.get_absolute_path(path, sql_context))
					if dependency_pds is None:
						# Dependency could not be resolved.
						self._logger.warn("_order_vds: giving up on ordering VDS '" + self._utils.normalize_path(vds['path']) + "'. Could not resolve dependency '" + self._utils.get_absolute_path(path, sql_context) + "' Will try to process without ordering.")
						# Move VDS into unresolved list
						self._unresolved_vds.append(vds)
						self._d.vds_list.remove(vds)
						# Mark as do-not-process
						any_dependency_unresolved = True
						break
					else:
						# The dependency has been resolved as PDS, continue to the next dependency
						continue
				else:
					# Dependency was found as VDS
					dependency_hierarchy_level = self._find_vds_level_in_hierarchy(dependency_vds['id'])
					if dependency_hierarchy_level is None:
						# Dependency has not been processed yet, push this VDS to the next processing level
						vds_hierarchy_level = None
						break
					# Find the highest level of hierarchy among dependencies
					elif vds_hierarchy_level < dependency_hierarchy_level + 1:
						vds_hierarchy_level = dependency_hierarchy_level + 1
			if any_dependency_unresolved or vds_hierarchy_level is None:
				# Do not process this VDS at this recursion
				self._logger.debug("_order_vds: some dependencies cannot be validated for entity " + vds['id'] + " at processing level " + str(processing_level))
			else:
				# Add the current VDS to the vds_hierarchy_level
				self._vds_hierarchy.append([vds_hierarchy_level, vds])
				# Remove the current VDS from further processing
				self._d.vds_list.remove(vds)
				# Mark this hierarchy level as successful
				any_vds_leveled = True
				self._logger.debug("_order_vds: dependencies have been validated for entity " + vds['id'] + " for hierarchy level " + str(vds_hierarchy_level))
		# Are we done yet with recursion
		if not any_vds_leveled or len(self._d.vds_list) == 0:
			self._hierarchy_depth = processing_level + 1
			self._logger.debug("_order_vds: finished processing all VDS with hierarchy depth of :" + str(self._hierarchy_depth + 1))
			return
		# Process the next Hierarchy Level recursively
		self._order_vds(processing_level + 1)
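
	# Intended leveling (a sketch): VDSs whose SQL depends only on PDSs land at
	# level 0; a VDS that reads from a level-0 VDS lands at level 1, and so on.
	# Anything whose dependencies cannot be resolved is parked in
	# _unresolved_vds and retried later by _write_remainder_vds.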

	def _get_vds_dependency_paths(self, vds):
		if self._is_source_ce() or not self._d.vds_parents:
			# CE does not support graph
			return parse_sql.tables_in_query(vds['sql'])
		else:
			for vds_entry in self._d.vds_parents:
				if vds_entry['path'] == vds['path']:
					return vds_entry['parents']

	def _is_source_ce(self):
		for item in self._d.dremio_get_config:
			if 'source' in item:
				for param in item['source']:
					if 'is_community_edition' in param:
						# Avoid eval() on configuration input; parse the textual boolean instead
						return str(param['is_community_edition']).lower() == 'true'
		return False

	def _find_vds_by_path(self, path):
		# First, try finding in the VDS list from the source file
		for vds in self._d.vds_list:
			if path == self._utils.normalize_path(vds['path']):
				return vds
		# For dry run, check processed vds
		if self._config.dry_run:
			for vds in self._dry_run_processed_vds_list:
				if path == self._utils.normalize_path(vds['path']):
					return vds
		# Finally, try finding in the target environment
		entity = self._dremio_env.get_catalog_entity_by_path(path)
		# Make sure we get VDS and not folder/file
		if entity is not None and self._utils.is_vds(entity):
			return entity
		return None

	def _find_pds_by_path(self, path):
		# First, try finding in the PDS list from the source file
		for pds in self._d.pds_list:
			if path == self._utils.normalize_path(pds['path']):
				return pds
		# For dry run, check processed pds
		if self._config.dry_run:
			for pds in self._dry_run_processed_pds_list:
				if path == self._utils.normalize_path(pds['path']):
					return pds
		# Finally, try finding in the target environment
		entity = self._dremio_env.get_catalog_entity_by_path(path)
		# Make sure we get promoted PDS and not folder/file
		if entity is not None and self._utils.is_pds(entity):
			return entity
		return None

	def _find_vds_level_in_hierarchy(self, vds_id):
		for item in self._vds_hierarchy:
			if item[1]['id'] == vds_id:
				return item[0]
		return None

	def get_errors_count(self):
		return self._logger.errors_encountered


	def _write_wiki(self, wiki, process_mode):
		self._logger.debug("_write_wiki: processing wiki: " + str(wiki))
		new_wiki_text = wiki['text']
		wiki_path = wiki['path']
		# Check if the wiki already exists
		existing_wiki_entity = self._find_existing_dataset_by_path(self._utils.normalize_path(wiki_path))
		if existing_wiki_entity is None:
			self._logger.error("_write_wiki: Unable to resolve wiki's dataset for " + str(wiki))
			return None
		existing_wiki = self._dremio_env.get_catalog_wiki(existing_wiki_entity['id'])
		if existing_wiki is None:  # Need to create new entity
			if process_mode == 'update_only':
				self._logger.info("_write_wiki: Skipping wiki creation due to configuration wiki_process_mode. " + str(wiki))
				return None
			if self._config.dry_run:
				self._logger.warn("_write_wiki: Dry Run, NOT Creating wiki: " + str(wiki))
				return None
			new_wiki = {"text":new_wiki_text}
			new_wiki = self._dremio_env.update_wiki(existing_wiki_entity['id'], new_wiki, self._config.dry_run)
			if new_wiki is None:
				self._logger.error("_write_wiki: could not create " + str(wiki))
				return None
		else:  # Wiki already exists in the target environment
			if process_mode == 'create_only':
				self._logger.info("_write_wiki: Found existing wiki and wiki_process_mode is set to create_only. Skipping " + str(wiki))
				return None
			# make sure there are changes to update as it will invalidate existing wiki data
			if new_wiki_text == existing_wiki['text']:
				# Nothing to do
				self._logger.debug("_write_wiki: No pending changes. Skipping " + str(wiki))
				return None
			if self._config.dry_run:
				self._logger.warn("_write_wiki: Dry Run, NOT Updating " + str(wiki))
				return False
			self._logger.debug("_write_wiki: Overwriting " + str(wiki))
			existing_wiki['text'] = new_wiki_text
			updated_wiki = self._dremio_env.update_wiki(existing_wiki_entity['id'], existing_wiki, self._config.dry_run)
			if updated_wiki is None:
				self._logger.error("_write_wiki: Error updating " + str(wiki))
				return False
		return True


	def _write_tags(self, tags, process_mode):
		self._logger.debug("_write_tag: processing tags: " + str(tags))
		new_tags = tags['tags']
		tags_path = tags['path']
		# Check if the tags already exist
		existing_tags_entity = self._find_existing_dataset_by_path(self._utils.normalize_path(tags_path))
		if existing_tags_entity is None:
			self._logger.error("_write_tags: Unable to resolve tag's dataset for " + str(tags))
			return None
		existing_tags = self._dremio_env.get_catalog_tags(existing_tags_entity['id'])
		if existing_tags is None:  # Need to create new entity
			if process_mode == 'update_only':
				self._logger.info("_write_tags: Skipping tags creation due to configuration tag_process_mode. " + str(tags))
				return None
			if self._config.dry_run:
				self._logger.warn("_write_tags: Dry Run, NOT Creating tags: " + str(tags))
				return None
			new_tags = {"tags":new_tags}
			new_tags = self._dremio_env.update_tag(existing_tags_entity['id'], new_tags, self._config.dry_run)
			if new_tags is None:
				self._logger.error("_write_tags: could not create " + str(tags))
				return None
		else:  # Tags already exists in the target environment
			if process_mode == 'create_only':
				self._logger.info("_write_tags: Found existing tags and tag_process_mode is set to create_only. Skipping " + str(tags))
				return None
			# make sure there are changes to update as it will invalidate existing tags data
			if new_tags == existing_tags['tags']:
				# Nothing to do
				self._logger.debug("_write_tags: No pending changes. Skipping " + str(tags))
				return None
			if self._config.dry_run:
				self._logger.warn("tags: Dry Run, NOT Updating " + str(tags))
				return False
			self._logger.debug("_write_tags: Overwriting " + str(tags))
			existing_tags['tags'] = new_tags
			updated_tags = self._dremio_env.update_tag(existing_tags_entity['id'], existing_tags, self._config.dry_run)
			if updated_tags is None:
				self._logger.error("_write_tags: Error updating " + str(tags))
				return False
		return True
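
A minimal driver sketch for this class, assuming target_env is the Dremio environment wrapper and source_data is the in-memory model produced by a prior "get" run (both names hypothetical):

	writer = DremioWriter(target_env, source_data, config)
	writer.write_dremio_environment()
	print("errors encountered:", writer.get_errors_count())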
Example #7
class DremioClonerConfig():

    # Dremio Utils
    _utils = None
    _logger = None

    CMD_GET = 'get'
    CMD_PUT = 'put'
    CMD_CASCADE_ACL = 'cascade-acl'
    CMD_DESCRIBE_JOB = 'describe-job'
    CMD_REPORT_ACL = 'report-acl'
    CMD_REPORT_REFLECTIONS = 'report-reflections'
    CMD_DELETE = 'delete-beta'

    # Config json code
    cloner_conf_json = None
    # Command to execute: get, put, cascade-acl, describe-job, report-acl, report-reflections, delete-beta
    command = None
    dry_run = True
    # Source Dremio Environment definition
    source_endpoint = None
    source_verify_ssl = True
    source_username = None
    source_password = None
    source_filename = None
    source_directory = None
    source_ce = False
    source_graph_support = False
    target_ce = False
    job_sql = None
    # Target Dremio Environment definition
    target_endpoint = None
    target_verify_ssl = True
    target_username = None
    target_password = None
    target_filename = None
    target_directory = None
    target_file_or_dir_overwrite = False
    target_type = None
    container_filename = "___container.json"
    dremio_conf_filename = "___dremio_cloner_conf.json"
    # Options
    max_errors = 9999
    http_timeout = 10  # seconds
    # Logging options
    logging_level = logging.INFO
    logging_format = "%(levelname)s:%(asctime)s:%(message)s"
    logging_filename = None
    logging_verbose = False
    # Processing
    user_process_mode = None  # Flag to process User: process, skip
    group_process_mode = None  # Flag to process Group: process, skip
    space_filter = None  # Filter for Space entity type
    space_filter_names = []  # List of Spaces to process if not empty
    space_exclude_filter = None  # Exclusion Filter for Space entity type
    space_cascade_acl_origin_override_object = None  # An ACL from this object will be utilized instead of the Space ACL as an ACL to set inside all Folders and VDSs in the Space
    space_folder_filter = None  # Filter for Space Folder entity type
    space_folder_exclude_filter = None  # Exclusion Filter for Space Folder entity type
    space_folder_cascade_acl_origin_filter = None  # Filter for folders that will be used as ACL origins if specified
    space_process_mode = None  # Flag to process Space: process, skip, create_only, update_only, create_overwrite
    space_ignore_missing_acl_user = False  # Flag to write a Space if an ACL user is missing in the target Dremio environment
    space_ignore_missing_acl_group = False  # Flag to write a Space if an ACL group is missing in the target Dremio environment
    source_filter = None  # Filter for Source entity type
    source_filter_names = []  # List of Sources to process if not empty
    source_filter_types = []  # List of Source Types to process if not empty
    source_exclude_filter = None  # Exclusion Filter for Source entity type
    source_cascade_acl_origin_override_object = None  # An ACL from this object will be utilized instead of the Source ACL as an ACL to set inside all PDS in the Source
    source_folder_filter = None  # Filter for Source Folder entity type
    source_folder_exclude_filter = None  # Exclusion Filter for Source Folder entity type
    source_process_mode = None  # Flag to process Sources: process, skip, create_only, update_only, create_overwrite
    source_ignore_missing_acl_user = False  # Flag to write a Source if an ACL user is missing in the target Dremio environment
    source_ignore_missing_acl_group = False  # Flag to write a Source if an ACL group is missing in the target Dremio environment
    source_retry_timedout = False  # Flag to retry Sources that timed out
    folder_process_mode = None  # Flag to process Folder: process, skip, create_only, update_only, create_overwrite
    folder_ignore_missing_acl_user = False  # Flag to write a Folder if an ACL user is missing in the target Dremio environment
    folder_ignore_missing_acl_group = False  # Flag to write a Folder if an ACL group is missing in the target Dremio environment
    pds_list_useapi = False  # Using API for listing PDS may cause issues when the source is not available at the runtime
    pds_filter = None  # Filter for PDS
    pds_exclude_filter = None  # Exclusion Filter for PDS
    pds_process_mode = None  # Flag to process Source PDS: process, skip, promote
    pds_ignore_missing_acl_user = False  # Flag to write a Source PDS if an ACL user is missing in the target Dremio environment
    pds_ignore_missing_acl_group = False  # Flag to write a Source PDS if an ACL group is missing in the target Dremio environment
    vds_filter = None  # Filter for VDS
    vds_filter_tag = None  # Filter for VDS
    vds_exclude_filter = None  # Exclusion Filter for VDS
    vds_process_mode = None  # Flag to process VDS: process, skip, create_only, update_only, create_overwrite
    vds_dependencies_process_mode = 'ignore'  # Flag to process VDS dependencies (VDS and PDS): ignore, get
    vds_ignore_missing_acl_user = False  # Flag to write a VDS if an ACL user is missing in the target Dremio environment
    vds_ignore_missing_acl_group = False  # Flag to write a VDS if an ACL group is missing in the target Dremio environment
    vds_max_hierarchy_depth = 100  # The max hierarchy depth to process
    reflection_process_mode = None  # Flag to process reflection: process, skip, create_only, update_only, create_overwrite
    reflection_filter_mode = None  # Flag to filter reflection: apply_vds_pds_filter
    reflection_refresh_mode = 'skip'  # Flag to refresh reflections: refresh, skip
    wlm_queue_process_mode = 'process'  # Flag to process WLM Queues: process, skip
    wlm_rule_process_mode = 'process'  # Flag to process WLM Rules: process, skip
    wiki_process_mode = 'process'  # Flag to process Wikis: process, skip, create_only, update_only, create_overwrite
    tag_process_mode = 'process'  # Flag to process Tags: process, skip
    home_process_mode = 'process'  # Flag to process Homes: process, skip
    vote_process_mode = 'process'  # Flag to process Votes: process, skip
    acl_transformation = {}  # Contains all ACL transformation definitions
    # Delete VDS List
    delete_vds = []  # List of VDS to delete from the target environment
    delete_folders = []  # List of Folders to delete from the target environment

    # Report options
    report_csv_delimiter = "\t"
    report_csv_newline = "\n"
    # Misc options
    # Compiled filters
    _space_filter_re = None
    _space_exclude_filter_re = None
    _space_folder_filter_re = None
    _space_folder_exclude_filter_re = None
    _space_folder_cascade_acl_origin_filter_re = None
    _source_filter_re = None
    _source_exluce_filter_re = None
    _source_folder_filter_re = None
    _source_folder_exclude_filter_re = None
    _pds_filter_re = None
    _pds_exclude_filter_re = None
    _vds_filter_re = None
    _vds_exclude_filter_re = None

    def __init__(self, config_file_name):
        # Read configuration file
        f = open(config_file_name, "r", encoding="utf-8")
        self.cloner_conf_json = json.load(f)['dremio_cloner']
        f.close()
        for element in self.cloner_conf_json:
            if 'command' in element:
                self._process_command(element)
            elif 'source' in element:
                self._process_source(element)
            elif 'target' in element:
                self._process_target(element)
            elif 'options' in element:
                self._process_options(element)
        logging.basicConfig(format=self.logging_format,
                            level=self.logging_level,
                            filename=self.logging_filename)
        self._logger = DremioClonerLogger(self.max_errors,
                                          self.logging_verbose)
        self._validate_configuration()
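
    # The configuration file this constructor parses is a JSON document with a
    # top-level "dremio_cloner" array; each element carries one of the keys the
    # loop above dispatches on. A hypothetical minimal example (all values assumed):
    #
    # {"dremio_cloner": [
    #     {"command": "put"},
    #     {"source": [{"filename": "source.json"}]},
    #     {"target": [{"endpoint": "http://localhost:9047"},
    #                 {"username": "admin"}, {"password": "secret"}]},
    #     {"options": [{"dry_run": "True"}, {"logging.level": "logging.INFO"}]}
    # ]}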

    def _process_command(self, json_conf):
        self.command = json_conf['command']

    def _process_target(self, json_conf):
        for item in json_conf['target']:
            if 'endpoint' in item:
                self.target_endpoint = item['endpoint']
            elif 'username' in item:
                self.target_username = item['username']
            elif 'password' in item:
                self.target_password = item['password']
            elif 'filename' in item:
                self.target_filename = item['filename']
            elif 'directory' in item:
                self.target_directory = item['directory']
            elif 'overwrite' in item:
                self.target_file_or_dir_overwrite = self._bool(
                    item, 'overwrite')
            elif 'verify_ssl' in item:
                self.target_verify_ssl = self._bool(item, 'verify_ssl')
            elif 'is_community_edition' in item:
                self.target_ce = self._bool(item, 'is_community_edition')
            elif 'target.type' in item:
                self.target_type = self._str(item, 'target.type')

    def _process_source(self, json_conf):
        for item in json_conf['source']:
            if 'endpoint' in item:
                self.source_endpoint = item['endpoint']
            elif 'username' in item:
                self.source_username = item['username']
            elif 'password' in item:
                self.source_password = item['password']
            elif 'filename' in item:
                self.source_filename = item['filename']
            elif 'directory' in item:
                self.source_directory = item['directory']
            elif 'verify_ssl' in item:
                self.source_verify_ssl = self._bool(item, 'verify_ssl')
            elif 'is_community_edition' in item:
                self.source_ce = self._bool(item, 'is_community_edition')
            elif 'graph_api_support' in item:
                self.source_graph_support = self._bool(item,
                                                       'graph_api_support')
            elif 'job-sql' in item:
                self.job_sql = self._str(item, 'job-sql')

    def _process_options(self, json_conf):
        for item in json_conf['options']:
            if 'dry_run' in item:
                self.dry_run = self._bool(item, 'dry_run')
            elif 'max_errors' in item:
                self.max_errors = self._eval(item, 'max_errors')
            elif 'logging.level' in item:
                self.logging_level = self._eval(item, 'logging.level')
            elif 'logging.format' in item:
                self.logging_format = self._str(item, 'logging.format')
            elif 'logging.filename' in item:
                self.logging_filename = self._str(item, 'logging.filename')
            elif 'logging.verbose' in item:
                self.logging_verbose = self._bool(item, 'logging.verbose')
            elif 'http_timeout' in item:
                self.http_timeout = self._int(item, 'http_timeout')
            elif 'user.process_mode' in item:
                self.user_process_mode = self._str(item, 'user.process_mode')
            elif 'group.process_mode' in item:
                self.group_process_mode = self._str(item, 'group.process_mode')
            elif 'space.process_mode' in item:
                self.space_process_mode = self._str(item, 'space.process_mode')
            elif 'space.filter' in item:
                self.space_filter = self._str(item, 'space.filter')
                self._space_filter_re = self._compile_pattern(
                    self.space_filter)
            elif 'space.filter.names' in item:
                self.space_filter_names = self._array(item,
                                                      'space.filter.names')
            elif 'space.exclude.filter' in item:
                self.space_exclude_filter = self._str(item,
                                                      'space.exclude.filter')
                self._space_exclude_filter_re = self._compile_pattern(
                    self.space_exclude_filter)
            elif 'space.cascade-acl-origin.override-object' in item:
                self.space_cascade_acl_origin_override_object = self._str(
                    item, 'space.cascade-acl-origin.override-object')
            elif 'space.folder.filter' in item:
                self.space_folder_filter = self._str(item,
                                                     'space.folder.filter')
                self._space_folder_filter_re = self._compile_pattern(
                    self.space_folder_filter)
            elif 'space.folder.exclude.filter' in item:
                self.space_folder_exclude_filter = self._str(
                    item, 'space.folder.exclude.filter')
                self._space_folder_exclude_filter_re = self._compile_pattern(
                    self.space_folder_exclude_filter)
            elif 'space.folder.cascade-acl-origin.filter' in item:
                self.space_folder_cascade_acl_origin_filter = self._str(
                    item, 'space.folder.cascade-acl-origin.filter')
                self._space_folder_cascade_acl_origin_filter_re = self._compile_pattern(
                    self.space_folder_cascade_acl_origin_filter)
            elif 'space.ignore_missing_acl_user' in item:
                self.space_ignore_missing_acl_user = self._bool(
                    item, 'space.ignore_missing_acl_user')
            elif 'space.ignore_missing_acl_group' in item:
                self.space_ignore_missing_acl_group = self._bool(
                    item, 'space.ignore_missing_acl_group')
            elif 'source.process_mode' in item:
                self.source_process_mode = self._str(item,
                                                     'source.process_mode')
            elif 'source.filter.names' in item:
                self.source_filter_names = self._array(item,
                                                       'source.filter.names')
            elif 'source.filter.types' in item:
                self.source_filter_types = self._array(item,
                                                       'source.filter.types')
            elif 'source.filter' in item:
                self.source_filter = self._str(item, 'source.filter')
                self._source_filter_re = self._compile_pattern(
                    self.source_filter)
            elif 'source.exclude.filter' in item:
                self.source_exclude_filter = self._str(
                    item, 'source.exclude.filter')
                self._source_exclude_filter_re = self._compile_pattern(
                    self.source_exclude_filter)
            elif 'source.folder.filter' in item:
                self.source_folder_filter = self._str(item,
                                                      'source.folder.filter')
                self._source_folder_filter_re = self._compile_pattern(
                    self.source_folder_filter)
            elif 'source.cascade-acl-origin.override-object' in item:
                self.source_cascade_acl_origin_override_object = self._str(
                    item, 'source.cascade-acl-origin.override-object')
            elif 'source.folder.exclude.filter' in item:
                self.source_folder_exclude_filter = self._str(
                    item, 'source.folder.exclude.filter')
                self._source_folder_exclude_filter_re = self._compile_pattern(
                    self.source_folder_exclude_filter)
            elif 'source.ignore_missing_acl_user' in item:
                self.source_ignore_missing_acl_user = self._bool(
                    item, 'source.ignore_missing_acl_user')
            elif 'source.ignore_missing_acl_group' in item:
                self.source_ignore_missing_acl_group = self._bool(
                    item, 'source.ignore_missing_acl_group')
            elif 'source.retry_timedout' in item:
                self.source_retry_timedout = self._bool(
                    item, 'source.retry_timedout')
            elif 'folder.process_mode' in item:
                self.folder_process_mode = self._str(item,
                                                     'folder.process_mode')
            elif 'folder.ignore_missing_acl_user' in item:
                self.folder_ignore_missing_acl_user = self._bool(
                    item, 'folder.ignore_missing_acl_user')
            elif 'folder.ignore_missing_acl_group' in item:
                self.folder_ignore_missing_acl_group = self._bool(
                    item, 'folder.ignore_missing_acl_group')
            elif 'pds.process_mode' in item:
                self.pds_process_mode = self._str(item, 'pds.process_mode')
            elif 'pds.list.useapi' in item:
                self.pds_list_useapi = self._bool(item, 'pds.list.useapi')
            elif 'pds.filter' in item:
                self.pds_filter = self._str(item, 'pds.filter')
                self._pds_filter_re = self._compile_pattern(self.pds_filter)
            elif 'pds.exclude.filter' in item:
                self.pds_exclude_filter = self._str(item, 'pds.exclude.filter')
                self._pds_exclude_filter_re = self._compile_pattern(
                    self.pds_exclude_filter)
            elif 'pds.ignore_missing_acl_user' in item:
                self.pds_ignore_missing_acl_user = self._bool(
                    item, 'pds.ignore_missing_acl_user')
            elif 'pds.ignore_missing_acl_group' in item:
                self.pds_ignore_missing_acl_group = self._bool(
                    item, 'pds.ignore_missing_acl_group')
            elif 'vds.process_mode' in item:
                self.vds_process_mode = self._str(item, 'vds.process_mode')
            elif 'vds.dependencies.process_mode' in item:
                self.vds_dependencies_process_mode = self._str(
                    item, 'vds.dependencies.process_mode')
            elif 'vds.filter' in item:
                self.vds_filter = self._str(item, 'vds.filter')
                self._vds_filter_re = self._compile_pattern(self.vds_filter)
            elif 'vds.filter.tag' in item:
                self.vds_filter_tag = self._str(item, 'vds.filter.tag')
            elif 'vds.exclude.filter' in item:
                self.vds_exclude_filter = self._str(item, 'vds.exclude.filter')
                self._vds_exclude_filter_re = self._compile_pattern(
                    self.vds_exclude_filter)
            elif 'vds.ignore_missing_acl_user' in item:
                self.vds_ignore_missing_acl_user = self._bool(
                    item, 'vds.ignore_missing_acl_user')
            elif 'vds.ignore_missing_acl_group' in item:
                self.vds_ignore_missing_acl_group = self._bool(
                    item, 'vds.ignore_missing_acl_group')
            elif 'vds.max_hierarchy_depth' in item:
                self.vds_max_hierarchy_depth = self._int(
                    item, 'vds.max_hierarchy_depth')
            # Reflection options
            elif 'reflection.process_mode' in item:
                self.reflection_process_mode = self._str(
                    item, 'reflection.process_mode')
            elif 'reflection.filter_mode' in item:
                self.reflection_filter_mode = self._str(
                    item, 'reflection.filter_mode')
            elif 'pds.reflection_refresh_mode' in item:
                self.reflection_refresh_mode = self._str(
                    item, 'pds.reflection_refresh_mode')
            # Report Options
            elif 'report.csv.delimiter' in item:
                self.report_csv_delimiter = self._str(item,
                                                      'report.csv.delimiter')
            elif 'report.csv.newline' in item:
                self.report_csv_newline = self._str(item, 'report.csv.newline')
            # Misc options
            elif 'wlm.queue.process_mode' in item:
                self.wlm_queue_process_mode = self._str(
                    item, 'wlm.queue.process_mode')
            elif 'wlm.rule.process_mode' in item:
                self.wlm_rule_process_mode = self._str(
                    item, 'wlm.rule.process_mode')
            elif 'wiki.process_mode' in item:
                self.wiki_process_mode = self._str(item, 'wiki.process_mode')
            elif 'tag.process_mode' in item:
                self.tag_process_mode = self._str(item, 'tag.process_mode')
            elif 'home.process_mode' in item:
                self.home_process_mode = self._str(item, 'home.process_mode')
            elif 'vote.process_mode' in item:
                self.vote_process_mode = self._str(item, 'vote.process_mode')
            elif 'transformation' in item:
                acl_transformation_filename = self._str(
                    item['transformation']['acl'], 'file')
                with open(acl_transformation_filename, "r", encoding="utf-8") as f:
                    self.acl_transformation = json.load(f)['acl-transformation']
            elif 'vds.delete_list' in item:
                self.delete_vds = self._str_array(item, 'vds.delete_list')
            elif 'folder.delete_list' in item:
                self.delete_folders = self._str_array(item,
                                                      'folder.delete_list')

    def _validate_configuration(self):
        if (self.command is None):
            self._logger.fatal("missing 'command' entry.")
        elif self.command == self.CMD_GET and (
                self.source_endpoint is None or self.source_username is None
                or self.source_password is None or
            (self.target_filename is None and self.target_directory is None)):
            self._logger.fatal("Invalid configuration for command 'get'.")
        elif self.command == self.CMD_PUT and (
            (self.source_filename is None and self.source_directory is None)
                or self.target_endpoint is None or self.target_username is None
                or self.target_password is None):
            self._logger.fatal("Invalid configuration for command 'get'.")
        elif self.command == self.CMD_REPORT_ACL and (
                self.source_endpoint is None or self.source_username is None or
                self.source_password is None or self.target_filename is None):
            self._logger.fatal(
                "Invalid configuration for command 'report-acl'.")

        if (self.command == self.CMD_PUT
                and (self.space_process_mode is None or
                     (self.space_process_mode != 'skip'
                      and self.space_process_mode != 'update_only'
                      and self.space_process_mode != 'create_only'
                      and self.space_process_mode != 'create_overwrite'))):
            self._logger.fatal("Invalid configuration for space.process_mode.")
        if (self.command == self.CMD_PUT
                and (self.source_process_mode is None or
                     (self.source_process_mode != 'skip'
                      and self.source_process_mode != 'update_only'
                      and self.source_process_mode != 'create_only'
                      and self.source_process_mode != 'create_overwrite'))):
            self._logger.fatal(
                "Invalid configuration for source.process_mode.")
        if (self.command == self.CMD_PUT
                and (self.pds_process_mode is None or
                     (self.pds_process_mode != 'skip'
                      and self.pds_process_mode != 'promote'))):
            self._logger.fatal("Invalid configuration for pds.process_mode.")
        if (self.command == self.CMD_PUT
                and (self.vds_process_mode is None or
                     (self.vds_process_mode != 'skip'
                      and self.vds_process_mode != 'update_only'
                      and self.vds_process_mode != 'create_only'
                      and self.vds_process_mode != 'create_overwrite'))):
            self._logger.fatal("Invalid configuration for vds.process_mode.")
        # Make sure we do not overwrite JSON environment file
        if (self.command == self.CMD_GET and self.target_filename is not None
                and not self.target_file_or_dir_overwrite
                and os.path.isfile(self.target_filename)):
            self._logger.fatal("File " + str(self.target_filename) +
                               " already exists. Cannot overwrite.")
        if (self.command == self.CMD_GET and self.target_directory is not None
                and not self.target_file_or_dir_overwrite
                and os.path.isdir(self.target_directory)):
            self._logger.fatal("Directory " + str(self.target_directory) +
                               " already exists. Cannot overwrite.")
        if (self.command == self.CMD_REPORT_ACL
                and os.path.isfile(self.target_filename)):
            self._logger.fatal("File " + str(self.target_filename) +
                               " already exists. Cannot overwrite.")

    def _bool(self, conf, param_name):
        if (param_name in conf):
            # Accept JSON booleans as well as "True"/"False" strings
            value = conf[param_name]
            if isinstance(value, bool):
                return value
            if str(value).title() in ('True', 'False'):
                return str(value).title() == 'True'
            self._logger.fatal("Invalid boolean value for parameter " +
                               param_name)
        else:
            return None

    def _array(self, conf, param_name):
        if (param_name in conf):
            return conf[param_name]
        else:
            return None

    def _int(self, conf, param_name):
        if (param_name in conf):
            try:
                return int(conf[param_name])
            except:
                self._logger.fatal("Invalid integer value for parameter " +
                                   param_name)
        else:
            return None

    def _str(self, conf, param_name):
        if (param_name in conf and not conf[param_name] == ""):
            return conf[param_name]
        return None

    def _str_array(self, conf, param_name):
        if (param_name in conf and not conf[param_name] == ""):
            return conf[param_name]
        return None

    def _eval(self, conf, param_name):
        if (param_name in conf):
            try:
                return eval(conf[param_name])
            except:
                self._logger.fatal("Invalid value for parameter " + param_name)
        else:
            return None

    def _compile_pattern(self, pattern):
        if pattern is None:
            return None
        return re.compile(fnmatch.translate(pattern))
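
For reference, a minimal sketch of a configuration file this class accepts: the top-level dremio_cloner value is a list whose elements each carry a command, source, target, or options entry, exactly as dispatched in __init__. Key names follow the _process_* handlers above; endpoints, credentials, and filenames are placeholders. Filter values such as space.filter are glob patterns, which _compile_pattern converts via fnmatch.translate before re.compile.

# A minimal sketch; all concrete values are placeholders.
sample_config = {
    "dremio_cloner": [
        {"command": "get"},
        {"source": [
            {"endpoint": "http://localhost:9047"},
            {"username": "dremio_user"},
            {"password": "dremio_password"},
            {"verify_ssl": "True"}
        ]},
        {"target": [
            {"filename": "dremio_environment.json"},
            {"overwrite": "False"}
        ]},
        {"options": [
            {"max_errors": "9999"},
            {"logging.level": "logging.INFO"},
            {"space.filter": "*"},
            {"vds.filter": "*"}
        ]}
    ]
}
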
Example #8
class DremioReader:

    # Dremio Cloner Configuration, Utils, ...
    _config = None
    _utils = None
    _logger = None
    _filter = None

    # Dremio object pointing to the source Dremio environment
    _dremio_env = None

    # DremioData object containing data from Dremio source environment
    _d = None

    # Current top-level hierarchy context: Home, Space, Source
    _top_level_hierarchy_context = None

    def __init__(self, source_dremio, config):
        self._config = config
        self._dremio_env = source_dremio
        # DremioData accumulator is created per instance
        self._d = DremioData()
        self._logger = DremioClonerLogger(self._config.max_errors,
                                          self._config.logging_verbose)
        self._utils = DremioClonerUtils(config)
        self._filter = DremioClonerFilter(config)

    # Read all data from the source Dremio environment
    # Return DremioData
    def read_dremio_environment(self):
        self._read_catalog()
        if not self._config.pds_list_useapi and self._filter.is_pds_in_scope():
            self._read_all_pds()
        self._read_reflections()
        self._read_rules()
        self._read_queues()
        self._read_votes()
        # Make sure that all VDS dependencies are included as per configuration
        self._process_vds_dependencies()
        return self._d

    def _read_all_pds(self):
        if self._config.pds_list_useapi or not self._filter.is_pds_in_scope():
            self._logger.info(
                "_read_all_pds: skipping PDS reading as per pds.list.useapi/pds.filter configuration."
            )
        else:
            pds_list = self._dremio_env.list_pds(
                self._d.sources,
                self._config.source_folder_filter,
                self._config.source_folder_exclude_filter,
                self._config.pds_filter,
                self._config.pds_exclude_filter,
                pds_error_list=self._d.pds_error_list)
            for pds in pds_list:
                if self._filter.match_pds_filter(pds):
                    self._d.pds_list.append(pds)

    # Read Dremio catalog from source environment recursively going to containers and their children objects
    def _read_catalog(self):
        containers = self._dremio_env.list_catalog()['data']
        for container in containers:
            self._logger.debug("_read_catalog: processing container " +
                               self._utils.get_entity_desc(container))
            self._process_container(container)

    # Identify a container and delegate processing
    def _process_container(self, container):
        self._logger.debug("_process_container: " +
                           self._utils.get_entity_desc(container))
        if container['containerType'] == "HOME":
            self._read_home(container)
        elif container['containerType'] == "SPACE":
            self._read_space(container)
        elif container['containerType'] == "SOURCE":
            self._read_source(container)
        else:
            self._logger.fatal("_process_container: unexpected entity type " +
                               self._utils.get_entity_desc(container))

    def _read_home(self, container):
        self._logger.debug("_read_home: processing container: " +
                           self._utils.get_entity_desc(container))
        if self._config.home_process_mode == 'process':
            self._top_level_hierarchy_context = "HOME"
            self._d.containers.append(container)
            entity = self._get_entity_definition_by_id(container)
            if entity is not None:
                self._logger.info("_read_home: " +
                                  self._utils.get_entity_desc(entity))
                self._d.homes.append(entity)
                self._read_acl(entity)
                self._read_wiki(entity)
                self._read_space_children(entity)
            else:
                self._logger.error(
                    "_read_home: error reading entity for container: " +
                    self._utils.get_entity_desc(container))
        else:
            self._logger.debug("_read_home: skipping due to job configuration")

    def _read_space(self, container):
        self._logger.debug("_read_space: processing container: " +
                           self._utils.get_entity_desc(container))
        self._top_level_hierarchy_context = "SPACE"
        if self._filter.match_space_filter(container):
            self._d.containers.append(container)
            entity = self._get_entity_definition_by_id(container)
            if entity is not None:
                self._logger.debug("_read_space: " +
                                   self._utils.get_entity_desc(container))
                self._d.spaces.append(entity)
                self._read_acl(entity)
                self._read_wiki(entity)
                self._read_space_children(entity)
            else:
                self._logger.error(
                    "_read_space: error reading entity for container: " +
                    self._utils.get_entity_desc(container))

    def _read_source(self, container):
        self._logger.debug("_read_source: processing container: " +
                           self._utils.get_entity_desc(container))
        if self._config.source_process_mode == 'process' or (
                self._config.pds_process_mode == 'process'
                and self._config.pds_list_useapi):
            self._top_level_hierarchy_context = "SOURCE"
            if self._filter.match_source_filter(container):
                self._d.containers.append(container)
                entity = self._get_entity_definition_by_id(container)
                if entity is not None:
                    # Re-validate the filter with entity since there is more details in entity
                    if self._filter.match_source_filter(entity):
                        self._logger.debug("_read_source: " +
                                           self._utils.get_entity_desc(entity))
                        self._d.sources.append(entity)
                        self._read_acl(entity)
                        self._read_wiki(entity)
                        # Depending on the useapi flag, PDSs can be collected via INFORMATION_SCHEMA. See also DX16597
                        if self._config.pds_list_useapi:
                            self._read_source_children(entity)
                else:
                    self._logger.error(
                        "_read_source: error reading entity for container: " +
                        self._utils.get_entity_desc(container))
        else:
            self._logger.debug(
                "_read_source: skipping due to job configuration")

    def _read_space_folder(self, folder):
        self._logger.debug("_read_space_folder: processing folder: " +
                           self._utils.get_entity_desc(folder))
        if self._top_level_hierarchy_context not in ["SPACE", "HOME"]:
            return
        entity = self._get_entity_definition_by_id(folder)
        if entity is None:
            self._logger.error(
                "_read_space_folder: error reading entity for folder: " +
                self._utils.get_entity_desc(folder))
            return
        if self._top_level_hierarchy_context == "HOME" or self._filter.match_space_folder_filter(
                folder):
            self._logger.debug("_read_space_folder: " +
                               self._utils.get_entity_desc(folder))
            self._d.folders.append(entity)
            self._read_acl(entity)
            self._read_wiki(entity)
            # Validate all parent folders in the path have been saved already
            folder_path = entity['path']
            for i in range(1, len(folder_path) - 1):
                folderSaved = False
                for item in self._d.folders:
                    if item['path'][-1] == folder_path[i]:
                        folderSaved = True
                if not folderSaved:
                    parent_entity = self._get_entity_definition_by_path(
                        folder_path[0:i + 1])
                    self._d.folders.append(parent_entity)
        self._read_space_children(entity)

    def _read_space_children(self, parent_entity):
        self._logger.debug("_read_space_children: processing parent_entity: " +
                           self._utils.get_entity_desc(parent_entity))
        if 'entityType' not in parent_entity:
            self._logger.error(
                "_read_space_children: bad data, skipping entity: " +
                self._utils.get_entity_desc(parent_entity))
            return
        if 'children' not in parent_entity:
            return
        for child in parent_entity['children']:
            if child['type'] == "DATASET":
                self._read_dataset(child)
            elif child['type'] == "FILE":
                self._read_file(child)
            elif child['containerType'] == "FOLDER":
                self._read_space_folder(child)
            else:
                self._logger.error(
                    "_read_space_children: not supported entity type " +
                    child['type'])

    def _read_source_folder(self, folder):
        self._logger.debug("_read_source_folder: processing folder: " +
                           self._utils.get_entity_desc(folder))
        if self._top_level_hierarchy_context == "SOURCE" and self._filter.match_source_folder_filter(
                folder):
            entity = self._get_entity_definition_by_id(folder)
            if entity is not None:
                self._logger.debug("_read_source_folder: " +
                                   self._utils.get_entity_desc(folder))
                self._read_source_children(entity)
            else:
                self._logger.error(
                    "_read_source_folder: error reading entity for folder: " +
                    self._utils.get_entity_desc(folder))

    def _read_source_children(self, parent_entity):
        self._logger.debug(
            "_read_source_children: processing parent entity '" +
            self._utils.get_entity_desc(parent_entity) + "'")
        if 'entityType' not in parent_entity:
            self._logger.error(
                "_read_source_children: bad data, skipping entity: " +
                self._utils.get_entity_desc(parent_entity))
            return
        if 'children' not in parent_entity:
            return
        for child in parent_entity['children']:
            if child['type'] == "DATASET":
                self._read_dataset(child)
            elif child['type'] == "FILE":
                self._read_file(child)
            elif child['containerType'] == "FOLDER":
                self._read_source_folder(child)
            else:
                self._logger.error(
                    "_read_source_children: not supported entity type " +
                    child['type'])

    def _read_dataset(self, dataset):
        self._logger.debug("_read_dataset: processing dataset: " +
                           self._utils.get_entity_desc(dataset))
        entity = self._get_entity_definition_by_id(dataset)
        if entity is not None:
            self._logger.debug("_read_dataset: " + dataset['datasetType'] +
                               " : " + self._utils.get_entity_desc(dataset))
            if dataset['datasetType'] == "PROMOTED" or dataset[
                    'datasetType'] == "DIRECT":
                self._d.pds_list.append(entity)
            elif dataset['datasetType'] == "VIRTUAL":
                tags = self._dremio_env.get_catalog_tags(entity['id'])
                if self._filter.match_vds_filter(dataset, tags=tags):
                    self._d.vds_list.append(entity)
            else:
                self._logger.error("_read_dataset: Unexpected dataset type " +
                                   dataset['datasetType'] + " for " +
                                   self._utils.get_entity_desc(dataset) + ".")
            self._read_acl(entity)
            self._read_wiki(entity)
            self._read_tags(entity)

    def _read_file(self, file_name):
        # do nothing
        return

    def _read_reflections(self):
        self._logger.debug("_read_reflections: starting")
        if self._config.reflection_process_mode == 'process' and not self._config.source_ce:
            reflections = self._dremio_env.list_reflections()['data']
            for reflection in reflections:
                reflection_dataset = self._dremio_env.get_catalog_entity_by_id(
                    reflection['datasetId'])
                if reflection_dataset is None:
                    self._logger.debug(
                        "_read_reflections: error processing reflection, cannot get path for dataset: "
                        + reflection['datasetId'])
                    continue
                reflection_path = reflection_dataset['path']
                self._logger.debug(
                    "_read_reflections: processing reflection " +
                    reflection['datasetId'] + " path: " + str(reflection_path))
                reflection["path"] = reflection_path
                self._d.reflections.append(reflection)
                # self._read_acl(reflection)
                # self._read_wiki(reflection)
        else:
            self._logger.debug(
                "_read_reflections: skipping reflections processing as per job configuration"
            )

    # Note, tags are only available for datasets
    def _read_tags(self, entity):
        self._logger.debug("_read_tags: for entity " +
                           self._utils.get_entity_desc(entity))
        if self._config.tag_process_mode == 'process':
            tag = self._dremio_env.get_catalog_tags(entity['id'])
            if tag is not None:
                tag['entity_id'] = entity['id']
                if entity['entityType'] == 'space' or entity[
                        'entityType'] == 'source':
                    tag['path'] = [entity['name']]
                else:
                    tag['path'] = entity['path']
                if tag not in self._d.tags:
                    self._d.tags.append(tag)
        else:
            self._logger.debug(
                "_read_tags: skipping tags processing as per job configuration"
            )

    def _read_wiki(self, entity):
        self._logger.debug("_read_wiki: for entity " +
                           self._utils.get_entity_desc(entity))
        if self._config.wiki_process_mode == 'process':
            wiki = self._dremio_env.get_catalog_wiki(entity['id'])
            if wiki is not None:
                wiki["entity_id"] = entity['id']
                if entity['entityType'] == 'space' or entity[
                        'entityType'] == 'source' or entity[
                            'entityType'] == 'home':
                    wiki['path'] = [entity['name']]
                else:
                    wiki['path'] = entity['path']
                if wiki not in self._d.wikis:
                    self._d.wikis.append(wiki)
        else:
            self._logger.debug(
                "_read_wiki: skipping wiki processing as per job configuration"
            )

    def _read_acl(self, entity):
        self._logger.debug("_read_acl: for entity " +
                           self._utils.get_entity_desc(entity))
        if 'accessControlList' in entity:
            acl = entity['accessControlList']
            if 'users' in acl:
                for user in acl['users']:
                    user_entity = self._dremio_env.get_user(user['id'])
                    if user_entity is not None:
                        if user_entity not in self._d.referenced_users:
                            self._d.referenced_users.append(user_entity)
            if 'groups' in acl:
                for group in acl['groups']:
                    group_entity = self._dremio_env.get_group(group['id'])
                    if group_entity is not None:
                        if group_entity not in self._d.referenced_groups:
                            self._d.referenced_groups.append(group_entity)

    def _process_vds_dependencies(self):
        if self._config.vds_dependencies_process_mode == 'get':
            for vds in self._d.vds_list:
                self._discover_dependencies(vds)
            for vds in self._d.vds_list:
                self._populate_dependencies_graph(vds)

    # Discovers dependencies for the passed dataset and adds them to the self._d.vds_list
    def _discover_dependencies(self, dataset):
        self._logger.debug("_discover_dependencies: processing dataset: " +
                           self._utils.get_entity_desc(dataset))
        if dataset is not None:
            if 'type' not in dataset:
                self._logger.error(
                    "_discover_dependencies: Expected Dataset Entity but got: "
                    + self._utils.get_entity_desc(dataset))
                return
            if dataset['type'] == 'PHYSICAL_DATASET':
                if dataset not in self._d.pds_list:
                    self._d.pds_list.append(dataset)
                return
            elif dataset['type'] == 'VIRTUAL_DATASET':
                if dataset not in self._d.vds_list:
                    self._d.vds_list.append(dataset)
                # Process VDS dependencies
                sql_dependency_paths = self._get_vds_dependency_paths(dataset)
                for dependency_path in sql_dependency_paths:
                    dependency_path = self._utils.get_absolute_path(
                        dependency_path, self._utils.get_sql_context(dataset))
                    entity = self._find_entity(dependency_path)
                    if entity is not None:
                        # Entity has already been read; check the next dependency
                        continue
                    dependency_dataset = self._dremio_env.get_catalog_entity_by_path(
                        dependency_path)
                    if dependency_dataset is None:
                        self._logger.warn(
                            "_discover_dependencies: unable to resolve dataset likely due to datasource availability: "
                            + dependency_path)
                    else:
                        self._discover_dependencies(dependency_dataset)
            else:
                self._logger.error(
                    "_discover_dependencies: Unknown Entity Type: " +
                    dataset['type'])
        else:
            self._logger.error(
                "_discover_dependencies: Could not resolve dependency: None")

    def _populate_dependencies_graph(self, vds):
        self._logger.debug("_populate_dependencies_graph: processing vds: " +
                           self._utils.get_entity_desc(vds))
        vds_parent_list = self._get_vds_dependency_paths(vds)
        vds_parent_json = {
            'id': vds['id'],
            'path': vds['path'],
            'parents': vds_parent_list
        }
        if not self._config.source_ce and self._config.source_graph_support:
            self._d.vds_parents.append(vds_parent_json)

    def _get_vds_dependency_paths(self, vds):
        self._logger.debug("_get_vds_dependency_paths: processing vds: " +
                           self._utils.get_entity_desc(vds))
        if self._config.source_ce or not self._config.source_graph_support:
            return parse_sql.tables_in_query(vds['sql'])
        else:
            graph = self._dremio_env.get_catalog_entity_graph_by_id(vds['id'])
            if graph is None:
                self._logger.warn(
                    "Could not receive Graph via API. Try to set graph_api_support to False in the job configuration."
                )
                return parse_sql.tables_in_query(vds['sql'])
            vds_parent_list = []
            for parent in graph['parents']:
                vds_parent_list.append(
                    self._utils.normalize_path(parent['path']))
            return vds_parent_list

    def _find_entity(self, path):
        self._logger.debug("_find_entity: processing path: " + str(path))
        for vds in self._d.vds_list:
            if self._utils.normalize_path(vds['path']) == path:
                return vds
        for pds in self._d.pds_list:
            if self._utils.normalize_path(pds['path']) == path:
                return pds

    # Helper method, used by most read* methods
    def _get_entity_definition_by_id(self, src):
        self._logger.debug("_get_entity_definition_by_id: processing src: " +
                           self._utils.get_entity_desc(src))
        if 'id' not in src:
            self._logger.error(
                "_read_entity_definition: bad data, skipping entity: " +
                self._utils.get_entity_desc(src))
            return None
        else:
            entity = self._dremio_env.get_catalog_entity_by_id(src['id'])
            if entity is None:
                self._logger.error(
                    "_read_entity_definition: cannot retrieve entity for id: "
                    + src['id'])
            return entity

    def _get_entity_definition_by_path(self, path):
        self._logger.debug(
            "_get_entity_definition_by_path: processing path: " + str(path))
        path = self._utils.normalize_path(path)
        entity = self._dremio_env.get_catalog_entity_by_path(path)
        if entity is None:
            self._logger.error(
                "_read_entity_definition: cannot retrieve entity for path: " +
                str(path))
        return entity

    def _read_queues(self):
        self._logger.debug("read_queues: started")
        if self._config.wlm_queue_process_mode == 'process' and not self._config.source_ce:
            self._d.queues = self._dremio_env.list_queues()['data']
        else:
            self._logger.debug(
                "_read_queues: skipping as per job configuration")

    def _read_rules(self):
        self._logger.debug("read_rules: started")
        if self._config.wlm_rule_process_mode == 'process' and not self._config.source_ce:
            self._d.rules = self._dremio_env.list_rules()['rules']
        else:
            self._logger.debug("read_rules: skipping as per job configuration")

    def _read_votes(self):
        self._logger.debug("read_votes: started")
        if self._config.vote_process_mode == 'process' and not self._config.source_ce:
            self._d.votes = self._dremio_env.list_votes()['data']
        else:
            self._logger.debug("read_votes: skipping as per job configuration")

    def get_errors_count(self):
        return self._logger.errors_encountered
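
A usage sketch for DremioReader follows. The configuration and environment class names (DremioClonerConfig, Dremio) and the environment constructor arguments are assumptions for illustration; only the DremioReader calls are confirmed by the code above.

# Usage sketch; DremioClonerConfig, Dremio, and the Dremio(...) signature
# are assumptions, not confirmed by this listing.
config = DremioClonerConfig("dremio_cloner.conf")
source_env = Dremio(config.source_endpoint, config.source_username,
                    config.source_password, config.source_verify_ssl)
reader = DremioReader(source_env, config)
dremio_data = reader.read_dremio_environment()
print("errors encountered: " + str(reader.get_errors_count()))
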
Example #9
class DremioCascadeAcl:

    # Dremio Cloner Config, Logger, Utils
    _config = None
    _logger = None
    _utils = None
    _filter = None

    # Dremio Environment to write to
    _dremio_env = None

    # List of PDS for processing
    _pds_list = None

    def __init__(self, dremio, config):
        self._config = config
        self._dremio_env = dremio
        self._logger = DremioClonerLogger(self._config.max_errors,
                                          self._config.logging_verbose)
        self._utils = DremioClonerUtils(config)
        self._filter = DremioClonerFilter(config)

    def cascade_acl(self):
        if not self._config.pds_list_useapi:
            self._pds_list = self._dremio_env.list_pds(
                self._config.source_filter, self._config.source_exclude_filter,
                self._config.source_folder_filter,
                self._config.source_folder_exclude_filter,
                self._config.pds_filter, self._config.pds_exclude_filter)
            self._logger.info(
                "cascade_acl: Not using API for PDS retrieval. Filtered PDS are NOT reported in the log."
            )
        containers = self._dremio_env.list_catalog()['data']
        for container in containers:
            self._logger.debug("cascade_acl: processing container " +
                               self._utils.get_entity_desc(container))
            if container[
                    'containerType'] == "SPACE" and self._filter.match_space_filter(
                        container):
                self._process_space(container)
            elif container[
                    'containerType'] == "SOURCE" and self._filter.match_source_filter(
                        container):
                self._process_source(container)

    def _process_space(self, space):
        entity = self._get_entity_definition(space)
        if entity is None:
            self._logger.error(
                "_process_space: error reading entity for container: " +
                self._utils.get_entity_desc(space))
        else:
            if self._config.space_cascade_acl_origin_override_object is None:
                # Use Space ACL as an 'origin'
                self._logger.info(
                    "_process_space: SPACE: '" + str(space['path']) +
                    "' will be used as an ACL Origin for its children FOLDERs and VDSs."
                )
                acl = self._get_acl(entity)
            else:
                # Use ACL from a configured object
                acl_entity = self._dremio_env.get_catalog_entity_by_path(
                    self._config.space_cascade_acl_origin_override_object)
                if acl_entity is None:
                    self._logger.error(
                        "_process_space: error reading origin entity for path: "
                        + str(self._config.
                              space_cascade_acl_origin_override_object))
                    return
                self._logger.info(
                    "_process_space: SPACE: '" + str(space['path']) +
                    "' will use the configured override object as the ACL Origin for its children FOLDERs and VDSs."
                )
                acl = self._get_acl(acl_entity)
            self._process_space_children(entity, acl)

    def _process_source(self, source):
        entity = self._get_entity_definition(source)
        if entity is None:
            self._logger.error(
                "_process_source: error reading entity for container: " +
                self._utils.get_entity_desc(source))
        else:
            if self._config.source_cascade_acl_origin_override_object is None:
                # Use Source ACL as an 'origin'
                self._logger.info(
                    "_process_source: SOURCE: '" + str(source['path']) +
                    "' will be used as an ACL Origin for its children PDSs.")
                acl = self._get_acl(entity)
            else:
                # Use ACL from a configured object
                acl_entity = self._dremio_env.get_catalog_entity_by_path(
                    self._config.source_cascade_acl_origin_override_object)
                if acl_entity is None:
                    self._logger.error(
                        "_process_source: error reading origin entity for path: "
                        + str(self._config.
                              source_cascade_acl_origin_override_object))
                    return
                self._logger.info(
                    "_process_source: SOURCE: '" + str(source['path']) +
                    "' will use the configured override object as the ACL Origin for its children PDSs."
                )
                acl = self._get_acl(acl_entity)
            # Process PDSs
            if self._config.pds_list_useapi:
                self._process_source_children(entity, acl)
            else:
                for pds in self._pds_list:
                    # Does the PDS belong to the current Source
                    if pds['path'][0] == source['path'][0]:
                        self._logger.debug("_process_source: pds: " +
                                           self._utils.get_entity_desc(pds))
                        if self._filter.match_pds_filter(pds):
                            self._logger.debug(
                                "_process_source_children: applying ACL to PDS: "
                                + self._utils.get_entity_desc(pds))
                            self._apply_acl(pds, acl)

    def _process_source_children(self, parent_entity, acl):
        # This is a recursive function
        if 'children' not in parent_entity:
            return
        if 'entityType' not in parent_entity:
            self._logger.error(
                "_process_source_children: bad data, skipping entity: " +
                self._utils.get_entity_desc(parent_entity))
            return
        self._logger.debug(
            "_process_source_children: processing parent entity '" +
            self._utils.get_entity_desc(parent_entity) + "'")
        for child in parent_entity['children']:
            child_entity = self._get_entity_definition(child)
            if child_entity is None:
                self._logger.error(
                    "_process_source_children: error reading entity for: " +
                    self._utils.get_entity_desc(child))
                continue
            if child['type'] == "DATASET":
                if self._filter.match_pds_filter(child_entity):
                    self._logger.debug(
                        "_process_source_children: applying ACL to PDS: " +
                        self._utils.get_entity_desc(child_entity))
                    self._apply_acl(child_entity, acl)
                else:
                    self._logger.info(
                        "_process_source_children: skipping PDS: " +
                        str(child_entity['path']) +
                        "as per filter configuration")
            elif child['type'] == "FILE":
                self._logger.info("_process_source_children: skipping FILE: " +
                                  self._utils.get_entity_desc(child_entity))
            elif 'containerType' in child and child[
                    'containerType'] == "FOLDER":
                if self._filter.match_source_folder_filter(child_entity):
                    self._process_source_children(child_entity, acl)
                else:
                    self._logger.info(
                        "_process_source_children: skipping FOLDER: " +
                        str(child_entity['path']) +
                        "as per filter configuration")

    def _process_space_children(self, parent_entity, acl):
        # This is a recursive function
        if 'children' not in parent_entity:
            return
        if 'entityType' not in parent_entity:
            self._logger.error(
                "_process_space_children: bad data, skipping entity: " +
                self._utils.get_entity_desc(parent_entity))
            return
        self._logger.debug(
            "_process_space_children: processing parent entity '" +
            self._utils.get_entity_desc(parent_entity) + "'")
        for child in parent_entity['children']:
            child_entity = self._get_entity_definition(child)
            if child_entity is None:
                self._logger.error(
                    "_process_space_children: error reading entity for: " +
                    self._utils.get_entity_desc(child))
                continue
            if child['type'] == "DATASET":
                if self._filter.match_vds_filter(child_entity):
                    self._logger.debug(
                        "_process_space_children: applying ACL to VDS: " +
                        self._utils.get_entity_desc(child_entity))
                    self._apply_acl(child_entity, acl)
                else:
                    self._logger.info(
                        "_process_space_children: skipping VDS: " +
                        self._utils.get_entity_desc(child_entity))
            elif child['containerType'] == "FOLDER":
                if self._filter.match_space_folder_filter(child_entity):
                    if self._filter.match_space_folder_cascade_acl_origin_filter(
                            child_entity):
                        self._logger.info(
                            "_process_space_children: FOLDER: " +
                            str(child_entity['path']) +
                            " will be used as an ACL Origin for its children.")
                        self._process_space_children(
                            child_entity, self._get_acl(child_entity))
                    else:
                        self._logger.info(
                            "_process_space_children: applying ACL to FOLDER: "
                            + self._utils.get_entity_desc(child_entity))
                        self._apply_acl(child_entity, acl)
                        self._process_space_children(child_entity, acl)
                else:
                    self._logger.info(
                        "_process_space_children: skipping FOLDER: " +
                        self._utils.get_entity_desc(child_entity))
                    self._process_space_children(child_entity, acl)

    def _get_entity_definition(self, src):
        if 'id' not in src:
            self._logger.error(
                "_read_entity_definition: bad data, skipping entity: " +
                self._utils.get_entity_desc(src))
            return None
        else:
            entity = self._dremio_env.get_catalog_entity_by_id(src['id'])
            if entity is None:
                self._logger.error(
                    "_read_entity_definition: cannot retrieve entity for id: "
                    + src['id'])
            return entity

    def _get_acl(self, entity):
        if 'accessControlList' in entity:
            return entity['accessControlList']
        else:
            self._logger.fatal("ACL is not defined for " +
                               self._utils.get_entity_desc(entity))
            return None

    def _apply_acl(self, entity, acl):
        # Clear the current ACL definition
        if 'accessControlList' not in entity:
            entity['accessControlList'] = {"version": "0"}
        if 'users' in entity['accessControlList']:
            entity['accessControlList'].pop('users')
        if 'groups' in entity['accessControlList']:
            entity['accessControlList'].pop('groups')
        # Apply ACL to entity
        if 'users' in acl:
            entity['accessControlList']['users'] = acl['users']
        if 'groups' in acl:
            entity['accessControlList']['groups'] = acl['groups']
        if self._config.dry_run:
            self._logger.warn("_apply_acl: Dry Run, NOT Updating entity: " +
                              self._utils.get_entity_desc(entity))
            return False
        self._logger.info("_apply_acl: updating entity: " +
                          self._utils.get_entity_desc(entity))
        updated_entity = self._dremio_env.update_catalog_entity(
            entity['id'], entity, self._config.dry_run)
        if updated_entity is None:
            self._logger.error("_apply_acl: Error updating entity: " +
                               self._utils.get_entity_desc(entity))
            return False
        return True

    def get_errors_count(self):
        return self._logger.errors_encountered
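
To illustrate _apply_acl above: it is a plain dict rewrite in which any users/groups entries already on the entity are dropped and the origin's entries are copied in wholesale. All data below is made up.

# Before: the entity carries its own ACL entries (illustrative data).
entity = {"id": "abc", "accessControlList": {
    "version": "0",
    "users": [{"id": "old-user", "permissions": ["READ"]}]}}
origin_acl = {"users": [{"id": "admin-user", "permissions": ["READ", "WRITE"]}],
              "groups": [{"id": "analysts", "permissions": ["READ"]}]}
# After _apply_acl(entity, origin_acl), entity["accessControlList"] holds
# only the origin's entries:
# {"version": "0",
#  "users": [{"id": "admin-user", "permissions": ["READ", "WRITE"]}],
#  "groups": [{"id": "analysts", "permissions": ["READ"]}]}
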
Example #10
class DremioDescribeJob:

    # Dremio Cloner Configuration, Utils, ...
    _config = None
    _utils = None
    _logger = None

    # Dremio Environment to write to
    _dremio_env = None

    # Working lists
    _pds_list = None
    _vds_list = None
    _final_sql = ""

    def __init__(self, source_dremio, config):
        self._config = config
        self._dremio_env = source_dremio
        # Working lists are created per instance
        self._pds_list = []
        self._vds_list = []
        self._final_sql = ""
        self._logger = DremioClonerLogger(self._config.max_errors,
                                          self._config.logging_verbose)
        self._utils = DremioClonerUtils(config)

    def describe_job_sql_dependencies(self):
        sql = self._config.job_sql
        self._process_sql(sql)

    # Collects all dependencies for the given SQL and writes the output file
    def _process_sql(self, sql, sql_context=None):
        if sql_context is not None:
            schema = self._utils.normalize_path(sql_context) + "/"
        else:
            schema = ""
        paths = parse_sql.tables_in_query(sql)
        # Collect all PDS and VDS with the entire dependency hierarchy
        for path in paths:
            self._discover_dependencies(schema + path)
        # Create SQL statements for all dependencies
        for pds in self._pds_list:
            self._process_pds(pds)
        for vds in self._vds_list:
            self._process_vds(vds)
        # Write file
        self._write_file()

    def _discover_dependencies(self, path):
        dataset = self._dremio_env.get_catalog_entity_by_path(path)
        if dataset is None:
            self._logger.fatal(
                "_discover_dependencies: Could not resolve dependency: " +
                path)
            return
        if dataset['type'] == 'VIRTUAL_DATASET':
            self._vds_list.append(dataset)
        elif dataset['type'] == 'PHYSICAL_DATASET':
            self._pds_list.append(dataset)
            return
        else:
            self._logger.fatal(
                "_discover_dependencies: Unknown Entity Type: " +
                dataset['type'])
            return
        # Process recursive dependencies
        sql_dependency_paths = parse_sql.tables_in_query(dataset['sql'])
        for dataset_dependency_path in sql_dependency_paths:
            sql_context = self._utils.get_sql_context(dataset)
            self._discover_dependencies(
                self._utils.get_absolute_path(dataset_dependency_path,
                                              sql_context))

    def _process_pds(self, pds):
        fields = pds['fields']
        sql_context = self._utils.get_sql_context(pds)
        name = pds['path'][-1:][0]
        stmt = 'CREATE TABLE ' + name + ' ('
        for field in fields:
            stmt = stmt + field['name'] + ' ' + field['type']['name'] + ', '
        stmt = stmt[:-2] + ')'
        comment = '-- PDS: ' + self._utils.get_absolute_path(
            pds['path'], sql_context)
        self._final_sql = self._final_sql + comment + "\n" + stmt + ";\n\n"

    def _process_vds(self, vds):
        fields = vds['fields']
        sql_context = self._utils.get_sql_context(vds)
        name = vds['path'][-1:][0]
        vds_sql = vds['sql']
        stmt = 'CREATE VIEW ' + name + ' AS ' + vds_sql
        comment = '-- VDS: ' + self._utils.get_absolute_path(
            vds['path'], sql_context)
        self._final_sql = self._final_sql + comment + "\n" + stmt + ";\n\n"

    def _write_file(self):
        f = open(self._config.target_filename, "w")
        f.write(self._final_sql)
        f.close()

    def get_errors_count(self):
        return self._logger.errors_encountered
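
For orientation, the SQL file assembled by _process_pds and _process_vds looks like the sketch below. Dataset paths, names, and columns are made up; only the '-- PDS:'/'-- VDS:' comment format and the CREATE TABLE/CREATE VIEW shape come from the code above.

-- PDS: source_name.folder.orders
CREATE TABLE orders (order_id BIGINT, amount DOUBLE);

-- VDS: space_name.views.big_orders
CREATE VIEW big_orders AS SELECT * FROM orders WHERE amount > 100;
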
Example #11
class DremioReportReflections:

    # Dremio Cloner Configuration, Utils, ...
    _config = None
    _utils = None
    _logger = None

    # Dremio object pointing to the source Dremio environment
    _dremio_env = None

    # Misc
    _delimeter = None
    _newline = None
    _report_reflections = None

    def __init__(self, source_dremio, config):
        self._config = config
        self._dremio_env = source_dremio
        # Report accumulator is created per instance
        self._report_reflections = []
        self._delimeter = self._config.report_csv_delimiter
        self._newline = self._config.report_csv_newline
        self._logger = DremioClonerLogger(self._config.max_errors,
                                          self._config.logging_verbose)
        self._utils = DremioClonerUtils(config)

    def process_dremio_reflections(self):
        _query_reflections = self._retrieve_reflections()
        for query_reflection in _query_reflections:
            api_reflection = self._dremio_env.get_reflection(
                query_reflection['REFLECTION_ID'])
            dataset_entity = self._dremio_env.get_catalog_entity_by_path(
                self._normalize_dataset_path(query_reflection['DATASET']))
            if dataset_entity is None:
                self._logger.error(
                    "process_dremio_reflections: unable to retrieve dataset from API: "
                    + query_reflection['DATASET'])
                source_pds_list = []
            else:
                graph = self._dremio_env.get_catalog_entity_graph_by_id(
                    dataset_entity['id'])
                if graph is None:
                    self._logger.error(
                        "process_dremio_reflections: unable to retrieve Graph for dataset from API: "
                        + query_reflection['DATASET'])
                    source_pds_list = []
                elif len(graph['parents']) == 0:
                    source_pds_list = [
                        self._utils.normalize_path(dataset_entity['path'])
                    ]
                else:
                    source_pds_list = list(
                        self._get_dependency_pds_list(graph['parents']))
            self._report_reflections.append({
                'ID': query_reflection['REFLECTION_ID'],
                'NAME': query_reflection['NAME'],
                'STATUS': query_reflection['STATUS'],
                'TYPE': query_reflection['TYPE'],
                'DATASET_PATH': query_reflection['DATASET'],
                'MEASURES': query_reflection['measures'],
                'DIMENSIONS': query_reflection['dimensions'],
                'DISPLAY_COLUMNS': query_reflection['displayColumns'],
                'SORT_COLUMNS': query_reflection['sortColumns'],
                'PARTITION_COLUMNS': query_reflection['partitionColumns'],
                'DISTRIBUTION_COLUMNS': query_reflection['distributionColumns'],
                'EXTERNAL_REFLECTION': query_reflection['externalReflection'],
                'NUM_FAILURES': query_reflection['NUM_FAILURES'],
                'STATUS_EXTENDED':
                    '' if api_reflection is None else api_reflection['status'],
                'TOTAL_SIZE_BYTES':
                    '' if api_reflection is None
                    else api_reflection['totalSizeBytes'],
                'ENABLED':
                    '' if api_reflection is None else api_reflection['enabled'],
                'PARTITION_DISTRIBUTION_STRATEGY':
                    '' if api_reflection is None
                    else api_reflection['partitionDistributionStrategy'],
                'CREATED_AT':
                    '' if api_reflection is None
                    else api_reflection['createdAt'],
                'UPDATED_AT':
                    '' if api_reflection is None
                    else api_reflection['updatedAt'],
                'SOURCE_PDS_LIST': source_pds_list
            })
        self.save_dremio_report_reflections()

    def _retrieve_reflections(self):
        sql = 'SELECT REFLECTION_ID, NAME, TYPE, STATUS, NUM_FAILURES, DATASET, sortColumns, partitionColumns, distributionColumns, dimensions, measures, displayColumns, externalReflection FROM SYS.REFLECTIONS '
        jobid = self._dremio_env.submit_sql(sql)
        # Wait for the job to complete. Should only take a moment
        while True:
            job_info = self._dremio_env.get_job_info(jobid)
            if job_info is None:
                self._logger.fatal(
                    "_retrieve_reflections: unexpected error. Cannot get a list of Reflections."
                )
            self._logger.debug(
                "_retrieve_reflections: waiting for SQL query to finish. Job status: "
                + job_info["jobState"])
            if job_info["jobState"] in ['CANCELED', 'FAILED']:
                self._logger.fatal(
                    "_retrieve_reflections: unexpected error, SQL job failed. Cannot get a list of Reflections."
                )
            if job_info["jobState"] == 'COMPLETED':
                break
            time.sleep(1)
        # Retrieve the list of Reflections
        job_result = self._dremio_env.get_job_result(jobid)
        num_rows = int(job_result['rowCount'])
        if num_rows == 0:
            self._logger.warn("_retrieve_reflections: no Reflections found.")
            return []
        self._logger.debug("_retrieve_reflections: processing " +
                           str(num_rows) + " Reflections in batches of 100.")
        # Page through the results, 100 rows per page
        limit = 100
        reflections = []
        for i in range(0, int(num_rows / limit) + 1):
            self._logger.debug("_retrieve_reflections: processing batch " +
                               str(i + 1))
            job_result = self._dremio_env.get_job_result(
                jobid, limit * i, limit)
            for row in job_result['rows']:
                reflections.append(row)
        return reflections
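    # Worked example of the paging above (illustrative): with num_rows = 250
    # and limit = 100, int(250 / 100) + 1 = 3 batches are fetched at offsets
    # 0, 100 and 200, the last one returning the remaining 50 rows. When
    # num_rows is an exact multiple of 100, the final batch is simply empty.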

    def _get_dependency_pds_list(self, parents):
        pds_set = set()
        for dataset in parents:
            if dataset['datasetType'] in ('PROMOTED', 'DIRECT'):
                pds_set.add(self._utils.normalize_path(dataset['path']))
            elif dataset['datasetType'] == 'VIRTUAL':
                graph = self._dremio_env.get_catalog_entity_graph_by_id(
                    dataset['id'])
                pds_set |= self._get_dependency_pds_list(graph['parents'])
            else:
                self._logger.fatal(
                    "_get_dependency_pds_list: unexpected entity type " +
                    dataset['datasetType'])
        return pds_set

    def _get_optimization_confidence_pct(self, reflection):
        if len(reflection['SOURCE_PDS_LIST']) == 0:
            return 0
        max_match_count = 0
        for r in self._report_reflections:
            # Match only with another reflection of the same TYPE (RAW/AGGREGATION)
            if r == reflection or r['TYPE'] != reflection['TYPE']:
                continue
            match_count = 0
            for s in r['SOURCE_PDS_LIST']:
                if s in reflection['SOURCE_PDS_LIST']:
                    match_count = match_count + 1
            if match_count > max_match_count:
                max_match_count = match_count
        return max_match_count * 100 / len(reflection['SOURCE_PDS_LIST'])
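    # Worked example (illustrative): if a RAW reflection is built from PDS
    # {A, B, C, D} and the best-overlapping other RAW reflection is built
    # from {A, B, C, X}, then max_match_count = 3 and the confidence is
    # 3 * 100 / 4 = 75, i.e. 75% of this reflection's source PDS set is
    # already covered by another reflection of the same type.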

    def save_dremio_report_reflections(self):
        header_columns = [
            'REFLECTION_ID', 'NAME', 'STATUS', 'TYPE',
            'OPTIMIZATION_CONFIDENCE_PCT', 'DATASET_PATH', 'MEASURES',
            'DIMENSIONS', 'DISPLAY_COLUMNS', 'SORT_COLUMNS',
            'PARTITION_COLUMNS', 'DISTRIBUTION_COLUMNS',
            'EXTERNAL_REFLECTION', 'NUM_FAILURES', 'STATUS_EXTENDED',
            'TOTAL_SIZE_BYTES', 'ENABLED', 'PARTITION_DISTRIBUTION_STRATEGY',
            'CREATED_AT', 'UPDATED_AT', 'SOURCE_PDS_LIST'
        ]
        with open(self._config.target_filename, "w") as f:
            f.write(self._delimeter.join(header_columns) + self._newline)
            for reflection in self._report_reflections:
                row_values = [
                    reflection['ID'], reflection['NAME'],
                    reflection['STATUS'], reflection['TYPE'],
                    self._get_optimization_confidence_pct(reflection),
                    reflection['DATASET_PATH'], reflection['MEASURES'],
                    reflection['DIMENSIONS'], reflection['DISPLAY_COLUMNS'],
                    reflection['SORT_COLUMNS'],
                    reflection['PARTITION_COLUMNS'],
                    reflection['DISTRIBUTION_COLUMNS'],
                    reflection['EXTERNAL_REFLECTION'],
                    reflection['NUM_FAILURES'], reflection['STATUS_EXTENDED'],
                    reflection['TOTAL_SIZE_BYTES'], reflection['ENABLED'],
                    reflection['PARTITION_DISTRIBUTION_STRATEGY'],
                    reflection['CREATED_AT'], reflection['UPDATED_AT'],
                    reflection['SOURCE_PDS_LIST']
                ]
                f.write(self._delimeter.join(str(v) for v in row_values) +
                        self._newline)

    def _normalize_dataset_path(self, path):
        path = path.split('.')
        normalized_path = ""
        for i in range(0, len(path)):
            if path[i].startswith('"') and path[i].endswith('"'):
                normalized_path = normalized_path + path[i][1:-1]
            else:
                normalized_path = normalized_path + path[i]
            if normalized_path.startswith('"') and normalized_path.endswith(
                    '"'):
                normalized_path = normalized_path[1:-1]
            entity = self._dremio_env.get_catalog_entity_by_path(
                normalized_path, report_error=False)
            if entity is not None:
                normalized_path = normalized_path + '/'
            else:
                normalized_path = normalized_path + '.'
        return normalized_path[:-1]
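    # Illustrative sketch of the normalization above (assumed inputs): the
    # SYS.REFLECTIONS dataset string '"My Source".sales.totals' is split on
    # '.', each segment is unquoted, and the catalog is probed after every
    # segment; separators become '/' where the prefix resolves to a catalog
    # entity and stay '.' otherwise, e.g. yielding 'My Source/sales/totals'.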
Example #12
class DremioDelete:

    # Dremio Cloner Config, Logger, Utils
    _config = None
    _logger = None
    _utils = None
    _filter = None

    # Dremio Environment to write to
    _dremio_env = None

    # List of PDS for processing
    _pds_list = None

    def __init__(self, dremio, config):
        self._config = config
        self._dremio_env = dremio
        self._logger = DremioClonerLogger(self._config.max_errors,
                                          self._config.logging_verbose)
        self._utils = DremioClonerUtils(config)
        self._filter = DremioClonerFilter(config)

    def delete(self):
        # Delete VDSs
        if self._config.vds_process_mode != "delete":
            self._logger.info(
                "delete: Not deleting VDS as per 'vds.process_mode' configuration"
            )
        else:
            for vds_path in self._config.delete_vds:
                vds_json = self._dremio_env.get_catalog_entity_by_path(
                    vds_path, report_error=True)
                if vds_json is None:
                    self._logger.error(
                        "delete: unable to find VDS for path: '" + vds_path +
                        "'")
                else:
                    self._dremio_env.delete_catalog_entity(
                        vds_json["id"],
                        dry_run=self._config.dry_run,
                        report_error=True)
        # Delete Folders
        if self._config.folder_process_mode != "delete":
            self._logger.info(
                "delete: Not deleting Folders as per 'folder.process_mode' configuration"
            )
        else:
            for folder_path in self._config.delete_folders:
                folder_json = self._dremio_env.get_catalog_entity_by_path(
                    folder_path, report_error=True)
                if folder_json is None:
                    self._logger.error(
                        "delete: unable to find Folder for path: '" +
                        folder_path + "'")
                else:
                    self._dremio_env.delete_catalog_entity(
                        folder_json["id"],
                        dry_run=self._config.dry_run,
                        report_error=True)

    def get_errors_count(self):
        return self._logger.errors_encountered
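# Hypothetical usage sketch (the Dremio environment object and the
# configuration file name are assumptions, not part of this example):
#   config = DremioClonerConfig("dremio_delete.json")
#   deleter = DremioDelete(dremio_env, config)
#   deleter.delete()
#   if deleter.get_errors_count() > 0:
#       print("delete completed with errors")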
Example #13
    def __init__(self, config):
        self._config = config
        self._logger = DremioClonerLogger(self._config.max_errors,
                                          self._config.logging_verbose)
        self._utils = DremioClonerUtils(config)
Example #14
class DremioClonerFilter:

    _config = None
    _utils = None
    _logger = None

    def __init__(self, config):
        self._config = config
        self._logger = DremioClonerLogger(self._config.max_errors,
                                          self._config.logging_verbose)
        self._utils = DremioClonerUtils(config)

    def is_pds_in_scope(self):
        return self._config._source_filter_re is not None and \
            self._config._pds_filter_re is not None and \
            self._config.source_folder_exclude_filter != '*' and \
            self._config.pds_exclude_filter != '*' and \
            self._config.pds_process_mode == 'process'

    def _match_listed_space_names(self, container):
        if self._config.space_filter_names != [] and (
                ('path' in container and
                 container['path'][0] not in self._config.space_filter_names)
                or ('name' in container and
                    container['name'] not in self._config.space_filter_names)):
            return False
        return True
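    # Illustrative example: with space_filter_names == ['Marketing'], a
    # container {'path': ['Finance', 'reports']} is filtered out (False),
    # while {'name': 'Marketing'} or an empty filter list passes (True).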

    def match_space_filter(self, container, loginfo=False):
        if not self._match_listed_space_names(container):
            return False
        # Filter by space name pattern
        if self._match_path(self._config._space_filter_re,
                            self._config._space_exclude_filter_re, None, None,
                            None, None, container):
            return True
        if loginfo:
            self._logger.info(
                "match_space_filter: skipping SPACE " +
                (container['path'][0]
                 if 'path' in container else container['name']) +
                " as per job configuration")
        return False

    def match_space_folder_filter(self, container, loginfo=True):
        if not self._match_listed_space_names(container):
            return False
        if self._match_path(self._config._space_filter_re,
                            self._config._space_exclude_filter_re,
                            self._config._space_folder_filter_re,
                            self._config._space_folder_exclude_filter_re, None,
                            None, container):
            return True
        if loginfo:
            self._logger.debug(
                "match_space_folder_filter: skipping SPACE FOLDER " +
                (container['path'][0]
                 if 'path' in container else container['name']) +
                " as per job configuration")
        return False

    def match_space_folder_cascade_acl_origin_filter(self, container):
        if self._config.space_folder_cascade_acl_origin_filter is None:
            return False
        elif (  # Do not filter out folders in HOME hierarchies
            (container['path'][0][:1] == '@') or
                # Match both Folder filter and Space filter
            ((self._config._space_folder_cascade_acl_origin_filter_re.match(
                self._utils.normalize_path(container['path'][1:])) is not None)
             and self.match_space_filter(container))):
            return True
        else:
            return False

    def match_source_filter(self, container, loginfo=True):
        # First filter by source types
        if container['type'] != 'CONTAINER' and \
                self._config.source_filter_types != [] and \
                (container['entityType'] != 'source' or
                 container['type'] not in self._config.source_filter_types):
            return False
        # Also filter by source names
        if container['type'] != 'CONTAINER' and \
                self._config.source_filter_names != [] and \
                (container['entityType'] != 'source' or
                 container['name'] not in self._config.source_filter_names):
            return False
        # Finally filter by filter pattern
        if self._match_path(self._config._source_filter_re,
                            self._config._source_exclude_filter_re, None, None,
                            None, None, container):
            return True
        if loginfo:
            self._logger.debug(
                "match_source_filter: skipping SOURCE " +
                (container['path'][0]
                 if 'path' in container else container['name']) +
                " as per job configuration")
        return False

    def match_source_folder_filter(self, container, loginfo=True):
        if self._match_path(self._config._source_filter_re,
                            self._config._source_exclude_filter_re,
                            self._config._source_folder_filter_re,
                            self._config._source_folder_exclude_filter_re,
                            None, None, container):
            return True
        if loginfo:
            self._logger.debug(
                "match_source_folder_filter: skipping SOURCE FOLDER " +
                (container['path'][0]
                 if 'path' in container else container['name']) +
                " as per job configuration")
        return False

    def match_pds_filter(self, pds, loginfo=True):
        if self._match_path(self._config._source_filter_re,
                            self._config._source_exclude_filter_re,
                            self._config._source_folder_filter_re,
                            self._config._source_folder_exclude_filter_re,
                            self._config._pds_filter_re,
                            self._config._pds_exclude_filter_re, pds):
            return True
        if loginfo:
            self._logger.debug(
                "match_pds_filter: skipping PDS " +
                (pds['path'][-1] if 'path' in pds else pds['name']) +
                " as per job configuration")
        return False

    def match_vds_filter(self, vds, tags=None, loginfo=True):
        if not self._match_listed_space_names(vds):
            return False
        if self._match_path(self._config._space_filter_re,
                            self._config._space_exclude_filter_re,
                            self._config._space_folder_filter_re,
                            self._config._space_folder_exclude_filter_re,
                            self._config._vds_filter_re,
                            self._config._vds_exclude_filter_re, vds):
            if self._config.vds_filter_tag is None or self._config.vds_filter_tag == "*":
                return True
            elif tags is not None and self._match_tag(tags):
                return True
        if loginfo:
            self._logger.debug(
                "match_vds_filter: skipping VDS " +
                (vds['path'][-1] if 'path' in vds else vds['name']) +
                " as per job configuration")
        return False

    def _match_tag(self, tags):
        if 'tags' not in tags:
            return False
        for tag in tags['tags']:
            if tag == self._config.vds_filter_tag:
                return True
        return False

    def match_reflection_path(self, reflection_path, reflection_dataset):
        if reflection_dataset.get('type') == 'VIRTUAL_DATASET':
            if self._match_hierarchy_path(
                    self._config._space_filter_re,
                    self._config._space_exclude_filter_re,
                    self._config._space_folder_filter_re,
                    self._config._space_folder_exclude_filter_re,
                    self._config._vds_filter_re,
                    self._config._vds_exclude_filter_re, reflection_path):
                return True
        else:
            if self._match_hierarchy_path(
                    self._config._source_filter_re,
                    self._config._source_exclude_filter_re,
                    self._config._source_folder_filter_re,
                    self._config._source_folder_exclude_filter_re,
                    self._config._pds_filter_re,
                    self._config._pds_exclude_filter_re, reflection_path):
                return True
        return False

    def _match_hierarchy_path(self, root_re, root_exclusion_re, folder_re,
                              folder_exclusion_re, object_re,
                              object_exclusion_re, hierarchy_path):
        if root_re is None:
            return False
        # Match root object (Space or Source)
        if root_re.match(hierarchy_path[0]) is None:
            return False
        if root_exclusion_re is not None and root_exclusion_re.match(
                hierarchy_path[0]) is not None:
            return False
        # Match object
        if object_re is not None and object_re.match(
                self._utils.normalize_path(hierarchy_path[-1])) is None:
            return False
        if object_exclusion_re is not None and object_exclusion_re.match(
                self._utils.normalize_path(hierarchy_path[1:])) is not None:
            return False
        # Match folders. Note: child folders do not need to match if a parent already matches.
        if folder_re is None:
            return False
        else:
            folder_matched = False
            for i in range(len(hierarchy_path)):
                if folder_re.match(
                        self._utils.normalize_path(
                            hierarchy_path[1:len(hierarchy_path) -
                                           i])) is not None:
                    folder_matched = True
                    break
            if not folder_matched:
                return False
            if folder_exclusion_re is not None:
                folder_exclusion_matched = False
                for i in range(len(hierarchy_path)):
                    if folder_exclusion_re.match(
                            self._utils.normalize_path(
                                hierarchy_path[1:len(hierarchy_path) -
                                               i])) is not None:
                        folder_exclusion_matched = True
                        break
                if folder_exclusion_matched:
                    return False
        return True
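    # Illustrative example of the folder matching above: for hierarchy_path
    # ['Space', 'folderA', 'folderB', 'vds1'] the loop tests the slices
    # ['folderA', 'folderB', 'vds1'], ['folderA', 'folderB'], ['folderA']
    # and [] against folder_re, so a match on any ancestor folder suffices.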

    def _match_path(self, root_re, root_exclusion_re, folder_re,
                    folder_exclusion_re, object_re, object_exclusion_re,
                    entity):
        # If inclusion filter is not specified, nothing to process
        if root_re is None:
            return False
        # Validate parameters
        if ('containerType' in entity and entity['containerType'] == 'SPACE') or \
           ('entityType' in entity and entity['entityType'] == 'space') or \
           ('containerType' in entity and entity['containerType'] == 'SOURCE') or \
           ('entityType' in entity and entity['entityType'] == 'source') :
            pass
        elif ('entityType' in entity and entity['entityType'] == 'folder') or \
          ('containerType' in entity and entity['containerType'] == 'FOLDER'):
            pass  # folder_re is not validated here: the call may only check that the folder is in an unfiltered space
        elif ('entityType' in entity and entity['entityType'] == 'dataset') or \
          ('type' in entity and entity['type'] == 'DATASET'):
            pass  # folder_re/object_re are not validated here for the same reason
        else:
            self._logger.fatal("_match_path: Unexpected Entity Type " +
                               str(entity))
        if 'path' not in entity:
            # Root-level entity identified by name only: apply the root filters
            return root_re.match(entity['name']) is not None and \
                (root_exclusion_re is None
                 or root_exclusion_re.match(entity['name']) is None)
        else:
            path = entity['path']
            # Match root object (Space or Source)
            if root_re.match(path[0]) is None:
                return False
            if root_exclusion_re is not None and root_exclusion_re.match(
                    path[0]) is not None:
                return False
            # Match object
            if object_re is not None and object_re.match(
                    self._utils.normalize_path(path[-1])) is None:
                return False
            if object_exclusion_re is not None and object_exclusion_re.match(
                    self._utils.normalize_path(path[1:])) is not None:
                return False
            # Match folders. Note: child folders do not need to match if a parent already matches.
            if folder_re is not None:
                folder_matched = False
                for i in range(len(path)):
                    if folder_re.match(
                            self._utils.normalize_path(path[1:len(path) -
                                                            i])) is not None:
                        folder_matched = True
                        break
                if not folder_matched:
                    return False
            if folder_exclusion_re is not None:
                folder_exclusion_matched = False
                for i in range(len(path)):
                    if folder_exclusion_re.match(
                            self._utils.normalize_path(
                                path[1:len(path) - i])) is not None:
                        folder_exclusion_matched = True
                        break
                if folder_exclusion_matched:
                    return False
        return True