def check_mains_libs(data, **kwargs):
    mains = data.get("mains", [])
    libs = data.get("libs", [])
    job_type, subtype = edp.split_job_type(data.get("type"))
    streaming = job_type == "MapReduce" and subtype == "Streaming"

    # Pig or Hive flow has to contain script in mains, may also use libs
    if job_type in ['Pig', 'Hive']:
        if not mains:
            raise e.InvalidDataException("%s flow requires main script" %
                                         data.get("type"))
        # Check for overlap
        if set(mains).intersection(set(libs)):
            raise e.InvalidDataException("'mains' and 'libs' overlap")
    else:
        if not streaming and not libs:
            raise e.InvalidDataException("%s flow requires libs" %
                                         data.get("type"))
        if mains:
            raise e.InvalidDataException("%s flow does not use mains" %
                                         data.get("type"))

    # Make sure that all referenced binaries exist
    _check_binaries(mains)
    _check_binaries(libs)
def _check_swift_data_source_create(data):
    if len(data['url']) == 0:
        raise ex.InvalidDataException(_("Swift url must not be empty"))
    url = urlparse.urlparse(data['url'])
    if url.scheme != "swift":
        raise ex.InvalidDataException(_("URL scheme must be 'swift'"))

    # The swift url suffix does not have to be included in the netloc.
    # However, if the swift suffix indicator is part of the netloc then
    # we require the right suffix.
    # Additionally, the path must be more than '/'
    if (su.SWIFT_URL_SUFFIX_START in url.netloc and not url.netloc.endswith(
            su.SWIFT_URL_SUFFIX)) or len(url.path) <= 1:
        raise ex.InvalidDataException(
            _("URL must be of the form swift://container%s/object")
            % su.SWIFT_URL_SUFFIX)

    if not CONF.use_domain_for_proxy_users and "credentials" not in data:
        raise ex.InvalidCredentials(_("No credentials provided for Swift"))
    if not CONF.use_domain_for_proxy_users and (
            "user" not in data["credentials"]):
        raise ex.InvalidCredentials(
            _("User is not provided in credentials for Swift"))
    if not CONF.use_domain_for_proxy_users and (
            "password" not in data["credentials"]):
        raise ex.InvalidCredentials(
            _("Password is not provided in credentials for Swift"))
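Illustration only, not part of the original source: a minimal, self-contained sketch of the pieces the Swift check above inspects once urlparse has split a data source URL. The suffix constants below are assumptions standing in for su.SWIFT_URL_SUFFIX_START and su.SWIFT_URL_SUFFIX.

# Hypothetical stand-ins for su.SWIFT_URL_SUFFIX_START / su.SWIFT_URL_SUFFIX.
from urllib.parse import urlparse

SWIFT_URL_SUFFIX_START = '.'   # assumed value
SWIFT_URL_SUFFIX = '.sahara'   # assumed value

url = urlparse("swift://mycontainer.sahara/path/to/object")
assert url.scheme == "swift"
assert len(url.path) > 1                          # path is more than just '/'
assert (SWIFT_URL_SUFFIX_START not in url.netloc
        or url.netloc.endswith(SWIFT_URL_SUFFIX))  # suffix rule satisfied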
def check_shares(data):
    if not data:
        return

    paths = (share.get('path') for share in data)
    paths = [path for path in paths if path is not None]
    if len(paths) != len(set(paths)):
        raise ex.InvalidDataException(
            _('Multiple shares cannot be mounted to the same path.'))

    for path in paths:
        if not path.startswith('/') or '\x00' in path:
            raise ex.InvalidDataException(
                _('Paths must be absolute Linux paths starting with "/" '
                  'and may not contain nulls.'))

    client = manila.client()
    for share in data:
        manila_share = manila.get_share(client, share['id'])
        if not manila_share:
            raise ex.InvalidReferenceException(
                _("Requested share id %s does not exist.") % share['id'])

        share_type = manila_share.share_proto
        if share_type not in shares.SUPPORTED_SHARE_TYPES:
            raise ex.InvalidReferenceException(
                _("Requested share id %(id)s is of type %(type)s, which is "
                  "not supported by Sahara.")
                % {"id": share['id'], "type": share_type})
def check_mains_libs(data, **kwargs):
    mains = data.get("mains", [])
    libs = data.get("libs", [])
    job_type, subtype = edp.split_job_type(data.get("type"))
    streaming = (job_type == edp.JOB_TYPE_MAPREDUCE
                 and subtype == edp.JOB_SUBTYPE_STREAMING)

    # Pig or Hive flow has to contain script in mains, may also use libs
    if job_type in [edp.JOB_TYPE_PIG, edp.JOB_TYPE_HIVE]:
        if not mains:
            raise e.InvalidDataException("%s flow requires main script" %
                                         data.get("type"))
        # Check for overlap
        if set(mains).intersection(set(libs)):
            raise e.InvalidDataException("'mains' and 'libs' overlap")
    else:
        if not streaming and not libs:
            raise e.InvalidDataException("%s flow requires libs" %
                                         data.get("type"))
        if mains:
            raise e.InvalidDataException("%s flow does not use mains" %
                                         data.get("type"))

    # Make sure that all referenced binaries exist
    _check_binaries(mains)
    _check_binaries(libs)
def check_job_executor(data, job_id):
    job = api.get_job(job_id)
    job_type, subtype = edp.split_job_type(job.type)

    # Check if cluster contains Oozie service to run job
    main_base.check_edp_job_support(data['cluster_id'])

    # All types except Java require input and output objects
    if job_type == 'Java':
        if not _is_main_class_present(data):
            raise ex.InvalidDataException('Java job must '
                                          'specify edp.java.main_class')
    else:
        if not ('input_id' in data and 'output_id' in data):
            raise ex.InvalidDataException("%s job requires 'input_id' "
                                          "and 'output_id'" % job.type)

        b.check_data_source_exists(data['input_id'])
        b.check_data_source_exists(data['output_id'])

        b.check_data_sources_are_different(data['input_id'],
                                           data['output_id'])

        if job_type == 'MapReduce' and (
                subtype == 'Streaming' and not _streaming_present(data)):
            raise ex.InvalidDataException("%s job "
                                          "must specify streaming mapper "
                                          "and reducer" % job.type)

    main_base.check_cluster_exists(data['cluster_id'])
def _validate_url(self, url):
    if len(url) == 0:
        raise ex.InvalidDataException(_("MapR FS url must not be empty"))
    url = urlparse.urlparse(url)
    if url.scheme:
        if url.scheme != "maprfs":
            raise ex.InvalidDataException(_("URL scheme must be 'maprfs'"))
def _check_maprfs_data_source_create(data):
    if len(data['url']) == 0:
        raise ex.InvalidDataException(_("MapR FS url must not be empty"))
    url = urlparse.urlparse(data['url'])
    if url.scheme:
        if url.scheme != "maprfs":
            raise ex.InvalidDataException(_("URL scheme must be 'maprfs'"))
def check_mains_libs(data, **kwargs):
    mains = data.get("mains", [])
    libs = data.get("libs", [])
    job_type, subtype = edp.split_job_type(data.get("type"))
    streaming = (job_type == edp.JOB_TYPE_MAPREDUCE
                 and subtype == edp.JOB_SUBTYPE_STREAMING)

    # These types must have a value in mains and may also use libs
    if job_type in [edp.JOB_TYPE_PIG, edp.JOB_TYPE_HIVE,
                    edp.JOB_TYPE_SPARK]:
        if not mains:
            if job_type == edp.JOB_TYPE_SPARK:
                msg = _("%s job requires main application jar") % data.get(
                    "type")
            else:
                msg = _("%s flow requires main script") % data.get("type")
            raise e.InvalidDataException(msg)

        # Check for overlap
        if set(mains).intersection(set(libs)):
            raise e.InvalidDataException(_("'mains' and 'libs' overlap"))
    else:
        # Java and MapReduce require libs, but MapReduce.Streaming does not
        if not streaming and not libs:
            raise e.InvalidDataException(
                _("%s flow requires libs") % data.get("type"))
        if mains:
            raise e.InvalidDataException(
                _("%s flow does not use mains") % data.get("type"))

    # Make sure that all referenced binaries exist
    _check_binaries(mains)
    _check_binaries(libs)
def _validate_url(self, url):
    if len(url) == 0:
        raise ex.InvalidDataException(_("HDFS url must not be empty"))
    url = urlparse.urlparse(url)
    if url.scheme:
        if url.scheme != "hdfs":
            raise ex.InvalidDataException(_("URL scheme must be 'hdfs'"))
        if not url.hostname:
            raise ex.InvalidDataException(
                _("HDFS url is incorrect, cannot determine a hostname"))
def _validate_labels_update(self, default_data, update_values):
    for label in update_values.keys():
        if label not in default_data.keys():
            raise ex.InvalidDataException(
                _("Label '%s' can't be updated because it's not "
                  "available for plugin or its version") % label)
        if not default_data[label][MUTABLE]:
            raise ex.InvalidDataException(
                _("Label '%s' can't be updated because it's not "
                  "mutable") % label)
def get_service(self, node_process):
    ui_name = self.get_service_name_by_node_process(node_process)
    if ui_name is None:
        raise e.InvalidDataException(
            _('Service not found in services list'))
    version = self.get_chosen_service_version(ui_name)
    service = self._find_service_instance(ui_name, version)
    if service is None:
        raise e.InvalidDataException(_('Can not map service'))
    return service
def _check_manila_data_source_create(data):
    if len(data['url']) == 0:
        raise ex.InvalidDataException(_("Manila url must not be empty"))
    url = urlparse.urlparse(data['url'])
    if url.scheme != "manila":
        raise ex.InvalidDataException(_("Manila url scheme must be 'manila'"))
    if not uuidutils.is_uuid_like(url.netloc):
        raise ex.InvalidDataException(_("Manila url netloc must be a uuid"))
    if not url.path:
        raise ex.InvalidDataException(_("Manila url path must not be empty"))
def _check_hdfs_data_source_create(data):
    if len(data['url']) == 0:
        raise ex.InvalidDataException(_("HDFS url must not be empty"))
    url = urlparse.urlparse(data['url'])
    if url.scheme:
        if url.scheme != "hdfs":
            raise ex.InvalidDataException(_("URL scheme must be 'hdfs'"))
        if not url.hostname:
            raise ex.InvalidDataException(
                _("HDFS url is incorrect, cannot determine a hostname"))
def _validate_url(self, url):
    if len(url) == 0:
        raise ex.InvalidDataException(
            _("Internal data base url must not be empty"))
    url = urlparse.urlparse(url)
    if url.scheme != "internal-db":
        raise ex.InvalidDataException(
            _("URL scheme must be 'internal-db'"))
    if not uuidutils.is_uuid_like(url.netloc):
        raise ex.InvalidDataException(
            _("Internal data base url netloc must be a uuid"))
def _validate_url(self, url):
    if len(url) == 0:
        raise ex.InvalidDataException(_("Manila url must not be empty"))
    url = urlparse.urlparse(url)
    if url.scheme != "manila":
        raise ex.InvalidDataException(
            _("Manila url scheme must be 'manila'"))
    if not uuidutils.is_uuid_like(url.netloc):
        raise ex.InvalidDataException(
            _("Manila url netloc must be a uuid"))
    if not url.path:
        raise ex.InvalidDataException(
            _("Manila url path must not be empty"))
def _validate_url(self, url):
    if len(url) == 0:
        raise ex.InvalidDataException(_("S3 url must not be empty"))
    url = urlparse.urlparse(url)
    if url.scheme not in ["s3", "s3a"]:
        raise ex.InvalidDataException(
            _("URL scheme must be 's3' or 's3a'"))
    if not url.hostname:
        raise ex.InvalidDataException(_("Bucket name must be present"))
    if not url.path:
        raise ex.InvalidDataException(_("Object name must be present"))
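For illustration only (not from the original source): a short sketch of how the standard library urlparse decomposes an S3 data source URL, which is what the bucket and object checks above rely on.

from urllib.parse import urlparse

url = urlparse("s3a://my-bucket/path/to/object.csv")
assert url.scheme in ("s3", "s3a")
assert url.hostname == "my-bucket"         # hostname carries the bucket name
assert url.path == "/path/to/object.csv"   # non-empty path carries the object key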
def _check_datasource_placeholder(url):
    if url is None:
        return
    total_length = 0
    substrings = re.findall(r"%RANDSTR\(([\-]?\d+)\)%", url)
    for length in map(int, substrings):
        if length <= 0:
            raise ex.InvalidDataException(
                _("Requested RANDSTR length must be positive."))
        total_length += length

    if total_length > 1024:
        raise ex.InvalidDataException(
            _("Requested RANDSTR length is too long, please choose a "
              "value less than 1024."))
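Illustration only: how the %RANDSTR(n)% placeholders are extracted and summed by the check above, shown on a sample URL with the standard library re module; the exception and translation helpers are omitted.

import re

sample_url = "swift://container.sahara/logs/%RANDSTR(8)%/out-%RANDSTR(16)%"
lengths = [int(n) for n in re.findall(r"%RANDSTR\(([\-]?\d+)\)%", sample_url)]
print(lengths)                        # [8, 16]
print(all(n > 0 for n in lengths))    # True -> every requested length is positive
print(sum(lengths) <= 1024)           # True -> total requested length is acceptable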
def validate_job_execution(self, cluster, job, data):
    if not self.edp_supported(cluster.hadoop_version):
        raise ex.InvalidDataException(
            _('Spark {base} or higher required to run {type} jobs').format(
                base=EdpEngine.edp_base_version, type=job.type))

    super(EdpEngine, self).validate_job_execution(cluster, job, data)
def _create_config_obj(self, item, target='general', scope='cluster',
                       high_priority=False):
    def _prepare_value(value):
        if isinstance(value, str):
            return value.strip().lower()
        return value

    conf_name = _prepare_value(item.get('name', None))
    conf_value = _prepare_value(item.get('value', None))

    if not conf_name:
        raise ex.HadoopProvisionError(_("Config missing 'name'"))

    if conf_value is None:
        raise e.InvalidDataException(
            _("Config '%s' missing 'value'") % conf_name)

    if high_priority or item.get('priority', 2) == 1:
        priority = 1
    else:
        priority = 2

    return p.Config(
        name=conf_name,
        applicable_target=target,
        scope=scope,
        config_type=item.get('config_type', "string"),
        config_values=item.get('config_values', None),
        default_value=conf_value,
        is_optional=item.get('is_optional', True),
        description=item.get('description', None),
        priority=priority)
def _initialize(self, config):
    for configuration in self.config['configurations']:
        for service_property in configuration['properties']:
            config = p.Config(
                service_property['name'],
                self._get_target(service_property['applicable_target']),
                service_property['scope'],
                config_type=service_property['config_type'],
                default_value=service_property['default_value'],
                is_optional=service_property['is_optional'],
                description=service_property['description'])
            setattr(config, 'tag', configuration['tag'].rsplit(".", 1)[0])
            self.config_items.append(config)
            # TODO(jspeidel): an assumption is made that property names
            # are unique across configuration sections which is dangerous
            property_name = service_property['name']
            # if property already exists, throw an exception
            if property_name in self.config_mapper:
                # internal error
                # ambari-config-resource contains duplicates
                raise exceptions.InvalidDataException(
                    'Internal Error. Duplicate property '
                    'name detected: %s' % property_name)
            self.config_mapper[service_property['name']] = \
                self._get_target(service_property['applicable_target'])
def _build_cluster_schema(api_version='v1'):
    if api_version == 'v1':
        cluster_schema = copy.deepcopy(ct_schema.CLUSTER_TEMPLATE_SCHEMA)
    elif api_version == 'v2':
        cluster_schema = copy.deepcopy(ct_schema.CLUSTER_TEMPLATE_SCHEMA_V2)
    else:
        raise ex.InvalidDataException('Invalid API version %s' % api_version)

    cluster_schema['properties'].update({
        "is_transient": {
            "type": "boolean"
        },
        "user_keypair_id": {
            "type": "string",
            "format": "valid_keypair_name",
        },
        "cluster_template_id": {
            "type": "string",
            "format": "uuid",
        }
    })
    if api_version == 'v2':
        cluster_schema['properties'].update({"count": {"type": "integer"}})
    return cluster_schema
def handler(*args, **kwargs):
    # NOTE (vgridnev): We should know information about instance,
    # so we should find instance in args or kwargs.
    # Also, we import sahara.conductor.resource
    # to check some object is Instance
    instance = None
    for arg in args:
        if isinstance(arg, resource.InstanceResource):
            instance = arg
    for kw_arg in kwargs.values():
        if isinstance(kw_arg, resource.InstanceResource):
            instance = kw_arg

    if instance is None:
        raise exceptions.InvalidDataException(
            _("Function should have an Instance as argument"))

    try:
        value = func(*args, **kwargs)
    except Exception as e:
        with excutils.save_and_reraise_exception():
            add_fail_event(instance, e)

    if mark_successful_on_exit:
        add_successful_event(instance)

    return value
def get_data_source_by_url(self, url):
    url = urlparse.urlparse(url)
    if not url.scheme:
        raise ex.InvalidDataException(
            _("Data source url must have a scheme"))
    return self.get_data_source(url.scheme)
def request_data():
    if hasattr(flask.request, 'parsed_data'):
        return flask.request.parsed_data

    if (flask.request.content_length is None or
            not flask.request.content_length > 0):
        LOG.debug("Empty body provided in request")
        return dict()

    if flask.request.file_upload:
        return flask.request.data

    deserializer = None
    content_type = flask.request.mimetype
    if not content_type or content_type in RT_JSON:
        deserializer = wsgi.JSONDeserializer()
    else:
        raise ex.InvalidDataException(
            _("Content type '%s' isn't supported") % content_type)

    # parsed request data to avoid unwanted re-parsings
    parsed_data = deserializer.deserialize(flask.request.data)['body']
    flask.request.parsed_data = parsed_data

    return flask.request.parsed_data
def validate_job_execution(self, cluster, job, data):
    if cluster.hadoop_version < "1.0.0":
        raise ex.InvalidDataException(
            _('Spark 1.0.0 or higher required to run spark %s jobs')
            % job.type)

    super(EdpEngine, self).validate_job_execution(cluster, job, data)
def check_scheduled_job_execution_info(job_execution_info):
    start = job_execution_info.get('start', None)
    if start is None:
        raise ex.InvalidDataException(
            _("Scheduled job must specify start time"))
    try:
        start = time.strptime(start, "%Y-%m-%d %H:%M:%S")
        start = timeutils.datetime.datetime.fromtimestamp(time.mktime(start))
    except Exception:
        raise ex.InvalidDataException(_("Invalid Time Format"))

    now_time = timeutils.utcnow()

    if timeutils.delta_seconds(now_time, start) < 0:
        raise ex.InvalidJobExecutionInfoException(
            _("Job start time should be later than now"))
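Illustration only: the strptime/mktime round trip used above to turn the scheduled start string into a datetime, shown with the standard library alone (the oslo timeutils helpers are omitted).

import datetime
import time

start_str = "2024-01-31 10:30:00"
parsed = time.strptime(start_str, "%Y-%m-%d %H:%M:%S")   # struct_time
start = datetime.datetime.fromtimestamp(time.mktime(parsed))
print(start)  # 2024-01-31 10:30:00, interpreted in the local timezone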
def check_data_sources_are_different(data_source_1_id, data_source_2_id):
    ds1 = conductor.data_source_get(context.ctx(), data_source_1_id)
    ds2 = conductor.data_source_get(context.ctx(), data_source_2_id)

    if ds1.type == ds2.type and ds1.url == ds2.url:
        raise ex.InvalidDataException(_('Provided input and output '
                                        'DataSources reference the same '
                                        'location: %s') % ds1.url)
def check_execution_interface(data, job):
    job_int = {arg.name: arg for arg in job.interface}
    execution_int = data.get("interface", None)

    if not (job_int or execution_int):
        return
    if job_int and execution_int is None:
        raise e.InvalidDataException(
            _("An interface was specified with the template for this job. "
              "Please pass an interface map with this job (even if empty)."))

    execution_names = set(execution_int.keys())

    definition_names = set(job_int.keys())
    not_found_names = execution_names - definition_names
    if not_found_names:
        raise e.InvalidDataException(
            _("Argument names: %s were not found in the interface for this "
              "job.") % str(list(not_found_names)))

    required_names = {arg.name for arg in job.interface if arg.required}
    unset_names = required_names - execution_names
    if unset_names:
        raise e.InvalidDataException(
            _("Argument names: %s are required for "
              "this job.") % str(list(unset_names)))

    nonexistent = object()
    for name, value in six.iteritems(execution_int):
        arg = job_int[name]
        _validate_value(arg.value_type, value)
        if arg.mapping_type == "args":
            continue
        typed_configs = data.get("job_configs", {}).get(arg.mapping_type, {})
        config_value = typed_configs.get(arg.location, nonexistent)
        if config_value is not nonexistent and config_value != value:
            args = {"name": name,
                    "mapping_type": arg.mapping_type,
                    "location": arg.location}
            raise e.InvalidDataException(
                _("Argument '%(name)s' was passed both through the interface "
                  "and in location '%(mapping_type)s'.'%(location)s'. Please "
                  "pass this through either the interface or the "
                  "configuration maps, not both.") % args)
def check_data_sources_are_different(data_source_1_id, data_source_2_id):
    ds1 = api.get_data_source(data_source_1_id)
    ds2 = api.get_data_source(data_source_2_id)

    if ds1.type == ds2.type and ds1.url == ds2.url:
        raise ex.InvalidDataException('Provided input and output '
                                      'DataSources reference the same '
                                      'location: %s' % ds1.url)
def validate(self, data):
    self._validate_url(data['url'])

    # Do validation loosely, and don't require much... the user might have
    # (by their own preference) set some or all configs manually
    if "credentials" not in data:
        return

    for key in data["credentials"].keys():
        if key not in self.configs_map.keys():
            raise ex.InvalidDataException(
                _("Unknown config '%s' for S3 data source") % key)
        if key in self.bool_keys:
            if not isinstance(data["credentials"][key], bool):
                raise ex.InvalidDataException(
                    _("Config '%s' must be boolean") % key)