def delete_all_datasets(account_id: Optional[str] = None, boto3_session: Optional[boto3.Session] = None) -> None:
    """Delete all datasets.

    Parameters
    ----------
    account_id : str, optional
        If None, the account ID will be inferred from your boto3 session.
    boto3_session : boto3.Session(), optional
        Boto3 Session. The default boto3 session will be used if boto3_session receives None.

    Returns
    -------
    None
        None.

    Examples
    --------
    >>> import awswrangler as wr
    >>> wr.quicksight.delete_all_datasets()

    """
    session: boto3.Session = _utils.ensure_session(session=boto3_session)
    if account_id is None:
        account_id = sts.get_account_id(boto3_session=session)
    for dataset in list_datasets(account_id=account_id, boto3_session=session):
        delete_dataset(dataset_id=dataset["DataSetId"], account_id=account_id, boto3_session=session)
def delete_all_templates(account_id: Optional[str] = None, boto3_session: Optional[boto3.Session] = None) -> None:
    """Delete all templates.

    Parameters
    ----------
    account_id : str, optional
        If None, the account ID will be inferred from your boto3 session.
    boto3_session : boto3.Session(), optional
        Boto3 Session. The default boto3 session will be used if boto3_session receives None.

    Returns
    -------
    None
        None.

    Examples
    --------
    >>> import awswrangler as wr
    >>> wr.quicksight.delete_all_templates()

    """
    session: boto3.Session = _utils.ensure_session(session=boto3_session)
    if account_id is None:
        account_id = sts.get_account_id(boto3_session=session)
    for template in list_templates(account_id=account_id, boto3_session=session):  # pragma: no cover
        delete_template(template_id=template["TemplateId"], account_id=account_id, boto3_session=session)
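# Usage sketch (illustrative, not part of the library): a full QuickSight
# teardown chaining the two delete_all_* helpers above. The explicit
# boto3.Session, the "sandbox" profile name, and the region are assumptions
# made for the example.
def _example_quicksight_teardown() -> None:
    import boto3
    import awswrangler as wr

    session = boto3.Session(profile_name="sandbox", region_name="us-east-1")
    wr.quicksight.delete_all_datasets(boto3_session=session)
    wr.quicksight.delete_all_templates(boto3_session=session)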
def _delete(
    func_name: str, account_id: Optional[str] = None, boto3_session: Optional[boto3.Session] = None, **kwargs: Any
) -> None:
    # Generic delete dispatcher: resolve the account ID, then forward kwargs to
    # the named QuickSight delete_* API call.
    session: boto3.Session = _utils.ensure_session(session=boto3_session)
    if account_id is None:
        account_id = sts.get_account_id(boto3_session=session)
    client: boto3.client = _utils.client(service_name="quicksight", session=session)
    func: Callable[..., None] = getattr(client, func_name)
    func(AwsAccountId=account_id, **kwargs)
def create_ingestion(
    dataset_name: Optional[str] = None,
    dataset_id: Optional[str] = None,
    ingestion_id: Optional[str] = None,
    account_id: Optional[str] = None,
    boto3_session: Optional[boto3.Session] = None,
) -> str:
    """Create and start a new SPICE ingestion on a dataset.

    Note
    ----
    You must pass ``dataset_name`` OR ``dataset_id`` argument.

    Parameters
    ----------
    dataset_name : str, optional
        Dataset name.
    dataset_id : str, optional
        Dataset ID.
    ingestion_id : str, optional
        Ingestion ID.
    account_id : str, optional
        If None, the account ID will be inferred from your boto3 session.
    boto3_session : boto3.Session(), optional
        Boto3 Session. The default boto3 session will be used if boto3_session receives None.

    Returns
    -------
    str
        Ingestion ID

    Examples
    --------
    >>> import awswrangler as wr
    >>> ingestion_id = wr.quicksight.create_ingestion("my_dataset")

    """
    session: boto3.Session = _utils.ensure_session(session=boto3_session)
    if account_id is None:
        account_id = sts.get_account_id(boto3_session=session)
    if (dataset_name is None) and (dataset_id is None):
        raise exceptions.InvalidArgument(
            "You must pass a not None dataset_name or dataset_id argument."
        )  # pragma: no cover
    if (dataset_id is None) and (dataset_name is not None):
        dataset_id = get_dataset_id(name=dataset_name, account_id=account_id, boto3_session=session)
    if ingestion_id is None:
        ingestion_id = uuid.uuid4().hex
    client: boto3.client = _utils.client(service_name="quicksight", session=session)
    response: Dict[str, Any] = client.create_ingestion(
        DataSetId=dataset_id, IngestionId=ingestion_id, AwsAccountId=account_id
    )
    return response["IngestionId"]
def describe_ingestion(
    ingestion_id: str,
    dataset_name: Optional[str] = None,
    dataset_id: Optional[str] = None,
    account_id: Optional[str] = None,
    boto3_session: Optional[boto3.Session] = None,
) -> Dict[str, Any]:
    """Describe a QuickSight ingestion by ID.

    Note
    ----
    You must pass a not None value for ``dataset_name`` or ``dataset_id`` argument.

    Parameters
    ----------
    ingestion_id : str
        Ingestion ID.
    dataset_name : str, optional
        Dataset name.
    dataset_id : str, optional
        Dataset ID.
    account_id : str, optional
        If None, the account ID will be inferred from your boto3 session.
    boto3_session : boto3.Session(), optional
        Boto3 Session. The default boto3 session will be used if boto3_session receives None.

    Returns
    -------
    Dict[str, Any]
        Ingestion Description.

    Examples
    --------
    >>> import awswrangler as wr
    >>> description = wr.quicksight.describe_ingestion(ingestion_id="...", dataset_name="...")

    """
    if (dataset_name is None) and (dataset_id is None):
        raise exceptions.InvalidArgument("You must pass a not None dataset_name or dataset_id argument.")
    session: boto3.Session = _utils.ensure_session(session=boto3_session)
    if account_id is None:
        account_id = sts.get_account_id(boto3_session=session)
    if (dataset_id is None) and (dataset_name is not None):
        dataset_id = get_dataset_id(name=dataset_name, account_id=account_id, boto3_session=session)
    client: boto3.client = _utils.client(service_name="quicksight", session=session)
    return cast(
        Dict[str, Any],
        client.describe_ingestion(IngestionId=ingestion_id, AwsAccountId=account_id, DataSetId=dataset_id)["Ingestion"],
    )
def cancel_ingestion(
    ingestion_id: str,
    dataset_name: Optional[str] = None,
    dataset_id: Optional[str] = None,
    account_id: Optional[str] = None,
    boto3_session: Optional[boto3.Session] = None,
) -> None:
    """Cancel an ongoing ingestion of data into SPICE.

    Note
    ----
    You must pass a not None value for ``dataset_name`` or ``dataset_id`` argument.

    Parameters
    ----------
    ingestion_id : str
        Ingestion ID.
    dataset_name : str, optional
        Dataset name.
    dataset_id : str, optional
        Dataset ID.
    account_id : str, optional
        If None, the account ID will be inferred from your boto3 session.
    boto3_session : boto3.Session(), optional
        Boto3 Session. The default boto3 session will be used if boto3_session receives None.

    Returns
    -------
    None
        None.

    Examples
    --------
    >>> import awswrangler as wr
    >>> wr.quicksight.cancel_ingestion(ingestion_id="...", dataset_name="...")

    """
    if (dataset_name is None) and (dataset_id is None):
        raise exceptions.InvalidArgument("You must pass a not None dataset_name or dataset_id argument.")
    session: boto3.Session = _utils.ensure_session(session=boto3_session)
    if account_id is None:
        account_id = sts.get_account_id(boto3_session=session)
    if (dataset_id is None) and (dataset_name is not None):
        dataset_id = get_dataset_id(name=dataset_name, account_id=account_id, boto3_session=session)
    client: boto3.client = _utils.client(service_name="quicksight", session=session)
    client.cancel_ingestion(IngestionId=ingestion_id, AwsAccountId=account_id, DataSetId=dataset_id)
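# Usage sketch (illustrative): drive the three ingestion helpers above as one
# lifecycle - start an ingestion, poll until it reaches a terminal state, and
# cancel it if it exceeds a time budget. The dataset name and the two-minute
# budget are assumptions for the example; "IngestionStatus" is the status field
# of the QuickSight DescribeIngestion response.
def _example_ingestion_lifecycle(dataset_name: str = "my_dataset") -> str:
    import time

    ingestion_id = create_ingestion(dataset_name=dataset_name)
    deadline = time.time() + 120  # assumed budget: two minutes
    while time.time() < deadline:
        status: str = describe_ingestion(ingestion_id=ingestion_id, dataset_name=dataset_name)["IngestionStatus"]
        if status in ("COMPLETED", "FAILED", "CANCELLED"):
            return status
        time.sleep(5)
    cancel_ingestion(ingestion_id=ingestion_id, dataset_name=dataset_name)
    return "CANCELLED"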
def describe_data_source_permissions(
    name: Optional[str] = None,
    data_source_id: Optional[str] = None,
    account_id: Optional[str] = None,
    boto3_session: Optional[boto3.Session] = None,
) -> Dict[str, Any]:
    """Describe a QuickSight data source permissions by name or ID.

    Note
    ----
    You must pass a not None ``name`` or ``data_source_id`` argument.

    Parameters
    ----------
    name : str, optional
        Data source name.
    data_source_id : str, optional
        Data source ID.
    account_id : str, optional
        If None, the account ID will be inferred from your boto3 session.
    boto3_session : boto3.Session(), optional
        Boto3 Session. The default boto3 session will be used if boto3_session receives None.

    Returns
    -------
    Dict[str, Any]
        Data source Permissions Description.

    Examples
    --------
    >>> import awswrangler as wr
    >>> description = wr.quicksight.describe_data_source_permissions("my-data-source")

    """
    if (name is None) and (data_source_id is None):
        raise exceptions.InvalidArgument("You must pass a not None name or data_source_id argument.")
    session: boto3.Session = _utils.ensure_session(session=boto3_session)
    if account_id is None:
        account_id = sts.get_account_id(boto3_session=session)
    if (data_source_id is None) and (name is not None):
        data_source_id = get_data_source_id(name=name, account_id=account_id, boto3_session=session)
    client: boto3.client = _utils.client(service_name="quicksight", session=session)
    return cast(
        Dict[str, Any],
        client.describe_data_source_permissions(AwsAccountId=account_id, DataSourceId=data_source_id)["Permissions"],
    )
def _get_default_logging_path(
    subnet_id: Optional[str] = None,
    account_id: Optional[str] = None,
    region: Optional[str] = None,
    boto3_session: Optional[boto3.Session] = None,
) -> str:
    """Get the EMR default logging path.

    E.g. "s3://aws-logs-{account_id}-{region}/elasticmapreduce/"

    Parameters
    ----------
    subnet_id : str, optional
        Subnet ID. If not provided, you must pass `account_id` and `region` explicitly.
    account_id : str, optional
        Account ID.
    region : str, optional
        Region e.g. 'us-east-1'
    boto3_session : boto3.Session(), optional
        Boto3 Session. The default boto3 session will be used if boto3_session receives None.

    Returns
    -------
    str
        Default logging path. E.g. "s3://aws-logs-{account_id}-{region}/elasticmapreduce/"

    Examples
    --------
    >>> import awswrangler as wr
    >>> path = wr.emr._get_default_logging_path("subnet-id")
    's3://aws-logs-{account_id}-{region}/elasticmapreduce/'

    """
    if account_id is None:
        boto3_session = _utils.ensure_session(session=boto3_session)
        _account_id: str = sts.get_account_id(boto3_session=boto3_session)
    else:
        _account_id = account_id
    if (region is None) and (subnet_id is not None):
        _region: str = _utils.get_region_from_session(boto3_session=boto3_session)
    elif (region is None) and (subnet_id is None):
        raise exceptions.InvalidArgumentCombination("You must pass region or subnet_id or both.")
    else:
        _region = region  # type: ignore
    return f"s3://aws-logs-{_account_id}-{_region}/elasticmapreduce/"
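# Worked example (the account ID below is a placeholder): when account_id and
# region are both passed explicitly, no AWS call is made; the helper only
# formats EMR's log-bucket naming convention.
def _example_default_logging_path() -> None:
    assert (
        _get_default_logging_path(account_id="111111111111", region="us-east-1")
        == "s3://aws-logs-111111111111-us-east-1/elasticmapreduce/"
    )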
def list_ingestions(
    dataset_name: Optional[str] = None,
    dataset_id: Optional[str] = None,
    account_id: Optional[str] = None,
    boto3_session: Optional[boto3.Session] = None,
) -> List[Dict[str, Any]]:
    """List the history of SPICE ingestions for a dataset.

    Parameters
    ----------
    dataset_name : str, optional
        Dataset name.
    dataset_id : str, optional
        The ID of the dataset used in the ingestion.
    account_id : str, optional
        If None, the account ID will be inferred from your boto3 session.
    boto3_session : boto3.Session(), optional
        Boto3 Session. The default boto3 session will be used if boto3_session receives None.

    Returns
    -------
    List[Dict[str, Any]]
        Ingestion summaries.

    Examples
    --------
    >>> import awswrangler as wr
    >>> ingestions = wr.quicksight.list_ingestions(dataset_name="my_dataset")

    """
    if (dataset_name is None) and (dataset_id is None):
        raise exceptions.InvalidArgument(
            "You must pass a not None dataset_name or dataset_id argument."
        )  # pragma: no cover
    session: boto3.Session = _utils.ensure_session(session=boto3_session)
    if account_id is None:
        account_id = sts.get_account_id(boto3_session=session)
    if (dataset_id is None) and (dataset_name is not None):
        dataset_id = get_dataset_id(name=dataset_name, account_id=account_id, boto3_session=session)
    return _list(
        func_name="list_ingestions",
        attr_name="Ingestions",
        account_id=account_id,
        boto3_session=boto3_session,
        DataSetId=dataset_id,
    )
def create_athena_bucket(boto3_session: Optional[boto3.Session] = None) -> str:
    """Create the default Athena bucket if it doesn't exist.

    Parameters
    ----------
    boto3_session : boto3.Session(), optional
        Boto3 Session. The default boto3 session will be used if boto3_session receives None.

    Returns
    -------
    str
        Bucket s3 path (E.g. s3://aws-athena-query-results-ACCOUNT-REGION/)

    Examples
    --------
    >>> import awswrangler as wr
    >>> wr.athena.create_athena_bucket()
    's3://aws-athena-query-results-ACCOUNT-REGION/'

    """
    session: boto3.Session = _utils.ensure_session(session=boto3_session)
    account_id: str = sts.get_account_id(boto3_session=session)
    region_name: str = str(session.region_name).lower()
    bucket_name = f"aws-athena-query-results-{account_id}-{region_name}"
    path = f"s3://{bucket_name}/"
    resource = _utils.resource(service_name="s3", session=session)
    bucket = resource.Bucket(bucket_name)
    # us-east-1 is the only region that rejects an explicit LocationConstraint.
    args = {} if region_name == "us-east-1" else {"CreateBucketConfiguration": {"LocationConstraint": region_name}}
    try:
        bucket.create(**args)
    except resource.meta.client.exceptions.BucketAlreadyOwnedByYou as err:
        _logger.debug("Bucket %s already exists.", err.response["Error"]["BucketName"])
    except botocore.exceptions.ClientError as err:
        if err.response["Error"]["Code"] == "OperationAborted":
            _logger.debug("A conflicting conditional operation is currently in progress against this resource.")
    bucket.wait_until_exists()
    return path
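# Design note and usage sketch (illustrative): create_athena_bucket is
# idempotent - BucketAlreadyOwnedByYou is treated as success, and
# wait_until_exists() bridges S3's create propagation delay - so it is safe to
# call unconditionally before running queries.
def _example_ensure_athena_bucket() -> str:
    import awswrangler as wr

    return wr.athena.create_athena_bucket()  # e.g. 's3://aws-athena-query-results-ACCOUNT-REGION/'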
def _list(
    func_name: str,
    attr_name: str,
    account_id: Optional[str] = None,
    boto3_session: Optional[boto3.Session] = None,
    **kwargs: Any,
) -> List[Dict[str, Any]]:
    # Generic pagination helper: call the named QuickSight list_* API and follow
    # NextToken until every page of `attr_name` results has been collected.
    session: boto3.Session = _utils.ensure_session(session=boto3_session)
    if account_id is None:
        account_id = sts.get_account_id(boto3_session=session)
    client: boto3.client = _utils.client(service_name="quicksight", session=session)
    func: Callable[..., Dict[str, Any]] = getattr(client, func_name)
    response: Dict[str, Any] = func(AwsAccountId=account_id, **kwargs)
    next_token: str = response.get("NextToken", None)
    result: List[Dict[str, Any]] = response[attr_name]
    while next_token is not None:
        response = func(AwsAccountId=account_id, NextToken=next_token, **kwargs)
        next_token = response.get("NextToken", None)
        result += response[attr_name]
    return result
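# Sketch (assumption: this mirrors how the public list_* wrappers are built on
# _list): each wrapper only names the boto3 call and the response key that
# holds a page of results; _list handles the NextToken pagination.
# "DashboardSummaryList" is the result key of QuickSight's ListDashboards API.
def list_dashboards(
    account_id: Optional[str] = None, boto3_session: Optional[boto3.Session] = None
) -> List[Dict[str, Any]]:
    """List all QuickSight dashboard summaries."""
    return _list(
        func_name="list_dashboards",
        attr_name="DashboardSummaryList",
        account_id=account_id,
        boto3_session=boto3_session,
    )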
def create_athena_dataset(
    name: str,
    database: Optional[str] = None,
    table: Optional[str] = None,
    sql: Optional[str] = None,
    sql_name: str = "CustomSQL",
    data_source_name: Optional[str] = None,
    data_source_arn: Optional[str] = None,
    import_mode: str = "DIRECT_QUERY",
    allowed_to_use: Optional[List[str]] = None,
    allowed_to_manage: Optional[List[str]] = None,
    logical_table_alias: str = "LogicalTable",
    rename_columns: Optional[Dict[str, str]] = None,
    cast_columns_types: Optional[Dict[str, str]] = None,
    tags: Optional[Dict[str, str]] = None,
    account_id: Optional[str] = None,
    boto3_session: Optional[boto3.Session] = None,
) -> str:
    """Create a QuickSight dataset.

    Note
    ----
    You will not be able to see the dataset in the console
    if you do not pass your user to one of the ``allowed_*`` arguments.

    Note
    ----
    You must pass ``database``/``table`` OR ``sql`` argument.

    Note
    ----
    You must pass ``data_source_name`` OR ``data_source_arn`` argument.

    Parameters
    ----------
    name : str
        Dataset name.
    database : str, optional
        Athena's database name.
    table : str, optional
        Athena's table name.
    sql : str, optional
        Use a SQL query to define your table.
    sql_name : str
        Query name.
    data_source_name : str, optional
        QuickSight data source name.
    data_source_arn : str, optional
        QuickSight data source ARN.
    import_mode : str
        Indicates whether you want to import the data into SPICE.
        'SPICE'|'DIRECT_QUERY'
    tags : Dict[str, str], optional
        Key/Value collection to put on the dataset.
        e.g. {"foo": "boo", "bar": "xoo"}
    allowed_to_use : optional
        List of principals that will be allowed to see and use the data source.
        e.g. ["john", "Mary"]
    allowed_to_manage : optional
        List of principals that will be allowed to see, use, update and delete the data source.
        e.g. ["Mary"]
    logical_table_alias : str
        A display name for the logical table.
    rename_columns : Dict[str, str], optional
        Dictionary to map column renames. e.g. {"old_name": "new_name", "old_name2": "new_name2"}
    cast_columns_types : Dict[str, str], optional
        Dictionary to map column casts. e.g. {"col_name": "STRING", "col_name2": "DECIMAL"}
        Valid types: 'STRING'|'INTEGER'|'DECIMAL'|'DATETIME'
    account_id : str, optional
        If None, the account ID will be inferred from your boto3 session.
    boto3_session : boto3.Session(), optional
        Boto3 Session. The default boto3 session will be used if boto3_session receives None.

    Returns
    -------
    str
        Dataset ID.

    Examples
    --------
    >>> import awswrangler as wr
    >>> dataset_id = wr.quicksight.create_athena_dataset(
    ...     name="...",
    ...     database="...",
    ...     table="...",
    ...     data_source_name="...",
    ...     allowed_to_manage=["Mary"],
    ... )

    """
    if (data_source_name is None) and (data_source_arn is None):
        raise exceptions.InvalidArgument("You must pass a not None data_source_name or data_source_arn argument.")
    if ((database is None) and (table is None)) and (sql is None):
        raise exceptions.InvalidArgument("You must pass database/table OR sql argument.")
    if (database is not None) and (sql is not None):
        raise exceptions.InvalidArgument(
            "If you provide a sql argument, please include the database name inside the sql statement. "
            "Do NOT pass it in via the database argument."
        )
    session: boto3.Session = _utils.ensure_session(session=boto3_session)
    client: boto3.client = _utils.client(service_name="quicksight", session=session)
    if account_id is None:
        account_id = sts.get_account_id(boto3_session=session)
    if (data_source_arn is None) and (data_source_name is not None):
        data_source_arn = get_data_source_arn(name=data_source_name, account_id=account_id, boto3_session=session)
    if sql is not None:
        physical_table: Dict[str, Dict[str, Any]] = {
            "CustomSql": {
                "DataSourceArn": data_source_arn,
                "Name": sql_name,
                "SqlQuery": sql,
                "Columns": extract_athena_query_columns(
                    sql=sql,
                    data_source_arn=data_source_arn,  # type: ignore
                    account_id=account_id,
                    boto3_session=session,
                ),
            }
        }
    else:
        physical_table = {
            "RelationalTable": {
                "DataSourceArn": data_source_arn,
                "Schema": database,
                "Name": table,
                "InputColumns": extract_athena_table_columns(
                    database=database,  # type: ignore
                    table=table,  # type: ignore
                    boto3_session=session,
                ),
            }
        }
    table_uuid: str = uuid.uuid4().hex
    dataset_id: str = uuid.uuid4().hex
    args: Dict[str, Any] = {
        "AwsAccountId": account_id,
        "DataSetId": dataset_id,
        "Name": name,
        "ImportMode": import_mode,
        "PhysicalTableMap": {table_uuid: physical_table},
        "LogicalTableMap": {table_uuid: {"Alias": logical_table_alias, "Source": {"PhysicalTableId": table_uuid}}},
    }
    trans: List[Dict[str, Dict[str, Any]]] = _generate_transformations(
        rename_columns=rename_columns, cast_columns_types=cast_columns_types
    )
    if trans:
        args["LogicalTableMap"][table_uuid]["DataTransforms"] = trans
    permissions: List[Dict[str, Union[str, List[str]]]] = _generate_permissions(
        resource="dataset",
        account_id=account_id,
        boto3_session=session,
        allowed_to_use=allowed_to_use,
        allowed_to_manage=allowed_to_manage,
    )
    if permissions:
        args["Permissions"] = permissions
    if tags is not None:
        _tags: List[Dict[str, str]] = [{"Key": k, "Value": v} for k, v in tags.items()]
        args["Tags"] = _tags
    client.create_data_set(**args)
    return dataset_id
def create_athena_data_source(
    name: str,
    workgroup: str = "primary",
    allowed_to_use: Optional[List[str]] = None,
    allowed_to_manage: Optional[List[str]] = None,
    tags: Optional[Dict[str, str]] = None,
    account_id: Optional[str] = None,
    boto3_session: Optional[boto3.Session] = None,
) -> None:
    """Create a QuickSight data source pointing to an Athena/Workgroup.

    Note
    ----
    You will not be able to see the data source in the console
    if you do not pass your user to one of the ``allowed_*`` arguments.

    Parameters
    ----------
    name : str
        Data source name.
    workgroup : str
        Athena workgroup.
    tags : Dict[str, str], optional
        Key/Value collection to put on the data source.
        e.g. {"foo": "boo", "bar": "xoo"}
    allowed_to_use : optional
        List of principals that will be allowed to see and use the data source.
        e.g. ["John"]
    allowed_to_manage : optional
        List of principals that will be allowed to see, use, update and delete the data source.
        e.g. ["Mary"]
    account_id : str, optional
        If None, the account ID will be inferred from your boto3 session.
    boto3_session : boto3.Session(), optional
        Boto3 Session. The default boto3 session will be used if boto3_session receives None.

    Returns
    -------
    None
        None.

    Examples
    --------
    >>> import awswrangler as wr
    >>> wr.quicksight.create_athena_data_source(
    ...     name="...",
    ...     allowed_to_manage=["john"]
    ... )

    """
    session: boto3.Session = _utils.ensure_session(session=boto3_session)
    client: boto3.client = _utils.client(service_name="quicksight", session=session)
    if account_id is None:
        account_id = sts.get_account_id(boto3_session=session)
    args: Dict[str, Any] = {
        "AwsAccountId": account_id,
        "DataSourceId": name,
        "Name": name,
        "Type": "ATHENA",
        "DataSourceParameters": {"AthenaParameters": {"WorkGroup": workgroup}},
        "SslProperties": {"DisableSsl": True},
    }
    permissions: List[Dict[str, Union[str, List[str]]]] = _generate_permissions(
        resource="data_source",
        account_id=account_id,
        boto3_session=session,
        allowed_to_use=allowed_to_use,
        allowed_to_manage=allowed_to_manage,
    )
    if permissions:
        args["Permissions"] = permissions
    if tags is not None:
        _tags: List[Dict[str, str]] = [{"Key": k, "Value": v} for k, v in tags.items()]
        args["Tags"] = _tags
    client.create_data_source(**args)
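# End-to-end sketch (illustrative; the resource names and the QuickSight user
# "Mary" are assumptions): create an Athena data source, expose a Glue table
# through it as a SPICE dataset, then trigger an ingestion.
def _example_athena_to_spice() -> str:
    import awswrangler as wr

    wr.quicksight.create_athena_data_source(name="athena-primary", allowed_to_manage=["Mary"])
    dataset_id = wr.quicksight.create_athena_dataset(
        name="sales",
        database="analytics",
        table="sales",
        data_source_name="athena-primary",
        import_mode="SPICE",
        allowed_to_manage=["Mary"],
    )
    return wr.quicksight.create_ingestion(dataset_id=dataset_id)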
def _build_cluster_args(**pars):  # pylint: disable=too-many-branches,too-many-statements
    account_id: str = sts.get_account_id(boto3_session=pars["boto3_session"])
    region: str = _utils.get_region_from_session(boto3_session=pars["boto3_session"])

    # S3 Logging path
    if pars.get("logging_s3_path") is None:
        pars["logging_s3_path"] = _get_default_logging_path(
            subnet_id=None, account_id=account_id, region=region, boto3_session=pars["boto3_session"]
        )

    spark_env: Optional[Dict[str, str]] = None
    yarn_env: Optional[Dict[str, str]] = None
    livy_env: Optional[Dict[str, str]] = None

    if pars["spark_pyarrow"] is True:
        if pars["spark_defaults"] is None:
            pars["spark_defaults"] = {"spark.sql.execution.arrow.enabled": "true"}
        else:
            pars["spark_defaults"]["spark.sql.execution.arrow.enabled"] = "true"
        spark_env = {"ARROW_PRE_0_15_IPC_FORMAT": "1"}
        yarn_env = {"ARROW_PRE_0_15_IPC_FORMAT": "1"}
        livy_env = {"ARROW_PRE_0_15_IPC_FORMAT": "1"}

    if pars["python3"] is True:
        if spark_env is None:
            spark_env = {"PYSPARK_PYTHON": "/usr/bin/python3"}
        else:
            spark_env["PYSPARK_PYTHON"] = "/usr/bin/python3"

    if pars["spark_jars_path"] is not None:
        paths: str = ",".join(pars["spark_jars_path"])
        if pars["spark_defaults"] is None:
            pars["spark_defaults"] = {"spark.jars": paths}
        else:
            pars["spark_defaults"]["spark.jars"] = paths

    args: Dict[str, Any] = {
        "Name": pars["cluster_name"],
        "LogUri": pars["logging_s3_path"],
        "ReleaseLabel": pars["emr_release"],
        "VisibleToAllUsers": pars["visible_to_all_users"],
        "JobFlowRole": pars["emr_ec2_role"],
        "ServiceRole": pars["emr_role"],
        "Instances": {
            "KeepJobFlowAliveWhenNoSteps": pars["keep_cluster_alive_when_no_steps"],
            "TerminationProtected": pars["termination_protected"],
            "Ec2SubnetId": pars["subnet_id"],
            "InstanceFleets": [],
        },
    }

    # EC2 Key Pair
    if pars["key_pair_name"] is not None:
        args["Instances"]["Ec2KeyName"] = pars["key_pair_name"]

    # Security groups
    if pars["security_group_master"] is not None:
        args["Instances"]["EmrManagedMasterSecurityGroup"] = pars["security_group_master"]
    if pars["security_groups_master_additional"] is not None:
        args["Instances"]["AdditionalMasterSecurityGroups"] = pars["security_groups_master_additional"]
    if pars["security_group_slave"] is not None:
        args["Instances"]["EmrManagedSlaveSecurityGroup"] = pars["security_group_slave"]
    if pars["security_groups_slave_additional"] is not None:
        args["Instances"]["AdditionalSlaveSecurityGroups"] = pars["security_groups_slave_additional"]
    if pars["security_group_service_access"] is not None:
        args["Instances"]["ServiceAccessSecurityGroup"] = pars["security_group_service_access"]

    # Configurations
    args["Configurations"] = [
        {
            "Classification": "spark-log4j",
            "Properties": {"log4j.rootCategory": f"{pars['spark_log_level']}, console"},
        }
    ]
    if pars["docker"] is True:
        if pars.get("extra_registries") is None:
            extra_registries: List[str] = []
        else:
            extra_registries = pars["extra_registries"]
        registries: str = f"local,centos,{account_id}.dkr.ecr.{region}.amazonaws.com,{','.join(extra_registries)}"
        registries = registries[:-1] if registries.endswith(",") else registries
        args["Configurations"].append(
            {
                "Classification": "container-executor",
                "Properties": {},
                "Configurations": [
                    {
                        "Classification": "docker",
                        "Properties": {
                            "docker.privileged-containers.registries": registries,
                            "docker.trusted.registries": registries,
                        },
                        "Configurations": [],
                    }
                ],
            }
        )
    if spark_env is not None:
        args["Configurations"].append(
            {
                "Classification": "spark-env",
                "Properties": {},
                "Configurations": [{"Classification": "export", "Properties": spark_env, "Configurations": []}],
            }
        )
"Configurations": [] }], }) if yarn_env is not None: args["Configurations"].append({ "Classification": "yarn-env", "Properties": {}, "Configurations": [{ "Classification": "export", "Properties": yarn_env, "Configurations": [] }], }) if livy_env is not None: args["Configurations"].append({ "Classification": "livy-env", "Properties": {}, "Configurations": [{ "Classification": "export", "Properties": livy_env, "Configurations": [] }], }) if pars["spark_glue_catalog"] is True: args["Configurations"].append({ "Classification": "spark-hive-site", "Properties": { "hive.metastore.client.factory.class": "com.amazonaws.glue.catalog.metastore.AWSGlueDataCatalogHiveClientFactory" # noqa }, "Configurations": [], }) if pars["hive_glue_catalog"] is True: hive_conf: Optional[Dict[str, Any]] = { "Classification": "hive-site", "Properties": {}, "Configurations": [] } hive_conf["Properties"][ "hive.metastore.client.factory.class"] = "com.amazonaws.glue.catalog.metastore.AWSGlueDataCatalogHiveClientFactory" args["Configurations"].append(hive_conf) if pars["presto_glue_catalog"] is True: args["Configurations"].append({ "Classification": "presto-connector-hive", "Properties": { "hive.metastore.glue.datacatalog.enabled": "true" }, "Configurations": [], }) if pars["consistent_view"] is True: args["Configurations"].append({ "Classification": "emrfs-site", "Properties": { "fs.s3.consistent.retryPeriodSeconds": str(pars.get("consistent_view_retry_seconds", "10")), "fs.s3.consistent": "true", "fs.s3.consistent.retryCount": str(pars.get("consistent_view_retry_count", "5")), "fs.s3.consistent.metadata.tableName": pars.get("consistent_view_table_name", "EmrFSMetadata"), }, }) if pars["maximize_resource_allocation"] is True: args["Configurations"].append({ "Classification": "spark", "Properties": { "maximizeResourceAllocation": "true" } }) if pars["spark_defaults"] is not None: spark_defaults: Dict[str, Union[str, Dict[str, str]]] = { "Classification": "spark-defaults", "Properties": pars["spark_defaults"], } args["Configurations"].append(spark_defaults) if pars.get("custom_classifications") is not None: for c in pars["custom_classifications"]: args["Configurations"].append(c) # Applications if pars["applications"]: args["Applications"] = [{"Name": x} for x in pars["applications"]] # Bootstraps if pars["bootstraps_paths"]: args["BootstrapActions"] = [{ "Name": x, "ScriptBootstrapAction": { "Path": x } } for x in pars["bootstraps_paths"]] # Debugging and Steps if (pars["debugging"] is True) or (pars["steps"] is not None): args["Steps"] = [] if pars["debugging"] is True: args["Steps"].append({ "Name": "Setup Hadoop Debugging", "ActionOnFailure": "TERMINATE_CLUSTER", "HadoopJarStep": { "Jar": "command-runner.jar", "Args": ["state-pusher-script"] }, }) if pars["steps"] is not None: args["Steps"] += pars["steps"] # Master Instance Fleet timeout_action_master: str = "SWITCH_TO_ON_DEMAND" if pars[ "spot_timeout_to_on_demand_master"] else "TERMINATE_CLUSTER" fleet_master: Dict = { "Name": "MASTER", "InstanceFleetType": "MASTER", "TargetOnDemandCapacity": pars["instance_num_on_demand_master"], "TargetSpotCapacity": pars["instance_num_spot_master"], "InstanceTypeConfigs": [{ "InstanceType": pars["instance_type_master"], "WeightedCapacity": 1, "BidPriceAsPercentageOfOnDemandPrice": pars["spot_bid_percentage_of_on_demand_master"], "EbsConfiguration": { "EbsBlockDeviceConfigs": [{ "VolumeSpecification": { "SizeInGB": pars["instance_ebs_size_master"], "VolumeType": "gp2" }, "VolumesPerInstance": 1, }], "EbsOptimized": True, }, }], } 
if pars["instance_num_spot_master"] > 0: fleet_master["LaunchSpecifications"] = { "SpotSpecification": { "TimeoutDurationMinutes": pars["spot_provisioning_timeout_master"], "TimeoutAction": timeout_action_master, } } args["Instances"]["InstanceFleets"].append(fleet_master) # Core Instance Fleet if (pars["instance_num_spot_core"] > 0) or pars["instance_num_on_demand_core"] > 0: timeout_action_core = "SWITCH_TO_ON_DEMAND" if pars[ "spot_timeout_to_on_demand_core"] else "TERMINATE_CLUSTER" fleet_core: Dict = { "Name": "CORE", "InstanceFleetType": "CORE", "TargetOnDemandCapacity": pars["instance_num_on_demand_core"], "TargetSpotCapacity": pars["instance_num_spot_core"], "InstanceTypeConfigs": [{ "InstanceType": pars["instance_type_core"], "WeightedCapacity": 1, "BidPriceAsPercentageOfOnDemandPrice": pars["spot_bid_percentage_of_on_demand_core"], "EbsConfiguration": { "EbsBlockDeviceConfigs": [{ "VolumeSpecification": { "SizeInGB": pars["instance_ebs_size_core"], "VolumeType": "gp2", }, "VolumesPerInstance": 1, }], "EbsOptimized": True, }, }], } if pars["instance_num_spot_core"] > 0: fleet_core["LaunchSpecifications"] = { "SpotSpecification": { "TimeoutDurationMinutes": pars["spot_provisioning_timeout_core"], "TimeoutAction": timeout_action_core, } } args["Instances"]["InstanceFleets"].append(fleet_core) # Task Instance Fleet if (pars["instance_num_spot_task"] > 0) or pars["instance_num_on_demand_task"] > 0: timeout_action_task: str = "SWITCH_TO_ON_DEMAND" if pars[ "spot_timeout_to_on_demand_task"] else "TERMINATE_CLUSTER" fleet_task: Dict = { "Name": "TASK", "InstanceFleetType": "TASK", "TargetOnDemandCapacity": pars["instance_num_on_demand_task"], "TargetSpotCapacity": pars["instance_num_spot_task"], "InstanceTypeConfigs": [{ "InstanceType": pars["instance_type_task"], "WeightedCapacity": 1, "BidPriceAsPercentageOfOnDemandPrice": pars["spot_bid_percentage_of_on_demand_task"], "EbsConfiguration": { "EbsBlockDeviceConfigs": [{ "VolumeSpecification": { "SizeInGB": pars["instance_ebs_size_task"], "VolumeType": "gp2", }, "VolumesPerInstance": 1, }], "EbsOptimized": True, }, }], } if pars["instance_num_spot_task"] > 0: fleet_task["LaunchSpecifications"] = { "SpotSpecification": { "TimeoutDurationMinutes": pars["spot_provisioning_timeout_task"], "TimeoutAction": timeout_action_task, } } args["Instances"]["InstanceFleets"].append(fleet_task) # Tags if pars["tags"] is not None: args["Tags"] = [{ "Key": k, "Value": v } for k, v in pars["tags"].items()] _logger.debug("args: \n%s", pprint.pformat(args)) return args