def get_cloudwatch_helper(node_ids):
    config = load_cloudwatch_example_config_file()
    config["cluster_name"] = DEFAULT_CLUSTER_NAME
    return CloudwatchHelper(
        config["provider"],
        node_ids,
        config["cluster_name"],
    )
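
# A minimal usage sketch for the helper above, assuming a test context;
# the instance id is hypothetical. update_from_config mirrors the call
# made by the node updater in do_update below.
helper = get_cloudwatch_helper(["i-0123456789abcdef0"])
helper.update_from_config(True)  # is_head_node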
def do_update(self):
    self.provider.set_node_tags(
        self.node_id, {TAG_RAY_NODE_STATUS: STATUS_WAITING_FOR_SSH})
    cli_logger.labeled_value("New status", STATUS_WAITING_FOR_SSH)

    deadline = time.time() + AUTOSCALER_NODE_START_WAIT_S
    self.wait_ready(deadline)
    global_event_system.execute_callback(
        CreateClusterEvent.ssh_control_acquired)

    node_tags = self.provider.node_tags(self.node_id)
    logger.debug("Node tags: {}".format(str(node_tags)))

    if self.provider_type == "aws" and self.provider.provider_config:
        from ray.autoscaler._private.aws.cloudwatch.cloudwatch_helper \
            import CloudwatchHelper
        CloudwatchHelper(self.provider.provider_config, [self.node_id],
                         self.provider.cluster_name). \
            update_from_config(self.is_head_node)

    if node_tags.get(TAG_RAY_RUNTIME_CONFIG) == self.runtime_hash:
        # When resuming from a stopped instance the runtime_hash may be the
        # same, but the container will not be started.
        init_required = self.cmd_runner.run_init(
            as_head=self.is_head_node,
            file_mounts=self.file_mounts,
            sync_run_yet=False)
        if init_required:
            node_tags[TAG_RAY_RUNTIME_CONFIG] += "-invalidate"
            # This ensures that `setup_commands` are not removed
            self.restart_only = False

    if self.restart_only:
        self.setup_commands = []

    # runtime_hash will only change whenever the user restarts
    # or updates their cluster with `get_or_create_head_node`
    if node_tags.get(TAG_RAY_RUNTIME_CONFIG) == self.runtime_hash and (
            not self.file_mounts_contents_hash
            or node_tags.get(TAG_RAY_FILE_MOUNTS_CONTENTS) ==
            self.file_mounts_contents_hash):
        # todo: we lie in the confirmation message since
        # full setup might be cancelled here
        cli_logger.print(
            "Configuration already up to date, "
            "skipping file mounts, initialization and setup commands.",
            _numbered=("[]", "2-6", NUM_SETUP_STEPS))
    else:
        cli_logger.print(
            "Updating cluster configuration.",
            _tags=dict(hash=self.runtime_hash))

        self.provider.set_node_tags(
            self.node_id, {TAG_RAY_NODE_STATUS: STATUS_SYNCING_FILES})
        cli_logger.labeled_value("New status", STATUS_SYNCING_FILES)
        self.sync_file_mounts(
            self.rsync_up, step_numbers=(1, NUM_SETUP_STEPS))

        # Only run setup commands if runtime_hash has changed because
        # we don't want to run setup_commands every time the head node
        # file_mounts folders have changed.
        if node_tags.get(TAG_RAY_RUNTIME_CONFIG) != self.runtime_hash:
            # Run init commands
            self.provider.set_node_tags(
                self.node_id, {TAG_RAY_NODE_STATUS: STATUS_SETTING_UP})
            cli_logger.labeled_value("New status", STATUS_SETTING_UP)

            if self.initialization_commands:
                with cli_logger.group(
                        "Running initialization commands",
                        _numbered=("[]", 4, NUM_SETUP_STEPS)):
                    global_event_system.execute_callback(
                        CreateClusterEvent.run_initialization_cmd)
                    with LogTimer(
                            self.log_prefix + "Initialization commands",
                            show_status=True):
                        for cmd in self.initialization_commands:
                            global_event_system.execute_callback(
                                CreateClusterEvent.run_initialization_cmd,
                                {"command": cmd})
                            try:
                                # Overriding the existing SSHOptions class
                                # with a new SSHOptions class that uses
                                # this ssh_private_key as its only __init__
                                # argument.
                                # Run outside docker.
                                self.cmd_runner.run(
                                    cmd,
                                    ssh_options_override_ssh_key=self.
                                    auth_config.get("ssh_private_key"),
                                    run_env="host")
                            except ProcessRunnerError as e:
                                if e.msg_type == "ssh_command_failed":
                                    cli_logger.error("Failed.")
                                    cli_logger.error(
                                        "See above for stderr.")
                                raise click.ClickException(
                                    "Initialization command failed."
                                ) from None
            else:
                cli_logger.print(
                    "No initialization commands to run.",
                    _numbered=("[]", 4, NUM_SETUP_STEPS))
            with cli_logger.group(
                    "Initializing command runner",
                    # todo: fix command numbering
                    _numbered=("[]", 5, NUM_SETUP_STEPS)):
                self.cmd_runner.run_init(
                    as_head=self.is_head_node,
                    file_mounts=self.file_mounts,
                    sync_run_yet=True)
            if self.setup_commands:
                with cli_logger.group(
                        "Running setup commands",
                        # todo: fix command numbering
                        _numbered=("[]", 6, NUM_SETUP_STEPS)):
                    global_event_system.execute_callback(
                        CreateClusterEvent.run_setup_cmd)
                    with LogTimer(
                            self.log_prefix + "Setup commands",
                            show_status=True):
                        total = len(self.setup_commands)
                        for i, cmd in enumerate(self.setup_commands):
                            global_event_system.execute_callback(
                                CreateClusterEvent.run_setup_cmd,
                                {"command": cmd})

                            if cli_logger.verbosity == 0 and len(cmd) > 30:
                                cmd_to_print = cf.bold(cmd[:30]) + "..."
                            else:
                                cmd_to_print = cf.bold(cmd)

                            cli_logger.print(
                                "{}",
                                cmd_to_print,
                                _numbered=("()", i, total))

                            try:
                                # Runs in the container if docker is in use
                                self.cmd_runner.run(cmd, run_env="auto")
                            except ProcessRunnerError as e:
                                if e.msg_type == "ssh_command_failed":
                                    cli_logger.error("Failed.")
                                    cli_logger.error(
                                        "See above for stderr.")
                                raise click.ClickException(
                                    "Setup command failed.")
            else:
                cli_logger.print(
                    "No setup commands to run.",
                    _numbered=("[]", 6, NUM_SETUP_STEPS))

    with cli_logger.group(
            "Starting the Ray runtime",
            _numbered=("[]", 7, NUM_SETUP_STEPS)):
        global_event_system.execute_callback(
            CreateClusterEvent.start_ray_runtime)
        with LogTimer(
                self.log_prefix + "Ray start commands", show_status=True):
            for cmd in self.ray_start_commands:
                # Add a resource override env variable if needed:
                if self.provider_type == "local":
                    # Local NodeProvider doesn't need resource override.
                    env_vars = {}
                elif self.node_resources:
                    env_vars = {
                        RESOURCES_ENVIRONMENT_VARIABLE: self.node_resources
                    }
                else:
                    env_vars = {}

                try:
                    old_redirected = cmd_output_util.is_output_redirected()
                    cmd_output_util.set_output_redirected(False)
                    # Runs in the container if docker is in use
                    self.cmd_runner.run(
                        cmd,
                        environment_variables=env_vars,
                        run_env="auto")
                    cmd_output_util.set_output_redirected(old_redirected)
                except ProcessRunnerError as e:
                    if e.msg_type == "ssh_command_failed":
                        cli_logger.error("Failed.")
                        cli_logger.error("See above for stderr.")
                    raise click.ClickException("Start command failed.")
    global_event_system.execute_callback(
        CreateClusterEvent.start_ray_runtime_completed)
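
# The up-to-date check in do_update above, pulled out as a pure function
# for clarity. This is a sketch (the name is hypothetical, not part of the
# updater); it reproduces the two-hash condition verbatim.
def _config_up_to_date(node_tags, runtime_hash, file_mounts_contents_hash):
    """True iff the runtime-config hash and, when tracked, the
    file-mounts contents hash both match the node's tags."""
    if node_tags.get(TAG_RAY_RUNTIME_CONFIG) != runtime_hash:
        return False
    return (not file_mounts_contents_hash
            or node_tags.get(TAG_RAY_FILE_MOUNTS_CONTENTS) ==
            file_mounts_contents_hash)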
def _create_node(self, node_config, tags, count):
    created_nodes_dict = {}

    tags = to_aws_format(tags)
    conf = node_config.copy()

    tag_pairs = [{
        "Key": TAG_RAY_CLUSTER_NAME,
        "Value": self.cluster_name,
    }]
    for k, v in tags.items():
        tag_pairs.append({
            "Key": k,
            "Value": v,
        })
    if CloudwatchHelper.cloudwatch_config_exists(self.provider_config,
                                                 "agent"):
        cwa_installed = self._check_ami_cwa_installation(node_config)
        if cwa_installed:
            tag_pairs.extend([{
                "Key": CLOUDWATCH_AGENT_INSTALLED_TAG,
                "Value": "True",
            }])
    tag_specs = [{
        "ResourceType": "instance",
        "Tags": tag_pairs,
    }]
    user_tag_specs = conf.get("TagSpecifications", [])
    AWSNodeProvider._merge_tag_specs(tag_specs, user_tag_specs)

    # SubnetIds is not a real config key: we must resolve to a
    # single SubnetId before invoking the AWS API.
    subnet_ids = conf.pop("SubnetIds")

    # update config with min/max node counts and tag specs
    conf.update({
        "MinCount": 1,
        "MaxCount": count,
        "TagSpecifications": tag_specs
    })

    # Try to always launch in the first listed subnet.
    subnet_idx = 0
    cli_logger_tags = {}
    # NOTE: This ensures that we try ALL availability zones before
    # throwing an error.
    max_tries = max(BOTO_CREATE_MAX_RETRIES, len(subnet_ids))
    for attempt in range(1, max_tries + 1):
        try:
            if "NetworkInterfaces" in conf:
                net_ifs = conf["NetworkInterfaces"]
                # remove security group IDs previously copied from network
                # interfaces (create_instances call fails otherwise)
                conf.pop("SecurityGroupIds", None)
                cli_logger_tags["network_interfaces"] = str(net_ifs)
            else:
                subnet_id = subnet_ids[subnet_idx % len(subnet_ids)]
                conf["SubnetId"] = subnet_id
                cli_logger_tags["subnet_id"] = subnet_id

            created = self.ec2_fail_fast.create_instances(**conf)
            created_nodes_dict = {n.id: n for n in created}

            # todo: timed?
            # todo: handle plurality?
            with cli_logger.group(
                    "Launched {} nodes", count, _tags=cli_logger_tags):
                for instance in created:
                    # NOTE(maximsmol): This is needed for mocking
                    # boto3 for tests. This is likely a bug in moto
                    # but AWS docs don't seem to say.
                    # You can patch moto/ec2/responses/instances.py
                    # to fix this (add <stateReason> to EC2_RUN_INSTANCES)

                    # The correct value is technically
                    # {"code": "0", "Message": "pending"}
                    state_reason = instance.state_reason or {
                        "Message": "pending"
                    }

                    cli_logger.print(
                        "Launched instance {}",
                        instance.instance_id,
                        _tags=dict(
                            state=instance.state["Name"],
                            info=state_reason["Message"]))
            break
        except botocore.exceptions.ClientError as exc:
            if attempt == max_tries:
                cli_logger.abort(
                    "Failed to launch instances. Max attempts exceeded.",
                    exc=exc,
                )
            else:
                cli_logger.warning(
                    "create_instances: Attempt failed with {}, retrying.",
                    exc)
                # Launch failure may be due to instance type availability
                # in the given AZ
                subnet_idx += 1

    return created_nodes_dict
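
# For reference, the TagSpecifications payload assembled above has the
# following shape. Key/Value strings here are hypothetical examples, not
# defaults.
EXAMPLE_TAG_SPECS = [{
    "ResourceType": "instance",
    "Tags": [
        {"Key": "ray-cluster-name", "Value": "default"},
        {"Key": "ray-node-type", "Value": "worker"},
    ],
}]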
def _configure_iam_role(config):
    head_node_type = config["head_node_type"]
    head_node_config = config["available_node_types"][head_node_type][
        "node_config"]
    if "IamInstanceProfile" in head_node_config:
        _set_config_info(head_instance_profile_src="config")
        return config
    _set_config_info(head_instance_profile_src="default")

    instance_profile_name = cwh.resolve_instance_profile_name(
        config["provider"],
        DEFAULT_RAY_INSTANCE_PROFILE,
    )
    profile = _get_instance_profile(instance_profile_name, config)

    if profile is None:
        cli_logger.verbose(
            "Creating new IAM instance profile {} for use as the default.",
            cf.bold(instance_profile_name),
        )
        client = _client("iam", config)
        client.create_instance_profile(
            InstanceProfileName=instance_profile_name)
        profile = _get_instance_profile(instance_profile_name, config)
        time.sleep(15)  # wait for propagation

    cli_logger.doassert(
        profile is not None,
        "Failed to create instance profile.")  # todo: err msg
    assert profile is not None, "Failed to create instance profile"

    if not profile.roles:
        role_name = cwh.resolve_iam_role_name(config["provider"],
                                              DEFAULT_RAY_IAM_ROLE)
        role = _get_role(role_name, config)
        if role is None:
            cli_logger.verbose(
                "Creating new IAM role {} for use as the default instance "
                "role.",
                cf.bold(role_name),
            )
            iam = _resource("iam", config)
            policy_doc = {
                "Statement": [
                    {
                        "Effect": "Allow",
                        "Principal": {"Service": "ec2.amazonaws.com"},
                        "Action": "sts:AssumeRole",
                    },
                ]
            }
            attach_policy_arns = cwh.resolve_policy_arns(
                config["provider"],
                iam,
                [
                    "arn:aws:iam::aws:policy/AmazonEC2FullAccess",
                    "arn:aws:iam::aws:policy/AmazonS3FullAccess",
                ],
            )

            iam.create_role(
                RoleName=role_name,
                AssumeRolePolicyDocument=json.dumps(policy_doc))
            role = _get_role(role_name, config)

            cli_logger.doassert(
                role is not None, "Failed to create role.")  # todo: err msg
            assert role is not None, "Failed to create role"

            for policy_arn in attach_policy_arns:
                role.attach_policy(PolicyArn=policy_arn)

        profile.add_role(RoleName=role.name)
        time.sleep(15)  # wait for propagation

    # Add IAM role to "head_node" field so that it is applied only to
    # the head node -- not to workers with the same node type as the head.
    config["head_node"]["IamInstanceProfile"] = {"Arn": profile.arn}

    return config
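
# Hedged usage sketch for the function above: a minimal bootstrap config
# carrying only the keys _configure_iam_role reads and writes (all values
# hypothetical).
example_config = {
    "head_node_type": "ray.head.default",
    "available_node_types": {
        "ray.head.default": {"node_config": {}},
    },
    "head_node": {},
    "provider": {"type": "aws", "region": "us-west-2"},
}
example_config = _configure_iam_role(example_config)
# example_config["head_node"]["IamInstanceProfile"] now carries the ARN of
# the resolved (or freshly created) instance profile.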
def _create_node(self, node_config, tags, count):
    created_nodes_dict = {}

    tags = to_aws_format(tags)
    conf = node_config.copy()

    tag_pairs = [{
        "Key": TAG_RAY_CLUSTER_NAME,
        "Value": self.cluster_name,
    }]
    for k, v in tags.items():
        tag_pairs.append({
            "Key": k,
            "Value": v,
        })
    tag_specs = [{
        "ResourceType": "instance",
        "Tags": tag_pairs,
    }]
    user_tag_specs = conf.get("TagSpecifications", [])

    # SubnetIds is not a real config key: we must resolve to a
    # single SubnetId before invoking the AWS API.
    subnet_ids = conf.pop("SubnetIds")

    # Allow users to add tags and override values of existing
    # tags with their own. This only applies to the resource type
    # "instance". All other resource types are appended to the list of
    # tag specs.
    for user_tag_spec in user_tag_specs:
        if user_tag_spec["ResourceType"] == "instance":
            for user_tag in user_tag_spec["Tags"]:
                exists = False
                for tag in tag_specs[0]["Tags"]:
                    if user_tag["Key"] == tag["Key"]:
                        exists = True
                        tag["Value"] = user_tag["Value"]
                        break
                if not exists:
                    tag_specs[0]["Tags"] += [user_tag]
        else:
            tag_specs += [user_tag_spec]

    # update config with min/max node counts and tag specs
    conf.update({
        "MinCount": 1,
        "MaxCount": count,
        "TagSpecifications": tag_specs
    })

    cli_logger_tags = {}
    for attempt in range(1, BOTO_CREATE_MAX_RETRIES + 1):
        try:
            if "NetworkInterfaces" in conf:
                net_ifs = conf["NetworkInterfaces"]
                # remove security group IDs previously copied from network
                # interfaces (create_instances call fails otherwise)
                conf.pop("SecurityGroupIds")
                cli_logger_tags["network_interfaces"] = str(net_ifs)
            else:
                subnet_id = subnet_ids[self.subnet_idx % len(subnet_ids)]
                self.subnet_idx += 1
                conf["SubnetId"] = subnet_id
                cli_logger_tags["subnet_id"] = subnet_id

            created = self.ec2_fail_fast.create_instances(**conf)
            created_nodes_dict = {n.id: n for n in created}

            # todo: timed?
            # todo: handle plurality?
            with cli_logger.group(
                    "Launched {} nodes", count, _tags=cli_logger_tags):
                for instance in created:
                    # NOTE(maximsmol): This is needed for mocking
                    # boto3 for tests. This is likely a bug in moto
                    # but AWS docs don't seem to say.
                    # You can patch moto/ec2/responses/instances.py
                    # to fix this (add <stateReason> to EC2_RUN_INSTANCES)

                    # The correct value is technically
                    # {"code": "0", "Message": "pending"}
                    state_reason = instance.state_reason or {
                        "Message": "pending"
                    }

                    cli_logger.print(
                        "Launched instance {}",
                        instance.instance_id,
                        _tags=dict(
                            state=instance.state["Name"],
                            info=state_reason["Message"]))
            break
        except botocore.exceptions.ClientError as exc:
            if attempt == BOTO_CREATE_MAX_RETRIES:
                # todo: err msg
                cli_logger.abort(
                    "Failed to launch instances. Max attempts exceeded.")
                raise exc
            else:
                cli_logger.print(
                    "create_instances: Attempt failed with {}, retrying.",
                    exc)

    # TODO: Idempotently correct CloudWatch setup errors on cached nodes?
    node_ids = [n.id for n in created]
    CloudwatchHelper(self.provider_config, node_ids, self.cluster_name). \
        setup_from_config()

    return created_nodes_dict
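
# The inline tag merge above has the same behavior the other _create_node
# variant delegates to AWSNodeProvider._merge_tag_specs. A standalone
# sketch of that helper (signature assumed from the call site):
def _merge_tag_specs(tag_specs, user_tag_specs):
    """Merge user tag specs into tag_specs in place: for the "instance"
    resource type, user tags override ours by Key; all other resource
    types are appended verbatim."""
    for user_tag_spec in user_tag_specs:
        if user_tag_spec["ResourceType"] != "instance":
            tag_specs.append(user_tag_spec)
            continue
        ours_by_key = {tag["Key"]: tag for tag in tag_specs[0]["Tags"]}
        for user_tag in user_tag_spec["Tags"]:
            if user_tag["Key"] in ours_by_key:
                ours_by_key[user_tag["Key"]]["Value"] = user_tag["Value"]
            else:
                tag_specs[0]["Tags"].append(user_tag)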
def _configure_iam_role(config):
    if "IamInstanceProfile" in config["head_node"]:
        _set_config_info(head_instance_profile_src="config")
        return config
    _set_config_info(head_instance_profile_src="default")

    instance_profile_name = cwh.resolve_instance_profile_name(
        config,
        DEFAULT_RAY_INSTANCE_PROFILE,
    )
    profile = _get_instance_profile(instance_profile_name, config)

    if profile is None:
        cli_logger.verbose(
            "Creating new IAM instance profile {} for use as the default.",
            cf.bold(instance_profile_name))
        client = _client("iam", config)
        client.create_instance_profile(
            InstanceProfileName=instance_profile_name)
        profile = _get_instance_profile(instance_profile_name, config)
        time.sleep(15)  # wait for propagation

    cli_logger.doassert(profile is not None,
                        "Failed to create instance profile.")  # todo: err msg
    assert profile is not None, "Failed to create instance profile"

    if not profile.roles:
        role_name = cwh.resolve_iam_role_name(config, DEFAULT_RAY_IAM_ROLE)
        role = _get_role(role_name, config)
        if role is None:
            cli_logger.verbose(
                "Creating new IAM role {} for "
                "use as the default instance role.", cf.bold(role_name))
            iam = _resource("iam", config)
            policy_doc = {
                "Statement": [
                    {
                        "Effect": "Allow",
                        "Principal": {
                            "Service": "ec2.amazonaws.com"
                        },
                        "Action": "sts:AssumeRole",
                    },
                ]
            }
            attach_policy_arns = cwh.resolve_policy_arns(
                config, [
                    "arn:aws:iam::aws:policy/AmazonEC2FullAccess",
                    "arn:aws:iam::aws:policy/AmazonS3FullAccess"
                ])

            iam.create_role(
                RoleName=role_name,
                AssumeRolePolicyDocument=json.dumps(policy_doc))
            role = _get_role(role_name, config)

            cli_logger.doassert(role is not None,
                                "Failed to create role.")  # todo: err msg
            assert role is not None, "Failed to create role"

            for policy_arn in attach_policy_arns:
                role.attach_policy(PolicyArn=policy_arn)

        profile.add_role(RoleName=role.name)
        time.sleep(15)  # wait for propagation

    config["head_node"]["IamInstanceProfile"] = {"Arn": profile.arn}

    return config
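
# Both variants above rely on _get_instance_profile/_get_role returning
# None for a missing entity so the caller can create it. A plausible
# sketch of such a lookup (not necessarily the project's implementation;
# assumes botocore is imported and _resource builds a boto3 IAM resource):
def _get_instance_profile_sketch(profile_name, config):
    profile = _resource("iam", config).InstanceProfile(profile_name)
    try:
        profile.load()  # raises ClientError if the profile does not exist
        return profile
    except botocore.exceptions.ClientError as exc:
        if exc.response.get("Error", {}).get("Code") == "NoSuchEntity":
            return None
        raise  # propagate anything other than "profile not found"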