async def _delete_security_groups(self):
    """Delete the security group named after this cluster, retrying on failure.

    Calls the EC2 client's ``delete_security_group`` with
    ``GroupName=self.cluster_name``. If the call raises, waits two seconds
    and tries again until it succeeds or the 30 second ``Timeout`` stops the
    loop. Exceptions from the delete call are never propagated from here;
    this is best-effort cleanup.
    """
    # NOTE(review): warn=True presumably makes an expired timeout emit a
    # warning and end the loop instead of raising -- confirm against Timeout.
    timeout = Timeout(
        30, "Unable to delete AWS security group " + self.cluster_name, warn=True
    )
    while timeout.run():
        try:
            await self._clients["ec2"].delete_security_group(
                GroupName=self.cluster_name, DryRun=False
            )
        except Exception:
            # The group may still be referenced by resources that are being
            # torn down (e.g. network interfaces), so wait and try again.
            await asyncio.sleep(2)
        else:
            # Deleted successfully: stop retrying.
            break
async def _set_address_from_logs(self):
    """Find the address announced in the task logs and store it on ``self``.

    Repeatedly iterates ``self.logs()`` looking for a line containing
    ``"worker at:"`` or ``"Scheduler at:"``. The text following the marker
    (stripped) becomes ``self.address``. When ``self._use_public_ip`` is
    set, ``self.external_address`` is also set to that address with
    ``self.private_ip`` replaced by ``self.public_ip``.

    Raises:
        RuntimeError: if a full pass over the logs finds no address and
            ``self._task_is_running()`` reports the task is not running.
        Whatever ``Timeout.run()`` raises once the 30 second budget is
        spent (behaviour of ``Timeout`` is defined elsewhere).
    """
    timeout = Timeout(
        30, "Failed to find %s ip address after 30 seconds." % self.task_type
    )
    markers = ("worker at:", "Scheduler at:")

    while timeout.run():
        async for log_line in self.logs():
            for marker in markers:
                if marker not in log_line:
                    continue
                found = log_line.split(marker)[1].strip()
                if self._use_public_ip:
                    self.external_address = found.replace(
                        self.private_ip, self.public_ip
                    )
                logger.debug("%s", log_line)
                self.address = found
                return

        # No address in this pass over the logs. Give up early if the task
        # has stopped; otherwise go round again until the timeout fires.
        if not await self._task_is_running():
            raise RuntimeError("%s exited unexpectedly!" % type(self).__name__)
async def start(self):
    """Launch the ECS task, wait until it is running and record its addresses.

    Steps, in order:

    1. Call ECS ``run_task`` (retrying once per second on any exception)
       until a task is returned; the last exception is handed to the
       60 second ``Timeout`` via ``set_exception``.
    2. Poll with ``self._update_task()`` while the task's ``lastStatus`` is
       ``PENDING`` or ``PROVISIONING``.
    3. Look up the task's elastic network interface through the EC2 client
       and store ``self.private_ip`` (and ``self.public_ip`` when
       ``self._use_public_ip`` is set).
    4. Read the service address from the logs and set ``self.status`` to
       ``"running"``.

    Raises:
        RuntimeError: if the task is not running once it leaves the
            pending/provisioning states.
        ValueError: from the single-element unpackings if the response does
            not contain exactly one task / ENI attachment / interface.
    """
    launch_timeout = Timeout(
        60, "Unable to start %s after 60 seconds" % self.task_type
    )

    while launch_timeout.run():
        try:
            # Tags are only supported if you opt into long arn format so we
            # need to check for that
            if await self._is_long_arn_format_enabled():
                tag_kwargs = {"tags": dict_to_aws(self.tags)}
            else:
                tag_kwargs = {}

            run_task = self._clients["ecs"].run_task

            container_override = {
                "name": "dask-{}".format(self.task_type),
                "environment": dict_to_aws(self.environment, key_string="name"),
                # Caller-supplied overrides come last so they take precedence.
                **self._overrides,
            }

            if self.fargate:
                launch_type = "FARGATE"
            else:
                launch_type = "EC2"

            if self._use_public_ip:
                assign_public_ip = "ENABLED"
            else:
                assign_public_ip = "DISABLED"

            vpc_configuration = {
                "subnets": self._vpc_subnets,
                "securityGroups": self._security_groups,
                "assignPublicIp": assign_public_ip,
            }

            response = await run_task(
                cluster=self.cluster_arn,
                taskDefinition=self.task_definition_arn,
                overrides={"containerOverrides": [container_override]},
                count=1,
                launchType=launch_type,
                networkConfiguration={"awsvpcConfiguration": vpc_configuration},
                **tag_kwargs
            )

            if not response.get("tasks"):
                # Put the entire response in the error for diagnosis.
                raise RuntimeError(response)

            [self.task] = response["tasks"]
            break
        except Exception as launch_error:
            # Remember the failure for the timeout, then retry shortly.
            launch_timeout.set_exception(launch_error)
            await asyncio.sleep(1)

    self.task_arn = self.task["taskArn"]

    # Wait for the task to leave its start-up states.
    # NOTE(review): _update_task presumably refreshes self.task -- confirm.
    while self.task["lastStatus"] in ("PENDING", "PROVISIONING"):
        await asyncio.sleep(1)
        await self._update_task()

    if not await self._task_is_running():
        raise RuntimeError("%s failed to start" % type(self).__name__)

    # Exactly one ENI attachment with exactly one networkInterfaceId detail
    # is expected; the unpackings below fail otherwise.
    eni_attachments = [
        candidate
        for candidate in self.task["attachments"]
        if candidate["type"] == "ElasticNetworkInterface"
    ]
    [eni_attachment] = eni_attachments

    interface_ids = [
        entry["value"]
        for entry in eni_attachment["details"]
        if entry["name"] == "networkInterfaceId"
    ]
    [network_interface_id] = interface_ids

    described = await self._clients["ec2"].describe_network_interfaces(
        NetworkInterfaceIds=[network_interface_id]
    )
    [interface] = described["NetworkInterfaces"]

    if self._use_public_ip:
        self.public_ip = interface["Association"]["PublicIp"]
    self.private_ip = interface["PrivateIpAddresses"][0]["PrivateIpAddress"]

    await self._set_address_from_logs()
    self.status = "running"
async def create_vm(self):
    """Launch one EC2 instance for this process and return its public IP.

    Missing settings are filled in first (``self.vpc``, ``self.subnet_id``,
    ``self.security_groups``, ``self.ami``) using the helper lookups, then
    ``run_instances`` is called, the instance is tagged with its name and
    the cluster uuid, and ``describe_instances`` is polled with exponential
    backoff until the instance reports a ``PublicIpAddress``.

    If ``self.availability_zone`` is a list, one entry is picked at random
    and stored back on ``self.availability_zone``.

    Returns:
        The instance's ``PublicIpAddress`` as reported by EC2.

    Raises:
        Whatever ``Timeout.run()`` raises when no public IP shows up within
        300 seconds. The most recent ``ClientError`` from polling is passed
        to the timeout via ``set_exception`` (NOTE(review): presumably so it
        is the error surfaced on expiry -- confirm against ``Timeout``).

    https://botocore.amazonaws.com/v1/documentation/api/latest/reference/services/ec2.html#EC2.Client.run_instances
    """
    # TODO Enable Spot support
    async with self.cluster.boto_session.create_client(
        "ec2", region_name=self.region
    ) as client:
        # Fall back to discovered defaults for anything not configured.
        self.vpc = self.vpc or await get_default_vpc(client)
        self.subnet_id = (
            self.subnet_id or (await get_vpc_subnets(client, self.vpc))[0]
        )
        self.security_groups = self.security_groups or [
            await get_security_group(client, self.vpc)
        ]
        self.ami = self.ami or await get_latest_ami_id(
            client,
            "ubuntu/images/hvm-ssd/ubuntu-focal-20.04-amd64-server-*",
            "099720109477",  # Canonical
        )

        vm_kwargs = {
            "BlockDeviceMappings": [
                {
                    "DeviceName": "/dev/sda1",
                    "VirtualName": "sda1",
                    "Ebs": {
                        "DeleteOnTermination": True,
                        "VolumeSize": self.filesystem_size,
                        "VolumeType": "gp2",
                        "Encrypted": False,
                    },
                }
            ],
            "ImageId": self.ami,
            "InstanceType": self.instance_type,
            "MaxCount": 1,
            "MinCount": 1,
            "Monitoring": {"Enabled": False},
            "UserData": self.cluster.render_process_cloud_init(self),
            "InstanceInitiatedShutdownBehavior": "terminate",
            "NetworkInterfaces": [
                {
                    "AssociatePublicIpAddress": True,
                    "DeleteOnTermination": True,
                    "Description": "public",
                    "DeviceIndex": 0,
                    "Groups": self.security_groups,
                    "SubnetId": self.subnet_id,
                }
            ],
        }

        # Optional settings are only sent when configured.
        if self.key_name:
            vm_kwargs["KeyName"] = self.key_name

        if self.iam_instance_profile:
            vm_kwargs["IamInstanceProfile"] = self.iam_instance_profile

        if self.availability_zone:
            if isinstance(self.availability_zone, list):
                self.availability_zone = random.choice(self.availability_zone)
            vm_kwargs["Placement"] = {"AvailabilityZone": self.availability_zone}

        response = await client.run_instances(**vm_kwargs)
        [self.instance] = response["Instances"]
        await client.create_tags(
            Resources=[self.instance["InstanceId"]],
            Tags=[
                {"Key": "Name", "Value": self.name},
                {"Key": "Dask Cluster", "Value": self.cluster.uuid},
            ],
        )
        self.cluster._log(
            f"Created instance {self.instance['InstanceId']} as {self.name}"
        )

        timeout = Timeout(
            300,
            f"Failed Public IP for instance {self.instance['InstanceId']}",
        )
        # Initialised once, outside the loop, so the delay actually grows
        # between polls instead of being reset on every iteration.
        backoff = 0.1
        while self.instance.get("PublicIpAddress") is None and timeout.run():
            # Exponential backoff with a cap of 10 seconds, plus the
            # fractional part of the current backoff as a small offset.
            await asyncio.sleep(min(backoff, 10) + backoff % 1)
            try:
                response = await client.describe_instances(
                    InstanceIds=[self.instance["InstanceId"]], DryRun=False
                )
                [reservation] = response["Reservations"]
                [self.instance] = reservation["Instances"]
            except botocore.exceptions.ClientError as e:
                # Keep the latest API error for the timeout and poll again.
                timeout.set_exception(e)
            backoff = backoff * 2
        return self.instance["PublicIpAddress"]