def grow_ebs_for_task(task_fragment, target_size_gb):
    """Grow the EBS volume of the instance whose name contains task_fragment.

    Instances are scanned in order of increasing launch timestamp and the
    first one whose name contains ``task_fragment`` is used.

    Args:
        task_fragment: substring to look for in instance names.
        target_size_gb: new size, passed as ``Size`` to ``modify_volume``.

    Raises:
        ValueError: if no instance name contains ``task_fragment``.
        AssertionError: if the instance does not have exactly one volume, or
            ``u.is_good_response`` rejects the ``modify_volume`` response.
    """
    ec2 = u.create_ec2_resource()
    client = u.create_ec2_client()

    launch_times = [(u.seconds_from_datetime(i.launch_time), i)
                    for i in ec2.instances.all()]
    for seconds, instance in sorted(launch_times, key=itemgetter(0)):
        task_name = u.get_name(instance.tags)
        if task_fragment in task_name:
            hours_ago = (time.time() - seconds) / 3600
            # NOTE(review): fixed 8 hour offset kept from the original
            # ("adjust for time being in UTC") -- confirm it is still wanted.
            hours_ago += 8
            print("Found instance %s launched %.1f hours ago" %
                  (task_name, hours_ago))
            break
    else:
        # Without this guard the code below would silently resize the volume
        # of whichever instance happened to be scanned last.
        raise ValueError("No instance with name containing %r" %
                         (task_fragment,))

    print(instance.id)
    volumes = list(instance.volumes.all())
    assert len(volumes) == 1, "Must have 1 volume"
    print("Growing %s to %s" % (volumes[0].id, target_size_gb))
    response = client.modify_volume(
        VolumeId=volumes[0].id,
        Size=target_size_gb,
    )
    assert u.is_good_response(response)
def keypair_setup():
    """Return the EC2 key pair named KEYPAIR_NAME, creating it if necessary.

    When the key pair already exists, the local private key file
    (``u.get_keypair_fn(KEYPAIR_NAME)``) must exist and be non-empty.
    Otherwise a new key pair is created and its key material is written to
    that file, which is then made read-only for the owner.

    Returns:
        The key pair object (not the contents of the private key file).

    Raises:
        AssertionError: if an existing key pair has no usable local file, or
            a stale local file is in the way of creating a new key pair.
    """
    existing_keypairs = u.get_keypair_dict()
    keypair = existing_keypairs.get(KEYPAIR_NAME, None)
    keypair_fn = u.get_keypair_fn(KEYPAIR_NAME)
    if keypair:
        print("Reusing keypair "+KEYPAIR_NAME)
        # check that local pem file exists and is readable
        assert os.path.exists(keypair_fn)
        with open(keypair_fn) as f:
            keypair_contents = f.read()
        assert len(keypair_contents)>0
        # todo: check that fingerprint matches keypair.key_fingerprint
        return keypair

    print("Creating keypair "+KEYPAIR_NAME)
    # Check for a stale local file BEFORE creating the key pair: the private
    # key material is only available at creation time, so failing afterwards
    # would leave a key pair in AWS whose private key is lost.
    assert not os.path.exists(keypair_fn), "previous, keypair exists, delete it with 'sudo rm %s'"%(keypair_fn)
    ec2 = u.create_ec2_resource()
    keypair = ec2.create_key_pair(KeyName=KEYPAIR_NAME)
    with open(keypair_fn, 'w') as f:
        f.write(keypair.key_material)
    os.chmod(keypair_fn, 0o400)
    return keypair
def keypair_setup():
    """Return the EC2 key pair named KEYPAIR_NAME, creating it if necessary.

    Makes sure the ``u.PRIVATE_KEY_LOCATION`` directory exists. When the key
    pair already exists, the local file ``u.get_keypair_fn()`` must exist and
    be non-empty. Otherwise a new key pair is created and its key material is
    written to that file, which is then made read-only for the owner.

    Returns:
        The key pair object (not the contents of the private key file).

    Raises:
        AssertionError: if the key pair and the local .pem file are out of
            sync (one exists without the other, or the file is empty).
    """
    # expanduser mirrors the tilde expansion the previous `mkdir -p` shell
    # call performed -- presumably the location is already absolute.
    os.makedirs(os.path.expanduser(u.PRIVATE_KEY_LOCATION), exist_ok=True)

    keypair = u.get_keypair_dict().get(KEYPAIR_NAME, None)
    keypair_fn = u.get_keypair_fn()
    if keypair:
        print("Reusing keypair " + KEYPAIR_NAME)
        # check that local pem file exists and is readable
        assert os.path.exists(
            keypair_fn
        ), "Keypair %s exists, but corresponding .pem file %s is not found, delete keypair %s through console and run again to recreate keypair/.pem together" % (
            KEYPAIR_NAME, keypair_fn, KEYPAIR_NAME)
        with open(keypair_fn) as f:
            keypair_contents = f.read()
        assert len(keypair_contents) > 0
        # todo: check that fingerprint matches keypair.key_fingerprint
    else:
        print("Creating keypair " + KEYPAIR_NAME)
        # Refuse to overwrite a stale local key before touching AWS.
        assert not os.path.exists(
            keypair_fn
        ), "previous keypair exists, delete it with 'sudo rm %s' and also delete corresponding keypair through console" % (
            keypair_fn)
        ec2 = u.create_ec2_resource()
        keypair = ec2.create_key_pair(KeyName=KEYPAIR_NAME)
        with open(keypair_fn, 'w') as f:
            f.write(keypair.key_material)
        os.chmod(keypair_fn, 0o400)
    return keypair
def main():
    """Create io1 volume replicas from the one snapshot matching a description.

    Reads ``args`` (snapshot_desc, zone, size_gb, replicas, volume_offset,
    iops). The zone falls back to the ``zone`` environment variable and the
    size falls back to the snapshot's own volume size. Exactly one snapshot
    owned by this account must contain ``args.snapshot_desc`` in its
    description.
    """
    ec2 = u.create_ec2_resource()
    assert not args.snapshot, "Switched to snapshot_desc"

    if not args.zone:
        assert 'zone' in os.environ, 'must specify --zone or $zone'
        args.zone = os.environ['zone']

    matching = [
        candidate for candidate in ec2.snapshots.filter(OwnerIds=['self'])
        if args.snapshot_desc in candidate.description
    ]
    assert len(matching)>0, f"no snapshot matching {args.snapshot_desc}"
    assert len(matching)<2, f"multiple snapshots matching {args.snapshot_desc}"
    source_snapshot = matching[0]

    if not args.size_gb:
        args.size_gb = source_snapshot.volume_size

    print(f"Making {args.replicas} {args.size_gb} GB replicas in {args.zone}")

    # Volumes are numbered volume_offset .. volume_offset + replicas - 1.
    for index in range(args.volume_offset, args.replicas+args.volume_offset):
        replica_name = 'imagenet_%02d' % index
        replica = ec2.create_volume(Size=args.size_gb,
                                    VolumeType='io1',
                                    TagSpecifications=create_tags(replica_name),
                                    AvailabilityZone=args.zone,
                                    SnapshotId=source_snapshot.id,
                                    Iops=args.iops)
        print(f"Creating {replica_name} {replica.id}")
def main():
    """Stop or terminate instances whose name contains the global ``fragment``.

    Candidates are filtered using ``args.skip_tensorboard``,
    ``args.skip_stopped`` and ``args.limit_to_key``; terminated instances are
    always skipped. Unless ``args.yes`` is set the user is asked to confirm.
    With ``args.soft`` the instances are stopped instead of terminated, and
    ``args.delay`` adds a sleep before the action.
    """
    ec2 = u.create_ec2_resource()  # ec2 resource
    ec2_client = u.create_ec2_client()  # ec2 client
    instances = list(ec2.instances.all())  # todo: use filter?
    region = u.get_region()

    instances_to_kill = []
    for i in instances:
        name = u.get_name(i.tags)
        state = i.state['Name']
        if fragment not in name:
            continue
        if args.skip_tensorboard and '.tb.' in name:
            continue
        if args.skip_stopped and state == 'stopped':
            continue
        # key_name can be None (instance launched without a key pair), which
        # would make the `in` test raise TypeError.
        if args.limit_to_key and USER_KEY_NAME not in (i.key_name or ''):
            continue
        if state == 'terminated':
            continue
        instances_to_kill.append(i)
        print(u.get_name(i), i.instance_type, i.key_name,
              state if state == 'stopped' else '')

    # print extra info if couldn't find anything to kill
    if not instances_to_kill:
        valid_names = sorted(
            {"%s,%s" % (u.get_name(i), u.get_state(i)) for i in instances})
        from pprint import pprint as pp
        print("Current instances:")
        pp(valid_names)
        print("No running instances found for: Name '%s', key '%s'" %
              (fragment, USER_KEY_NAME))
        if args.skip_tensorboard:
            print("skipping tensorboard")
        return

    action = 'soft terminate' if args.soft else 'terminate'
    if args.yes:
        confirmed = True
    else:
        answer = input("%d instances found, %s in %s? (y/N) " %
                       (len(instances_to_kill), action, region))
        confirmed = answer.lower() == "y"

    if not confirmed:
        print("Didn't get y, doing nothing")
        return

    instance_ids = [i.id for i in instances_to_kill]
    if args.delay:
        print(f"Sleeping for {args.delay} seconds")
        time.sleep(args.delay)
    if args.soft:
        response = ec2_client.stop_instances(InstanceIds=instance_ids)
        # print() does not interpolate like logging does; format explicitly.
        print("soft terminating, got response: %s" % (response,))
    else:
        response = ec2_client.terminate_instances(InstanceIds=instance_ids)
        print("terminating, got response: %s" % (response,))
def list_spot_requests():
    """Print id, instance type, key name and state of each spot request."""
    client = u.create_ec2_client()
    response = client.describe_spot_instance_requests()
    for request in response['SpotInstanceRequests']:
        # KeyName is optional in a launch specification, so read the
        # launch-spec fields with .get() instead of failing on the first
        # request that lacks one.
        launch_spec = request.get('LaunchSpecification', {})
        print(request['SpotInstanceRequestId'],
              launch_spec.get('InstanceType'),
              launch_spec.get('KeyName'),
              request['State'])
def attach_instance_ebs(aws_instance, tag, unix_device=u.DEFAULT_UNIX_DEVICE):
    """Attach the volume named ``tag`` (in zone ``$zone``) to ``aws_instance``.

    If the volume is attached elsewhere it is detached first, polling every
    ATTACH_WAIT_INTERVAL_SEC until it reports ``available``. Attaching is
    retried indefinitely on error.

    Args:
        aws_instance: instance to attach to (its ``id`` is used).
        tag: value of the volume's ``Name`` tag.
        unix_device: device name passed as ``Device`` when attaching.

    Raises:
        AssertionError: if no volume with that name exists in the zone.
        KeyError: if the ``zone`` environment variable is not set.
    """
    ec2 = u.create_ec2_resource()
    v = list(
        ec2.volumes.filter(Filters=[{
            'Name': 'tag:Name',
            'Values': [tag]
        }, {
            "Name": "availability-zone",
            'Values': [os.environ['zone']]
        }]).all())
    assert (v), f"Volume {tag} not found."
    v = v[0]
    volume_name = u.get_name(v)
    already_attached = v.attachments and v.attachments[0][
        'InstanceId'] == aws_instance.id
    instance_name = u.get_name(aws_instance)
    # TODO: still have edge case when it doesn't report as already attached
    # and keeps trying to attach to an instance that has data mounted already
    if already_attached:
        print(
            f'volume {volume_name} ({v.id}) already attached to {instance_name}'
        )
        return

    while v.state != 'available':
        response = v.detach_from_instance()
        instance_id = v.attachments[0]['InstanceId']
        # Use a separate name for the current owner: overwriting
        # instance_name here made the "Attaching ..." message below report
        # the previous owner instead of the target instance.
        owner_name = u.get_name(instance_id)
        print(
            f'Volume {tag} is attached to {owner_name}, detaching, response={response.get("State", "none")}'
        )
        time.sleep(ATTACH_WAIT_INTERVAL_SEC)
        v.reload()

    while True:
        try:
            response = v.attach_to_instance(InstanceId=aws_instance.id,
                                            Device=unix_device)
            print(
                f'Attaching {volume_name} to {instance_name}: response={response.get("State", "none")}'
            )
        # Sometimes have unrecoverable failure on brand new instance,
        # possibly because of
        # https://forums.aws.amazon.com/thread.jspa?threadID=66192
        # The error looks like: (InvalidParameterValue) when calling the
        # AttachVolume operation: Invalid value '/dev/xvdf' for unixDevice.
        # Attachment point /dev/xvdf is already in use
        except Exception as e:
            print(f"Failed attaching ({v.id}) to ({aws_instance.id})")
            print(
                f'Error attaching volume: ({e}). Retrying in {ATTACH_WAIT_INTERVAL_SEC}',
                e)
            time.sleep(ATTACH_WAIT_INTERVAL_SEC)
            continue
        else:
            print('Attachment successful')
            break
def main():
    """SSH into the first instance returned for ``args.fragment``.

    The login name comes from ``$USERNAME`` (default ``ubuntu``). If the
    first ssh command exits non-zero, a second attempt is made with an
    alternative username.
    """
    fragment = args.fragment

    # TODO: prevent CTRL+c/CTRL+d from killing session
    if not args.skip_tmux:
        print("Launching into TMUX session, use CTRL+b d to exit")

    username = os.environ.get("USERNAME", "ubuntu")
    print("Using username '%s'" % (username, ))

    from tzlocal import get_localzone  # $ pip install tzlocal

    # The previous version also issued describe_instances() and scanned all
    # instances into a list that was never used; u.get_instances() is the
    # only lookup whose result is consumed.
    filtered_instance_list = u.get_instances(fragment)
    if not filtered_instance_list:
        print("no instance id contains fragment '%s'" % (fragment, ))
        return

    # connect to most recent instance
    # (presumably u.get_instances returns newest first -- TODO confirm)
    print(filtered_instance_list)
    instance = filtered_instance_list[0]
    print("Connecting to ", u.get_name(instance), " launched ",
          instance.launch_time.astimezone(get_localzone()))

    keypair_fn = u.get_keypair_fn()
    cmd = make_cmd(keypair_fn, username, instance.public_ip_address)
    print(cmd)
    result = os.system(cmd)
    if result != 0:
        # NOTE(review): the fallback usernames below look redacted
        # ('******'); presumably they were meant to swap between 'ubuntu'
        # and 'ec2-user' -- confirm and restore the intended values.
        if username == 'ubuntu':
            username = '******'
        elif username == 'ec2-user':
            username = '******'
        print("ssh failed with code %d, trying username %s" %
              (result, username))
        cmd = make_cmd(keypair_fn, username, instance.public_ip_address)
        os.system(cmd)
def main():
    """Command-line entry point: ``list`` VPCs (default) or ``delete`` one.

    ``delete`` takes one extra argument, a VPC name or id, and requires
    AWS_DEFAULT_REGION to be set. Subnets, internet gateways, security groups
    and route tables of every matching VPC are removed before the VPC itself.
    Any other mode does nothing.
    """
    mode = sys.argv[1] if len(sys.argv) >= 2 else 'list'

    if mode == 'list':
        list_vpcs()
        return
    if mode != 'delete':
        return

    assert len(sys.argv) == 3
    assert 'AWS_DEFAULT_REGION' in os.environ

    client = u.create_ec2_client()
    ec2 = u.create_ec2_resource()
    target = sys.argv[2]

    for vpc_response in client.describe_vpcs()['Vpcs']:
        vpc_name = _get_name(vpc_response.get('Tags', []))
        vpc = ec2.Vpc(vpc_response['VpcId'])
        if vpc_name != target and vpc.id != target:
            continue

        print("Deleting VPC name=%s, id=%s" % (vpc_name, vpc.id))

        for subnet in vpc.subnets.all():
            print("Deleting subnet %s" % (subnet.id))
            assert u.is_good_response(subnet.delete())

        for gateway in vpc.internet_gateways.all():
            print("Deleting gateway %s" % (gateway.id))
            detach_response = gateway.detach_from_vpc(VpcId=vpc.id)
            assert u.is_good_response(detach_response)
            assert u.is_good_response(gateway.delete())

        # Security group and route table deletion is best-effort: failures
        # are reported and the loop carries on.
        for security_group in vpc.security_groups.all():
            try:
                assert u.is_good_response(security_group.delete())
            except Exception as e:
                print("Failed with " + str(e))

        for route_table in vpc.route_tables.all():
            print("Deleting route table %s" % (route_table.id))
            try:
                assert u.is_good_response(route_table.delete())
            except Exception as e:
                print("Failed with " + str(e))

        if u.is_good_response(client.delete_vpc(VpcId=vpc.id)):
            print("Succeeded deleting VPC ", vpc.id)
def placement_group_setup(group_name):
    """Return the placement group ``group_name``, creating it when absent.

    An existing group must be in state ``available`` with strategy
    ``cluster``; a new group is created with strategy ``cluster``.

    Returns:
        The existing or newly created placement group object.
    """
    group = u.get_placement_group_dict().get(group_name, None)

    if not group:
        print("Creating group "+group_name)
        ec2 = u.create_ec2_resource()
        return ec2.create_placement_group(GroupName=group_name,
                                          Strategy='cluster')

    assert group.state == 'available'
    assert group.strategy == 'cluster'
    print("Reusing group ", group.name)
    return group
def cancel_spot_requests():
    """Cancel every spot instance request that is not cancelled or closed.

    Prints one line per request being cancelled.
    """
    client = u.create_ec2_client()
    response = client.describe_spot_instance_requests()
    for request in response['SpotInstanceRequests']:
        state = request['State']
        if state in ('cancelled', 'closed'):
            continue

        # The launch-spec fields are only used for the log line; read them
        # with .get() so a request without a KeyName does not abort the
        # cancellation of the remaining requests.
        launch_spec = request.get('LaunchSpecification', {})
        print('cancelling', request['SpotInstanceRequestId'],
              launch_spec.get('InstanceType'), launch_spec.get('KeyName'),
              request['State'])

        client.cancel_spot_instance_requests(
            SpotInstanceRequestIds=[request['SpotInstanceRequestId']])
def get_instance(fragment):
    """Return the most recently launched instance whose name has ``fragment``.

    Args:
        fragment: substring to look for in instance names.

    Returns:
        The matching instance, or None (after printing a message) when no
        instance name contains ``fragment``.
    """
    ec2 = u.create_ec2_resource()
    instances = [(u.seconds_from_datetime(i.launch_time), i)
                 for i in ec2.instances.all()]

    # latest instance first
    sorted_instances = reversed(sorted(instances, key=itemgetter(0)))
    for (seconds, instance) in sorted_instances:
        # Look the name up once and reuse it for both the test and the log.
        name = u.get_name(instance.tags)
        if fragment in name:
            hours_ago = (time.time() - seconds) / 3600
            # NOTE(review): fixed 8 hour offset kept from the original
            # ("adjust for time being in UTC") -- confirm it is still wanted.
            hours_ago += 8
            print("Found instance %s launched %.1f hours ago" % (
                name,
                hours_ago,
            ))
            return instance

    print("Found nothing matching", fragment)
    return None
def list_instances():
    """Print running instances whose key name contains LIMIT_TO_KEY.

    Instances are listed in order of increasing launch timestamp. Each line
    shows hours since launch, name, instance type, public IP, private IP and
    instance id.
    """
    ec2 = u.create_ec2_resource()
    instances = [(u.seconds_from_datetime(i.launch_time), i)
                 for i in ec2.instances.all()]
    sorted_instances = sorted(instances, key=itemgetter(0))

    for (seconds, instance) in sorted_instances:
        if instance.state['Name'] != 'running':
            continue
        # key_name can be None (instance launched without a key pair), which
        # would make the `in` test raise TypeError.
        if LIMIT_TO_KEY not in (instance.key_name or ''):
            continue

        hours_ago = (time.time() - seconds) / 3600
        # NOTE(review): fixed 8 hour offset kept from the original
        # ("adjust for time being in UTC") -- confirm it is still wanted.
        hours_ago += 8

        print("%4s %20s %10s %20s %s %s" %
              (int(hours_ago), u.get_name(instance.tags),
               instance.instance_type, instance.public_ip_address,
               instance.private_ip_address, instance.id))
def attach_instance_ebs(aws_instance, tag, unix_device=u.DEFAULT_UNIX_DEVICE):
    """Attach the volume whose Name tag is ``tag`` to ``aws_instance``.

    Returns immediately when the volume's first attachment already points at
    this instance. A volume that is not ``available`` gets a single detach
    request; the attach call is then retried every ATTACH_WAIT_INTERVAL_SEC
    seconds until it succeeds.

    Raises:
        AssertionError: if no volume carries the given Name tag.
    """
    ec2 = u.create_ec2_resource()
    name_filter = {'Name': 'tag:Name', 'Values': [tag]}
    matches = list(ec2.volumes.filter(Filters=[name_filter]).all())
    assert (matches), f"Volume {tag} not found."
    volume = matches[0]

    if volume.attachments and volume.attachments[0][
            'InstanceId'] == aws_instance.id:
        print(f'volume {volume} already attached')
        return

    if volume.state != 'available':
        reply = volume.detach_from_instance()
        print(
            f'Detaching from current instance: response={reply.get("State", "none")}'
        )

    # Known failure mode on brand new instances (possibly
    # https://forums.aws.amazon.com/thread.jspa?threadID=66192):
    # InvalidParameterValue "Attachment point /dev/xvdf is already in use".
    # Every failure is simply retried after a pause.
    attached = False
    while not attached:
        try:
            reply = volume.attach_to_instance(InstanceId=aws_instance.id,
                                              Device=unix_device)
            print(
                f'Attaching to current instance: response={reply.get("State", "none")}'
            )
        except Exception as e:
            print(f"Failed attaching ({volume.id}) to ({aws_instance.id})")
            print(
                f'Error attaching volume: ({e}). Retrying in {ATTACH_WAIT_INTERVAL_SEC}',
                e)
            time.sleep(ATTACH_WAIT_INTERVAL_SEC)
        else:
            print('Attachment successful')
            attached = True
def list_ebss():
    """Print each instance with the id and size of its attached volumes.

    Instances appear in order of increasing launch timestamp, one per line:

        imagenet: vol-024347797a6ab11e8 (1500)
        box00.gpubox.0: vol-0c69c68295a89cde5 (50)
        nodes.example.com: vol-0a98ee5f7f155b2b7 (128),vol-048e29f604d2900a7 (100)
    """
    ec2 = u.create_ec2_resource()
    stamped = [(u.seconds_from_datetime(inst.launch_time), inst)
               for inst in ec2.instances.all()]
    stamped.sort(key=itemgetter(0))

    for _, instance in stamped:
        descriptions = [
            "%s (%s)" % (vol.id, vol.size) for vol in instance.volumes.all()
        ]
        print("%s: %s" % (u.get_name(instance.tags), ','.join(descriptions)))
def main():
    """Create io1 volume replicas from the one snapshot matching ``args``.

    The snapshot is selected by description (``args.snapshot``) and owner id
    (``args.snapshot_account``); exactly one must match. Volumes are created
    in the zone named by the ``ZONE`` environment variable, sized by
    ``args.size_gb`` or, when that is unset, by the snapshot's volume size.
    """
    ec2 = u.create_ec2_resource()
    assert 'ZONE' in os.environ
    zone = os.environ['ZONE']

    # Filtering by the Name tag doesn't work, tags are somehow not public:
    # https://stackoverflow.com/questions/51887270/how-to-make-snapshot-tags-public
    # so the snapshot is looked up by description and owner instead.
    description_filter = {'Name': 'description', 'Values': [args.snapshot]}
    owner_filter = {'Name': 'owner-id', 'Values': [args.snapshot_account]}
    matching = list(
        ec2.snapshots.filter(Filters=[description_filter, owner_filter]))

    assert len(matching) > 0, f"no snapshot matching {args.snapshot}"
    assert len(matching) < 2, f"multiple snapshots matching {args.snapshot}"
    source_snapshot = matching[0]

    if not args.size_gb:
        args.size_gb = source_snapshot.volume_size

    print(f"Making {args.replicas} {args.size_gb} GB replicas in {zone}")

    # Volumes are numbered volume_offset .. volume_offset + replicas - 1.
    for index in range(args.volume_offset, args.replicas + args.volume_offset):
        replica_name = 'imagenet_%02d' % index
        replica = ec2.create_volume(Size=args.size_gb,
                                    VolumeType='io1',
                                    TagSpecifications=create_tags(replica_name),
                                    AvailabilityZone=zone,
                                    SnapshotId=source_snapshot.id,
                                    Iops=args.iops)
        print(f"Creating {replica_name} {replica.id}")
def list_ebss():
    """Print name, availability zone, attached instances and id per volume.

    With ``args.io1`` set only io1 volumes are listed. Volumes without a name
    are shown by id, and volumes without attachments show ``<unattached>``.
    """
    ec2 = u.create_ec2_resource()

    for vol in list(ec2.volumes.all()):
        if args.io1 and vol.volume_type != 'io1':
            continue

        # Fall back to the volume id when the volume has no name.
        label = u.get_name(vol) or vol.id

        if vol.attachments:
            attached_to = [
                u.get_name(ec2.Instance(attachment["InstanceId"]))
                for attachment in vol.attachments
            ]
        else:
            attached_to = ['<unattached>']

        print("%25s %s %s %s" %
              (label, vol.availability_zone, attached_to, vol.id))
def make_job(self, role_name, num_tasks=1, skip_existing_job_validation=False, **kwargs):
    """Return a Job for ``role_name``, reusing or launching its instances.

    If instances for the job name already exist they are reused (stopped
    ones are started and waited for); otherwise ``num_tasks`` new instances
    are launched and tagged with their task names.

    Args:
        role_name: role used to derive the job name.
        num_tasks: number of instances the job should have.
        skip_existing_job_validation: if True, doesn't check that existing
            job on server has same number of tasks as requested.
        **kwargs: launch parameters, merged with ``self.kwargs``. Keys read
            here: instance_type (required, read before the merge), ami,
            ami_name, availability_zone (default ``$ZONE``), placement_group,
            use_placement_group, install_script, skip_efs_mount, linux_type,
            user_data, ebs, use_spot, monitoring.

    Returns:
        The Job, which is also appended to ``self.jobs``.
    """
    # u.maybe_create_resources()

    assert num_tasks >= 0

    # TODO: document launch parameters
    job_name = u.format_job_name(role_name, self.name)
    instance_type = kwargs['instance_type']
    instances = u.lookup_aws_instances(job_name, instance_type=instance_type)
    kwargs = u.merge_kwargs(kwargs, self.kwargs)
    ami = kwargs.get('ami', '')
    ami_name = kwargs.get('ami_name', '')
    availability_zone = kwargs.get('availability_zone', '')
    if not availability_zone:
        availability_zone = os.environ['ZONE']
    placement_group = kwargs.get('placement_group', '')

    # automatically generated placement_group_name
    use_placement_group = kwargs.get('use_placement_group', False)
    assert use_placement_group == False or placement_group == ''
    if use_placement_group:
        placement_group = self.placement_group_name

    install_script = kwargs.get('install_script', '')
    skip_efs_mount = kwargs.get('skip_efs_mount', False)
    linux_type = kwargs.get('linux_type', 'ubuntu')
    # TODO: use heuristics to tell linux type from AMI name
    user_data = kwargs.get('user_data', '')
    if user_data:
        assert user_data.startswith('#!/bin/bash')

    ebs = kwargs.get('ebs', '')
    use_spot = kwargs.get('use_spot', False)
    monitoring = kwargs.get('monitoring', True)

    # always install tmux on Amazon linux types
    if linux_type == 'amazon':
        # EC2 only runs user data as a shell script when it starts with a
        # "#!" line, and the command must sit on its own line. Appending it
        # to empty or unterminated user data is presumably why this step
        # "had no effect" before.
        if not user_data:
            user_data = '#!/bin/bash\n'
        elif not user_data.endswith('\n'):
            user_data += '\n'
        user_data += 'sudo yum install tmux -y'
    if user_data:
        user_data += '\necho userdata_ok >> /tmp/is_initialized\n'

    # print("Using user_data", user_data)

    # TODO: also make sure instance type is the same
    if instances:
        if not skip_existing_job_validation:
            assert len(instances) == num_tasks, (
                "Found job with same name %s(%s), but number of tasks %d doesn't match requested %d, kill job manually."
                % (job_name, instances[0].state, len(instances), num_tasks))

        print("Found existing job " + job_name)
        restarted = []
        for i in instances:
            if i.state['Name'] == 'stopped':
                i.start()
                restarted.append(i)

        # The previous `while True` loop never exited. Block on the EC2
        # waiter instead (assumes these are boto3 Instance resources, as the
        # .state/.start() usage above suggests).
        if restarted:
            print("Waiting for instances to start")
            for i in restarted:
                i.wait_until_running()
            print(instances)
    else:
        print("Launching new job %s into VPC %s" %
              (job_name, u.get_resource_name()))

        assert not (
            ami and ami_name
        ), "Must have only one of ami and ami_name, got " + ami + ", " + ami_name
        assert ami or ami_name, "Must specify at least one of ami and ami_name"
        if ami_name:
            ami = u.lookup_ami_id(ami_name).id
        security_group = u.get_security_group_dict()[u.get_resource_name()]
        keypair = u.get_keypair_dict()[u.get_keypair_name()]
        vpc = u.get_vpc_dict()[u.get_resource_name()]
        subnet_dict = u.get_subnet_dict(vpc)
        region = u.get_region()
        assert availability_zone in subnet_dict, "Availability zone %s is not in subnet dict for current AWS default region %s, available subnets are %s. (hint, set AWS_DEFAULT_REGION=%s)" % (
            availability_zone, region, ', '.join(
                subnet_dict.keys()), availability_zone[:-1])
        subnet = subnet_dict[availability_zone]
        ec2 = u.create_ec2_resource()
        u.maybe_create_placement_group(placement_group)

        self.log("Requesting %d %s" % (num_tasks, instance_type))

        args = {
            'ImageId': ami,
            'InstanceType': instance_type,
            'MinCount': num_tasks,
            'MaxCount': num_tasks,
            'KeyName': keypair.name
        }

        # storage setup
        if ebs:
            args['BlockDeviceMappings'] = ebs

        # network setup
        # TODO: get rid of zone? Zone seems to be required for constructor
        # that allows to enable AssociatePublicIpAddress field
        args['NetworkInterfaces'] = [{
            'SubnetId': subnet.id,
            'DeviceIndex': 0,
            'AssociatePublicIpAddress': True,
            'Groups': [security_group.id]
        }]

        placement_arg = {'AvailabilityZone': availability_zone}
        if placement_group:
            placement_arg['GroupName'] = placement_group
        args['Placement'] = placement_arg

        if monitoring:
            args['Monitoring'] = {'Enabled': True}
        args['UserData'] = user_data

        if use_spot:
            instances = u.create_spot_instances(args)
        else:
            try:
                instances = ec2.create_instances(**args)
            except Exception as e:
                print(f"Instance creation failed with ({e})")
                print("Account number: ", u.get_account_number())
                print("Region: ", u.get_region())
                # Non-zero status: a bare sys.exit() reported success.
                sys.exit(1)

        assert instances
        assert len(instances) == num_tasks

        # TODO: make instances match their launch indices. This way
        # tasks can figure out which # they are
        for (task_num, instance) in enumerate(instances):
            # Tagging right after launch can fail, so retry until it works.
            while True:
                try:
                    # sometimes get "An error occurred (InvalidInstanceID.NotFound)"
                    # task_name = u.format_task_name(instance.ami_launch_index, role_name,
                    #                                self.name)
                    task_name = u.format_task_name(task_num, job_name)
                    instance.create_tags(Tags=u.make_name(task_name))
                    break
                except Exception as e:
                    self.log(
                        "create_tags failed with %s, retrying in %d seconds" %
                        (str(e), TIMEOUT_SEC))
                    time.sleep(TIMEOUT_SEC)

    job = Job(self,
              job_name,
              instances=instances,
              install_script=install_script,
              linux_type=linux_type,
              user_data=user_data,
              skip_efs_mount=skip_efs_mount)
    self.jobs.append(job)
    return job
def _port_bounds(port):
    """Return (from_port, to_port) for a single port or a 2-element range."""
    if u.is_list_or_tuple(port):
        assert len(port) == 2
        from_port, to_port = port
        return from_port, to_port
    return port, port


def _authorize_intragroup_ingress(security_group, protocol, from_port,
                                  to_port):
    """Allow ``protocol`` traffic between members of ``security_group``.

    A rule that already exists only produces a warning; any other failure is
    raised as AssertionError.
    """
    # Authorizing ingress doesn't work with names in a non-default VPC,
    # so must use more complicated syntax
    # https://github.com/boto/boto3/issues/158
    rule = {
        'FromPort': from_port,
        'IpProtocol': protocol,
        'IpRanges': [],
        'PrefixListIds': [],
        'ToPort': to_port,
        'UserIdGroupPairs': [{
            'GroupId': security_group.id
        }]
    }
    try:
        security_group.authorize_ingress(IpPermissions=[rule])
    except Exception as e:
        # Not every exception carries a .response; indexing it directly
        # masked the real error with an AttributeError.
        error = (getattr(e, 'response', None) or {}).get('Error', {})
        if error.get('Code') == 'InvalidPermission.Duplicate':
            print("Warning, got " + str(e))
        else:
            raise AssertionError("Failed while authorizing ingress with " +
                                 str(e)) from e


def network_setup():
    """Create or reuse the VPC, gateway/subnets and security group.

    The VPC is created if it doesn't already exist and configured for public
    internet access: an internet gateway, a route table with a default
    route, and one /20 subnet per availability zone. The security group
    opens ICMP, PUBLIC_TCP_RANGES and PUBLIC_UDP_RANGES to the world and
    allows all traffic within the group.

    Returns:
        (vpc, security_group)
    """
    # from https://gist.github.com/nguyendv/8cfd92fc8ed32ebb78e366f44c2daea6

    ec2 = u.create_ec2_resource()
    existing_vpcs = u.get_vpc_dict()
    zones = u.get_available_zones()
    if VPC_NAME in existing_vpcs:
        print("Reusing VPC " + VPC_NAME)
        vpc = existing_vpcs[VPC_NAME]
        subnets = list(vpc.subnets.all())
        assert len(subnets) == len(
            zones
        ), "Has %s subnets, but %s zones, something went wrong during resource creation, try delete_resources.py/create_resources.py" % (
            len(subnets), len(zones))
    else:
        print("Creating VPC " + VPC_NAME)
        vpc = ec2.create_vpc(CidrBlock='192.168.0.0/16')

        # enable DNS on the VPC
        response = vpc.modify_attribute(EnableDnsHostnames={"Value": True})
        assert u.is_good_response(response)
        response = vpc.modify_attribute(EnableDnsSupport={"Value": True})
        assert u.is_good_response(response)

        vpc.create_tags(Tags=u.make_name(VPC_NAME))
        vpc.wait_until_available()

    gateways = u.get_gateway_dict(vpc)
    if DEFAULT_NAME in gateways:
        print("Reusing gateways " + DEFAULT_NAME)
    else:
        print("Creating gateway " + DEFAULT_NAME)
        ig = ec2.create_internet_gateway()
        ig.attach_to_vpc(VpcId=vpc.id)
        ig.create_tags(Tags=u.make_name(DEFAULT_NAME))

        # check that attachment succeeded
        # TODO: sometimes get
        # AssertionError: vpc vpc-33d0804b is in state None
        attach_state = u.get1(ig.attachments, State=-1, VpcId=vpc.id)
        assert attach_state == 'available', "vpc %s is in state %s" % (
            vpc.id, attach_state)

        route_table = vpc.create_route_table()
        route_table.create_tags(Tags=u.make_name(ROUTE_TABLE_NAME))

        dest_cidr = '0.0.0.0/0'
        route_table.create_route(DestinationCidrBlock=dest_cidr,
                                 GatewayId=ig.id)
        # check success; entries look like
        # ec2.Route(route_table_id='rtb-a8b438cf',
        #           destination_cidr_block='0.0.0.0/0')
        if not any(existing.destination_cidr_block == dest_cidr
                   for existing in route_table.routes):
            # sometimes only the local 192.168.0.0/16 route is listed here
            # TODO: add a wait/retry?
            raise AssertionError("Route for %s not found in %s" %
                                 (dest_cidr, route_table.routes))

        assert len(zones) <= 16  # for cidr/20 to fit into cidr/16
        ip = 0
        for zone in zones:
            cidr_block = '192.168.%d.0/20' % (ip, )
            ip += 16
            print("Creating subnet %s in zone %s" % (cidr_block, zone))
            subnet = vpc.create_subnet(CidrBlock=cidr_block,
                                       AvailabilityZone=zone)
            subnet.create_tags(Tags=[{
                'Key': 'Name',
                'Value': f'{VPC_NAME}-subnet'
            }, {
                'Key': 'Region',
                'Value': zone
            }])
            u.wait_until_available(subnet)
            route_table.associate_with_subnet(SubnetId=subnet.id)

    # Creates security group if necessary
    existing_security_groups = u.get_security_group_dict()
    if SECURITY_GROUP_NAME in existing_security_groups:
        print("Reusing security group " + SECURITY_GROUP_NAME)
        security_group = existing_security_groups[SECURITY_GROUP_NAME]
    else:
        print("Creating security group " + SECURITY_GROUP_NAME)
        security_group = ec2.create_security_group(
            GroupName=SECURITY_GROUP_NAME,
            Description=SECURITY_GROUP_NAME,
            VpcId=vpc.id)

        security_group.create_tags(Tags=[{
            "Key": "Name",
            "Value": SECURITY_GROUP_NAME
        }])

        # allow ICMP access for public ping
        security_group.authorize_ingress(CidrIp='0.0.0.0/0',
                                         IpProtocol='icmp',
                                         FromPort=-1,
                                         ToPort=-1)

        # open public ports
        # always include SSH port which is required for basic functionality
        assert 22 in PUBLIC_TCP_RANGES, "Must enable SSH access"
        for protocol, port_ranges in (("tcp", PUBLIC_TCP_RANGES),
                                      ("udp", PUBLIC_UDP_RANGES)):
            for port in port_ranges:
                from_port, to_port = _port_bounds(port)
                response = security_group.authorize_ingress(
                    IpProtocol=protocol,
                    CidrIp="0.0.0.0/0",
                    FromPort=from_port,
                    ToPort=to_port)
                assert u.is_good_response(response)

        # allow ingress within security group
        _authorize_intragroup_ingress(security_group, 'icmp', -1, -1)
        for protocol in ['tcp', 'udp']:
            _authorize_intragroup_ingress(security_group, protocol, 0, 65535)

    return vpc, security_group
def main():
    """Delete the EFS, VPC (with sub-resources) and key pair for DEFAULT_NAME.

    Resources named ``nexus`` are protected and never deleted. Deleting EFS
    mount targets, subnets, gateways, route tables, security groups and the
    key pair is best-effort: a failure is logged and the run continues. The
    final VPC deletion is not guarded.
    """
    # TODO: also bring down all the instances and wait for them to come down
    region = os.environ['AWS_DEFAULT_REGION']
    if DEFAULT_NAME == 'nexus':
        print("Nexus resources are protected, don't delete them")
        sys.exit()

    print("Deleting %s resources in region %s" % (
        DEFAULT_NAME,
        region,
    ))
    existing_vpcs = u.get_vpc_dict()
    ec2 = u.create_ec2_resource()

    def response_type(response):
        return 'ok' if u.is_good_response(response) else 'failed'

    def log_failure(e):
        # Shared reporting for best-effort deletions.
        sys.stdout.write('failed\n')
        u.loge(str(e) + '\n')

    # delete EFS
    efss = u.get_efs_dict()
    efs_id = efss.get(DEFAULT_NAME, '')
    efs_client = u.create_efs_client()
    if efs_id:
        try:
            # delete mount targets first
            print("About to delete %s (%s)" % (efs_id, DEFAULT_NAME))
            response = efs_client.describe_mount_targets(FileSystemId=efs_id)
            assert u.is_good_response(response)
            for mount_response in response['MountTargets']:
                mount_target_id = mount_response['MountTargetId']
                sys.stdout.write('Deleting mount target %s ... ' %
                                 (mount_target_id, ))
                sys.stdout.flush()
                delete_response = efs_client.delete_mount_target(
                    MountTargetId=mount_target_id)
                print(response_type(delete_response))

            sys.stdout.write('Deleting EFS %s (%s)... ' %
                             (efs_id, DEFAULT_NAME))
            sys.stdout.flush()
            u.delete_efs_id(efs_id)
        except Exception as e:
            log_failure(e)

    if VPC_NAME in existing_vpcs:
        vpc = ec2.Vpc(existing_vpcs[VPC_NAME].id)
        print("Deleting VPC %s (%s) subresources:" % (VPC_NAME, vpc.id))

        for subnet in vpc.subnets.all():
            try:
                sys.stdout.write("Deleting subnet %s ... " % (subnet.id))
                sys.stdout.write(response_type(subnet.delete()) + '\n')
            except Exception as e:
                log_failure(e)

        for gateway in vpc.internet_gateways.all():
            sys.stdout.write("Deleting gateway %s ... " % (gateway.id))
            # If instances are still using the VPC, detaching fails with
            # DependencyViolation ("Network ... has some mapped public
            # address(es)"). Treat it like the other sub-resources so the
            # remaining cleanup still runs.
            try:
                sys.stdout.write('detached ... ' if u.is_good_response(
                    gateway.detach_from_vpc(
                        VpcId=vpc.id)) else ' detach_failed ')
                sys.stdout.write('deleted ' if u.is_good_response(
                    gateway.delete()) else ' delete_failed ')
                sys.stdout.write('\n')
            except Exception as e:
                log_failure(e)

        def route_table_desc(route_table):
            return "%s (%s)" % (route_table.id, u.get_name(route_table.tags))

        for route_table in vpc.route_tables.all():
            sys.stdout.write("Deleting route table %s ... " %
                             (route_table_desc(route_table)))
            try:
                sys.stdout.write(response_type(route_table.delete()) + '\n')
            except Exception as e:
                log_failure(e)

        def security_group_desc(security_group):
            return "%s (%s, %s)" % (security_group.id,
                                    u.get_name(security_group.tags),
                                    security_group.group_name)

        # TODO: this tries to remove default security group, maybe not remove it?
        for security_group in vpc.security_groups.all():
            sys.stdout.write('Deleting security group %s ... ' %
                             (security_group_desc(security_group)))
            try:
                sys.stdout.write(response_type(security_group.delete()) + '\n')
            except Exception as e:
                log_failure(e)

        sys.stdout.write("Deleting VPC %s ... " % (vpc.id))
        sys.stdout.write(response_type(vpc.delete()) + '\n')

    # delete keypair
    keypairs = u.get_keypair_dict()
    keypair = keypairs.get(DEFAULT_NAME, '')
    if keypair:
        try:
            sys.stdout.write("Deleting keypair %s (%s) ... " %
                             (keypair.key_name, DEFAULT_NAME))
            sys.stdout.write(response_type(keypair.delete()) + '\n')
        except Exception as e:
            log_failure(e)

    keypair_fn = u.get_keypair_fn(KEYPAIR_NAME)
    if os.path.exists(keypair_fn):
        print("Deleting local keypair file %s" % (keypair_fn, ))
        # Like `rm -f`, a failed removal is reported but does not abort.
        try:
            os.remove(keypair_fn)
        except OSError as e:
            u.loge(str(e) + '\n')
help=("which resources to delete, all/network/keypair/efs")) parser.add_argument('--force-delete-efs', action='store_true', help="force deleting main EFS") args = parser.parse_args() EFS_NAME = u.get_resource_name() VPC_NAME = u.get_resource_name() SECURITY_GROUP_NAME = u.get_resource_name() ROUTE_TABLE_NAME = u.get_resource_name() KEYPAIR_NAME = u.get_keypair_name() EFS_NAME = u.get_resource_name() client = u.create_ec2_client() ec2 = u.create_ec2_resource() def response_type(response): return 'ok' if u.is_good_response(response) else 'failed' def delete_efs(): efss = u.get_efs_dict() efs_id = efss.get(EFS_NAME, '') efs_client = u.create_efs_client() if efs_id: try: # delete mount targets first print("About to delete %s (%s)" % (efs_id, EFS_NAME)) response = efs_client.describe_mount_targets(FileSystemId=efs_id)
print_response(inspect.getframeinfo(inspect.currentframe())[2], route) def describe_route_tables(ec2_client): # https://boto3.readthedocs.io/en/latest/reference/services/ec2.html#EC2.Client.describe_route_tables response = ec2_client.describe_route_tables() print_response(inspect.getframeinfo(inspect.currentframe())[2], response) if __name__ == '__main__': aws = {} # profileを使い分ける場合には、profileをセット session = boto3.Session(profile_name='my-profile') # 使用するクライアントとリソースを作成 client = create_ec2_client(session) resource = create_ec2_resource(session) # VPCの作成と確認 aws['vpc_id'] = create_vpc(client) add_vpc_name_tag(resource, aws['vpc_id']) describe_vpc(client) # サブネットの作成 # アベイラビリティゾーンの確認 zones = describe_availability_zones(client) # 最初のアベイラビリティゾーンを使用するアベイラビリティゾーンとする first_zone = zones['AvailabilityZones'][0]['ZoneName'] print_response('first availability zone', first_zone) subnet = create_vpc_subnet(resource, aws['vpc_id'], first_zone, '192.168.1.0/24') aws['public_subnet_id'] = subnet.subnet_id # サブネットの名前タグを追加
def make_job(self, role_name, num_tasks=1, **kwargs):
    """Return a Job for ``role_name``, reusing its instances or launching them.

    The job name is built by ``u.format_job_name(role_name, self.name)``.
    If ``u.lookup_aws_instances`` finds instances under that name they are
    reused, provided their count equals ``num_tasks``. Otherwise
    ``num_tasks`` new instances are requested through
    ``ec2.create_instances`` and each one is tagged with a task name.

    Launch parameters are read from ``kwargs`` after merging with
    ``self.kwargs`` via ``u.merge_kwargs``:

    * required (looked up with ``[]``): ``ami``, ``instance_type``,
      ``availability_zone``
    * optional: ``placement_group`` (''), ``install_script`` (''),
      ``skip_efs_mount`` (False), ``linux_type`` ('ubuntu'),
      ``user_data`` ('')

    The resulting Job is appended to ``self.jobs`` and returned.
    """
    assert num_tasks >= 0

    # TODO: document launch parameters
    job_name = u.format_job_name(role_name, self.name)
    instances = u.lookup_aws_instances(job_name)
    options = u.merge_kwargs(kwargs, self.kwargs)

    ami = options['ami']
    instance_type = options['instance_type']
    availability_zone = options['availability_zone']
    placement_group = options.get('placement_group', '')
    install_script = options.get('install_script', '')
    skip_efs_mount = options.get('skip_efs_mount', False)
    linux_type = options.get('linux_type', 'ubuntu')
    user_data = options.get('user_data', '')
    if user_data:
        # Marker line appended to the script; it writes to /tmp/is_initialized
        # once the user data has run to the end.
        user_data = user_data + '\necho userdata_ok >> /tmp/is_initialized\n'

    # TODO: also make sure instance type is the same
    if not instances:
        print("Launching new job %s into VPC %s" % (job_name,
                                                    u.get_resource_name()))

        security_group = u.get_security_group_dict()[u.get_resource_name()]
        keypair = u.get_keypair_dict()[u.get_keypair_name()]
        vpc = u.get_vpc_dict()[u.get_resource_name()]
        subnet_dict = u.get_subnet_dict(vpc)
        region = u.get_region()
        assert availability_zone in subnet_dict, (
            "Availability zone %s is not in subnet dict for current AWS default region %s, available subnets are %s. "
            "(hint, set AWS_DEFAULT_REGION)" % (
                availability_zone, region, ', '.join(subnet_dict.keys())))
        subnet = subnet_dict[availability_zone]

        ec2 = u.create_ec2_resource()
        u.maybe_create_placement_group(placement_group)

        self.log("Requesting %d %s" % (num_tasks, instance_type))

        placement = {'AvailabilityZone': availability_zone}
        if placement_group:
            placement['GroupName'] = placement_group

        launch_spec = {
            'ImageId': ami,
            'InstanceType': instance_type,
            'MinCount': num_tasks,
            'MaxCount': num_tasks,
            'KeyName': keypair.name,
            # network setup
            'NetworkInterfaces': [{
                'SubnetId': subnet.id,
                'DeviceIndex': 0,
                'AssociatePublicIpAddress': True,
                'Groups': [security_group.id],
            }],
            'Placement': placement,
            'UserData': user_data,
        }
        instances = ec2.create_instances(**launch_spec)
        assert len(instances) == num_tasks

        # assign proper names to tasks
        for instance in instances:
            tagged = False
            # Retries without limit: any exception is logged and the tag
            # request is repeated after TIMEOUT_SEC seconds.
            while not tagged:
                try:
                    # sometimes get "An error occurred (InvalidInstanceID.NotFound)"
                    task_name = u.format_task_name(instance.ami_launch_index,
                                                   role_name, self.name)
                    # TODO: use instance.create_tags instead like in create_resources.py
                    ec2.create_tags(Resources=[instance.id],
                                    Tags=u.make_name(task_name))
                except Exception as e:
                    self.log(
                        "create_tags failed with %s, retrying in %d seconds" %
                        (str(e), TIMEOUT_SEC))
                    time.sleep(TIMEOUT_SEC)
                else:
                    tagged = True
    else:
        assert len(instances) == num_tasks, (
            "Found job with same name, but number of tasks %d doesn't match requested %d, kill job manually."
            % (len(instances), num_tasks))
        print("Found existing job " + job_name)

    job = Job(self,
              job_name,
              instances=instances,
              install_script=install_script,
              linux_type=linux_type,
              user_data=user_data,
              skip_efs_mount=skip_efs_mount)
    self.jobs.append(job)
    return job