def launch_cluster(conn, opts, cluster_name): if opts.identity_file is None: print("ERROR: Must provide an identity file (-i) for ssh connections.", file=stderr) sys.exit(1) if opts.key_pair is None: print("ERROR: Must provide a key pair name (-k) to use on instances.", file=stderr) sys.exit(1) user_data_content = None if opts.user_data: with open(opts.user_data) as user_data_file: user_data_content = user_data_file.read() print("Setting up security groups...") master_group = get_or_make_group(conn, cluster_name + "-master", opts.vpc_id) slave_group = get_or_make_group(conn, cluster_name + "-slaves", opts.vpc_id) authorized_address = opts.authorized_address if master_group.rules == []: # Group was just now created if opts.vpc_id is None: master_group.authorize(src_group=master_group) master_group.authorize(src_group=slave_group) else: master_group.authorize(ip_protocol='icmp', from_port=-1, to_port=-1, src_group=master_group) master_group.authorize(ip_protocol='tcp', from_port=0, to_port=65535, src_group=master_group) master_group.authorize(ip_protocol='udp', from_port=0, to_port=65535, src_group=master_group) master_group.authorize(ip_protocol='icmp', from_port=-1, to_port=-1, src_group=slave_group) master_group.authorize(ip_protocol='tcp', from_port=0, to_port=65535, src_group=slave_group) master_group.authorize(ip_protocol='udp', from_port=0, to_port=65535, src_group=slave_group) master_group.authorize('tcp', 22, 22, authorized_address) master_group.authorize('tcp', 8080, 8081, authorized_address) master_group.authorize('tcp', 18080, 18080, authorized_address) master_group.authorize('tcp', 19999, 19999, authorized_address) master_group.authorize('tcp', 50030, 50030, authorized_address) master_group.authorize('tcp', 50070, 50070, authorized_address) master_group.authorize('tcp', 60070, 60070, authorized_address) master_group.authorize('tcp', 4040, 4045, authorized_address) # Rstudio (GUI for R) needs port 8787 for web access master_group.authorize('tcp', 8787, 8787, authorized_address) # HDFS NFS gateway requires 111,2049,4242 for tcp & udp master_group.authorize('tcp', 111, 111, authorized_address) master_group.authorize('udp', 111, 111, authorized_address) master_group.authorize('tcp', 2049, 2049, authorized_address) master_group.authorize('udp', 2049, 2049, authorized_address) master_group.authorize('tcp', 4242, 4242, authorized_address) master_group.authorize('udp', 4242, 4242, authorized_address) # RM in YARN mode uses 8088 master_group.authorize('tcp', 8088, 8088, authorized_address) if opts.ganglia: master_group.authorize('tcp', 5080, 5080, authorized_address) if slave_group.rules == []: # Group was just now created if opts.vpc_id is None: slave_group.authorize(src_group=master_group) slave_group.authorize(src_group=slave_group) else: slave_group.authorize(ip_protocol='icmp', from_port=-1, to_port=-1, src_group=master_group) slave_group.authorize(ip_protocol='tcp', from_port=0, to_port=65535, src_group=master_group) slave_group.authorize(ip_protocol='udp', from_port=0, to_port=65535, src_group=master_group) slave_group.authorize(ip_protocol='icmp', from_port=-1, to_port=-1, src_group=slave_group) slave_group.authorize(ip_protocol='tcp', from_port=0, to_port=65535, src_group=slave_group) slave_group.authorize(ip_protocol='udp', from_port=0, to_port=65535, src_group=slave_group) slave_group.authorize('tcp', 22, 22, authorized_address) slave_group.authorize('tcp', 8080, 8081, authorized_address) slave_group.authorize('tcp', 50060, 50060, authorized_address) slave_group.authorize('tcp', 50075, 50075, authorized_address) slave_group.authorize('tcp', 60060, 60060, authorized_address) slave_group.authorize('tcp', 60075, 60075, authorized_address) #Kylix slave_group.authorize(ip_protocol='tcp', from_port=50050, to_port=50060, src_group=slave_group) slave_group.authorize(ip_protocol='udp', from_port=50050, to_port=50060, src_group=slave_group) slave_group.authorize(ip_protocol='tcp', from_port=50050, to_port=50060, src_group=master_group) slave_group.authorize(ip_protocol='udp', from_port=50050, to_port=50060, src_group=master_group) master_group.authorize(ip_protocol='tcp', from_port=50050, to_port=50060, src_group=slave_group) master_group.authorize(ip_protocol='udp', from_port=50050, to_port=50060, src_group=slave_group) # Check if instances are already running in our groups existing_masters, existing_slaves = get_existing_cluster(conn, opts, cluster_name, die_on_error=False) if existing_slaves or (existing_masters and not opts.use_existing_master): print("ERROR: There are already instances running in group %s or %s" % (master_group.name, slave_group.name), file=stderr) sys.exit(1) # we use group ids to work around https://github.com/boto/boto/issues/350 additional_group_ids = [] if opts.additional_security_group: additional_group_ids = [sg.id for sg in conn.get_all_security_groups() if opts.additional_security_group in (sg.name, sg.id)] print("Launching instances...") try: image = conn.get_all_images(image_ids=[opts.ami])[0] except: print("Could not find AMI " + opts.ami, file=stderr) sys.exit(1) # Create block device mapping so that we can add EBS volumes if asked to. # The first drive is attached as /dev/sds, 2nd as /dev/sdt, ... /dev/sdz block_map = BlockDeviceMapping() if opts.ebs_vol_size > 0: for i in range(opts.ebs_vol_num): device = EBSBlockDeviceType() device.size = opts.ebs_vol_size device.volume_type = opts.ebs_vol_type device.delete_on_termination = True block_map["/dev/sd" + chr(ord('s') + i)] = device # AWS ignores the AMI-specified block device mapping for M3 (see SPARK-3342). if opts.instance_type.startswith('m3.'): for i in range(get_num_disks(opts.instance_type)): dev = BlockDeviceType() dev.ephemeral_name = 'ephemeral%d' % i # The first ephemeral drive is /dev/sdb. name = '/dev/sd' + string.letters[i + 1] block_map[name] = dev # Launch slaves if opts.spot_price is not None: # Launch spot instances with the requested price print("Requesting %d slaves as spot instances with price $%.3f" % (opts.slaves, opts.spot_price)) zones = get_zones(conn, opts) num_zones = len(zones) i = 0 my_req_ids = [] for zone in zones: num_slaves_this_zone = get_partition(opts.slaves, num_zones, i) slave_reqs = conn.request_spot_instances( price=opts.spot_price, image_id=opts.ami, launch_group="launch-group-%s" % cluster_name, placement=zone, count=num_slaves_this_zone, key_name=opts.key_pair, security_group_ids=[slave_group.id] + additional_group_ids, instance_type=opts.instance_type, block_device_map=block_map, subnet_id=opts.subnet_id, placement_group=opts.placement_group, user_data=user_data_content, instance_profile_name=opts.instance_profile_name) my_req_ids += [req.id for req in slave_reqs] i += 1 print("Waiting for spot instances to be granted...") try: while True: time.sleep(10) reqs = conn.get_all_spot_instance_requests() id_to_req = {} for r in reqs: id_to_req[r.id] = r active_instance_ids = [] for i in my_req_ids: if i in id_to_req and id_to_req[i].state == "active": active_instance_ids.append(id_to_req[i].instance_id) if len(active_instance_ids) == opts.slaves: print("All %d slaves granted" % opts.slaves) reservations = conn.get_all_reservations(active_instance_ids) slave_nodes = [] for r in reservations: slave_nodes += r.instances break else: print("%d of %d slaves granted, waiting longer" % ( len(active_instance_ids), opts.slaves)) except: print("Canceling spot instance requests") conn.cancel_spot_instance_requests(my_req_ids) # Log a warning if any of these requests actually launched instances: (master_nodes, slave_nodes) = get_existing_cluster( conn, opts, cluster_name, die_on_error=False) running = len(master_nodes) + len(slave_nodes) if running: print(("WARNING: %d instances are still running" % running), file=stderr) sys.exit(0) else: # Launch non-spot instances zones = get_zones(conn, opts) num_zones = len(zones) i = 0 slave_nodes = [] for zone in zones: num_slaves_this_zone = get_partition(opts.slaves, num_zones, i) if num_slaves_this_zone > 0: slave_res = image.run( key_name=opts.key_pair, security_group_ids=[slave_group.id] + additional_group_ids, instance_type=opts.instance_type, placement=zone, min_count=num_slaves_this_zone, max_count=num_slaves_this_zone, block_device_map=block_map, subnet_id=opts.subnet_id, placement_group=opts.placement_group, user_data=user_data_content, instance_initiated_shutdown_behavior=opts.instance_initiated_shutdown_behavior, instance_profile_name=opts.instance_profile_name) slave_nodes += slave_res.instances print("Launched {s} slave{plural_s} in {z}, regid = {r}".format( s=num_slaves_this_zone, plural_s=('' if num_slaves_this_zone == 1 else 's'), z=zone, r=slave_res.id)) i += 1 # Launch or resume masters if existing_masters: print("Starting master...") for inst in existing_masters: if inst.state not in ["shutting-down", "terminated"]: inst.start() master_nodes = existing_masters else: master_type = opts.master_instance_type if master_type == "": master_type = opts.instance_type if opts.zone == 'all': opts.zone = random.choice(conn.get_all_zones()).name master_res = image.run( key_name=opts.key_pair, security_group_ids=[master_group.id] + additional_group_ids, instance_type=master_type, placement=opts.zone, min_count=1, max_count=1, block_device_map=block_map, subnet_id=opts.subnet_id, placement_group=opts.placement_group, user_data=user_data_content, instance_initiated_shutdown_behavior=opts.instance_initiated_shutdown_behavior, instance_profile_name=opts.instance_profile_name) master_nodes = master_res.instances print("Launched master in %s, regid = %s" % (zone, master_res.id)) # This wait time corresponds to SPARK-4983 print("Waiting for AWS to propagate instance metadata...") time.sleep(15) # Give the instances descriptive names and set additional tags additional_tags = {} if opts.additional_tags.strip(): additional_tags = dict( map(str.strip, tag.split(':', 1)) for tag in opts.additional_tags.split(',') ) for master in master_nodes: master.add_tags( dict(additional_tags, Name='{cn}-master-{iid}'.format(cn=cluster_name, iid=master.id)) ) for slave in slave_nodes: slave.add_tags( dict(additional_tags, Name='{cn}-slave-{iid}'.format(cn=cluster_name, iid=slave.id)) ) # Return all the instances return (master_nodes, slave_nodes)
def launch_cluster(conn, opts, cluster_name): if opts.identity_file is None: print >> stderr, "ERROR: Must provide an identity file (-i) for ssh connections." sys.exit(1) if opts.key_pair is None: print >> stderr, "ERROR: Must provide a key pair name (-k) to use on instances." sys.exit(1) user_data_content = None if opts.user_data: with open(opts.user_data) as user_data_file: user_data_content = user_data_file.read() print "Setting up security groups..." master_group = get_or_make_group(conn, cluster_name + "-master") slave_group = get_or_make_group(conn, cluster_name + "-slaves") authorized_address = opts.authorized_address if master_group.rules == []: # Group was just now created master_group.authorize(src_group=master_group) master_group.authorize(src_group=slave_group) master_group.authorize('tcp', 22, 22, authorized_address) master_group.authorize('tcp', 8080, 8081, authorized_address) master_group.authorize('tcp', 18080, 18080, authorized_address) master_group.authorize('tcp', 19999, 19999, authorized_address) master_group.authorize('tcp', 50030, 50030, authorized_address) master_group.authorize('tcp', 50070, 50070, authorized_address) master_group.authorize('tcp', 60070, 60070, authorized_address) master_group.authorize('tcp', 4040, 4045, authorized_address) if opts.ganglia: master_group.authorize('tcp', 5080, 5080, authorized_address) if slave_group.rules == []: # Group was just now created slave_group.authorize(src_group=master_group) slave_group.authorize(src_group=slave_group) slave_group.authorize('tcp', 22, 22, authorized_address) slave_group.authorize('tcp', 8080, 8081, authorized_address) slave_group.authorize('tcp', 50060, 50060, authorized_address) slave_group.authorize('tcp', 50075, 50075, authorized_address) slave_group.authorize('tcp', 60060, 60060, authorized_address) slave_group.authorize('tcp', 60075, 60075, authorized_address) # Check if instances are already running in our groups existing_masters, existing_slaves = get_existing_cluster(conn, opts, cluster_name, die_on_error=False) if existing_slaves or (existing_masters and not opts.use_existing_master): print >> stderr, ("ERROR: There are already instances running in " + "group %s or %s" % (master_group.name, slave_group.name)) sys.exit(1) # Figure out Spark AMI if opts.ami is None: opts.ami = get_spark_ami(opts) additional_groups = [] if opts.additional_security_group: additional_groups = [sg for sg in conn.get_all_security_groups() if opts.additional_security_group in (sg.name, sg.id)] print "Launching instances..." try: image = conn.get_all_images(image_ids=[opts.ami])[0] except: print >> stderr, "Could not find AMI " + opts.ami sys.exit(1) # Create block device mapping so that we can add EBS volumes if asked to. # The first drive is attached as /dev/sds, 2nd as /dev/sdt, ... /dev/sdz block_map = BlockDeviceMapping() if opts.ebs_vol_size > 0: for i in range(opts.ebs_vol_num): device = EBSBlockDeviceType() device.size = opts.ebs_vol_size device.volume_type = opts.ebs_vol_type device.delete_on_termination = True block_map["/dev/sd" + chr(ord('s') + i)] = device # AWS ignores the AMI-specified block device mapping for M3 (see SPARK-3342). if opts.instance_type.startswith('m3.'): for i in range(get_num_disks(opts.instance_type)): dev = BlockDeviceType() dev.ephemeral_name = 'ephemeral%d' % i # The first ephemeral drive is /dev/sdb. name = '/dev/sd' + string.letters[i + 1] block_map[name] = dev # Launch slaves if opts.spot_price is not None: # Launch spot instances with the requested price print ("Requesting %d slaves as spot instances with price $%.3f" % (opts.slaves, opts.spot_price)) zones = get_zones(conn, opts) num_zones = len(zones) i = 0 my_req_ids = [] for zone in zones: num_slaves_this_zone = get_partition(opts.slaves, num_zones, i) slave_reqs = conn.request_spot_instances( price=opts.spot_price, image_id=opts.ami, launch_group="launch-group-%s" % cluster_name, placement=zone, count=num_slaves_this_zone, key_name=opts.key_pair, security_groups=[slave_group] + additional_groups, instance_type=opts.instance_type, block_device_map=block_map, user_data=user_data_content) my_req_ids += [req.id for req in slave_reqs] i += 1 print "Waiting for spot instances to be granted..." try: while True: time.sleep(10) reqs = conn.get_all_spot_instance_requests() id_to_req = {} for r in reqs: id_to_req[r.id] = r active_instance_ids = [] for i in my_req_ids: if i in id_to_req and id_to_req[i].state == "active": active_instance_ids.append(id_to_req[i].instance_id) if len(active_instance_ids) == opts.slaves: print "All %d slaves granted" % opts.slaves reservations = conn.get_all_instances(active_instance_ids) slave_nodes = [] for r in reservations: slave_nodes += r.instances break else: print "%d of %d slaves granted, waiting longer" % ( len(active_instance_ids), opts.slaves) except: print "Canceling spot instance requests" conn.cancel_spot_instance_requests(my_req_ids) # Log a warning if any of these requests actually launched instances: (master_nodes, slave_nodes) = get_existing_cluster( conn, opts, cluster_name, die_on_error=False) running = len(master_nodes) + len(slave_nodes) if running: print >> stderr, ("WARNING: %d instances are still running" % running) sys.exit(0) else: # Launch non-spot instances zones = get_zones(conn, opts) num_zones = len(zones) i = 0 slave_nodes = [] for zone in zones: num_slaves_this_zone = get_partition(opts.slaves, num_zones, i) if num_slaves_this_zone > 0: slave_res = image.run(key_name=opts.key_pair, security_groups=[slave_group] + additional_groups, instance_type=opts.instance_type, placement=zone, min_count=num_slaves_this_zone, max_count=num_slaves_this_zone, block_device_map=block_map, user_data=user_data_content) slave_nodes += slave_res.instances print "Launched %d slaves in %s, regid = %s" % (num_slaves_this_zone, zone, slave_res.id) i += 1 # Launch or resume masters if existing_masters: print "Starting master..." for inst in existing_masters: if inst.state not in ["shutting-down", "terminated"]: inst.start() master_nodes = existing_masters else: master_type = opts.master_instance_type if master_type == "": master_type = opts.instance_type if opts.zone == 'all': opts.zone = random.choice(conn.get_all_zones()).name master_res = image.run(key_name=opts.key_pair, security_groups=[master_group] + additional_groups, instance_type=master_type, placement=opts.zone, min_count=1, max_count=1, block_device_map=block_map, user_data=user_data_content) master_nodes = master_res.instances print "Launched master in %s, regid = %s" % (zone, master_res.id) # Give the instances descriptive names for master in master_nodes: master.add_tag( key='Name', value='{cn}-master-{iid}'.format(cn=cluster_name, iid=master.id)) for slave in slave_nodes: slave.add_tag( key='Name', value='{cn}-slave-{iid}'.format(cn=cluster_name, iid=slave.id)) # Return all the instances return (master_nodes, slave_nodes)
def launch_cluster(conn, opts, cluster_name): template_vars = { 'cluster_name':cluster_name, 'master_security_group': cluster_name + "-master", 'slave_security_group': cluster_name + "-slaves", 'discovery_security_group': cluster_name + "-discovery" } if opts.copy_aws_credentials: if opts.deploy_aws_key_id: template_vars['aws_key']=opts.deploy_aws_key_id else: template_vars['aws_key']=opts.aws_access_key_id if opts.deploy_aws_key_secret: template_vars['aws_secret']=opts.deploy_aws_key_secret else: template_vars['aws_secret']=opts.aws_secret_access_key if opts.identity_file is None: print("ERROR: Must provide an identity file (-i) for ssh connections.", file=stderr) sys.exit(1) if opts.key_pair is None: print("ERROR: Must provide a key pair name (-k) to use on instances.", file=stderr) sys.exit(1) print("Setting up security groups...") master_group = get_or_make_group(conn, template_vars['master_security_group'], opts.vpc_id) slave_group = get_or_make_group(conn, template_vars['slave_security_group'], opts.vpc_id) discovery_group = get_or_make_group(conn, template_vars['discovery_security_group'], opts.vpc_id) authorized_address = opts.authorized_address if master_group.rules == []: # Group was just now created if opts.vpc_id is None: master_group.authorize(src_group=master_group) master_group.authorize(src_group=slave_group) master_group.authorize(src_group=discovery_group) else: master_group.authorize(ip_protocol='icmp', from_port=-1, to_port=-1, src_group=discovery_group) master_group.authorize(ip_protocol='tcp', from_port=0, to_port=65535, src_group=discovery_group) master_group.authorize(ip_protocol='udp', from_port=0, to_port=65535, src_group=discovery_group) master_group.authorize('tcp', 22, 22, authorized_address) if slave_group.rules == []: # Group was just now created if opts.vpc_id is None: slave_group.authorize(src_group=master_group) slave_group.authorize(src_group=slave_group) slave_group.authorize(src_group=discovery_group) else: slave_group.authorize(ip_protocol='icmp', from_port=-1, to_port=-1, src_group=discovery_group) slave_group.authorize(ip_protocol='tcp', from_port=0, to_port=65535, src_group=discovery_group) slave_group.authorize(ip_protocol='udp', from_port=0, to_port=65535, src_group=discovery_group) slave_group.authorize('tcp', 22, 22, authorized_address) if discovery_group.rules == []: # Group was just now created if opts.vpc_id is None: discovery_group.authorize(src_group=master_group) discovery_group.authorize(src_group=slave_group) discovery_group.authorize(src_group=discovery_group) else: discovery_group.authorize(ip_protocol='icmp', from_port=-1, to_port=-1, src_group=discovery_group) discovery_group.authorize(ip_protocol='tcp', from_port=0, to_port=65535, src_group=discovery_group) discovery_group.authorize(ip_protocol='udp', from_port=0, to_port=65535, src_group=discovery_group) # Check if instances are already running in our groups existing_masters, existing_slaves = get_existing_cluster(conn, opts, cluster_name, die_on_error=False) if existing_slaves or (existing_masters and not opts.use_existing_master): print("ERROR: There are already instances running in group %s or %s" % (master_group.name, slave_group.name), file=stderr) sys.exit(1) # Figure out Spark AMI if opts.ami is None: opts.ami = get_ami(opts) # we use group ids to work around https://github.com/boto/boto/issues/350 additional_group_ids = [] if opts.additional_security_group: all_groups = conn.get_all_security_groups() additional_group_ids = [] for group in opts.additional_security_group.split(','): additional_group_ids += [sg.id for sg in all_groups if group in (sg.name, sg.id)] template_vars['security_groups']= template_vars['discovery_security_group'] print("Launching instances...") try: image = conn.get_all_images(image_ids=[opts.ami])[0] except: print("Could not find AMI " + opts.ami, file=stderr) sys.exit(1) # Create block device mapping so that we can add EBS volumes if asked to. # The first drive is attached as /dev/sds, 2nd as /dev/sdt, ... /dev/sdz block_map = BlockDeviceMapping() if opts.ebs_vol_size > 0: ebs_devices=[] for i in range(opts.ebs_vol_num): device = EBSBlockDeviceType() device_id = "/dev/sd" + chr(ord('s') + i) device.size = opts.ebs_vol_size device.volume_type = opts.ebs_vol_type device.delete_on_termination = True block_map[device_id] = device ebs_devices+=device_id template_vars['ebs_devices']=' '.join(ebs_devices) # AWS ignores the AMI-specified block device mapping for M3 (see SPARK-3342). if opts.instance_type.startswith('m3.'): local_devices=[] for i in range(get_num_disks(opts.instance_type)): dev = BlockDeviceType() dev.ephemeral_name = 'ephemeral%d' % i # The first ephemeral drive is /dev/sdb. name = '/dev/sd' + string.ascii_letters[i + 1] block_map[name] = dev local_devices+=name template_vars['local_devices']=' '.join(local_devices) master_user_data_content = get_user_data(opts.master_user_data,template_vars) slave_user_data_content = get_user_data(opts.slave_user_data,template_vars) # Launch slaves if opts.spot_price is not None: # Launch spot instances with the requested price print("Requesting %d slaves as spot instances with price $%.3f" % (opts.slaves, opts.spot_price)) zones = get_zones(conn, opts) num_zones = len(zones) i = 0 my_req_ids = [] for zone in zones: num_slaves_this_zone = get_partition(opts.slaves, num_zones, i) slave_reqs = conn.request_spot_instances( price=opts.spot_price, image_id=opts.ami, launch_group="launch-group-%s" % cluster_name, placement=zone, count=num_slaves_this_zone, key_name=opts.key_pair, security_group_ids=[slave_group.id,discovery_group.id] + additional_group_ids, instance_type=opts.instance_type, block_device_map=block_map, subnet_id=opts.subnet_id, placement_group=opts.placement_group, user_data=slave_user_data_content, instance_profile_name=opts.instance_profile_name) my_req_ids += [req.id for req in slave_reqs] i += 1 print("Waiting for spot instances to be granted...") try: while True: time.sleep(10) reqs = conn.get_all_spot_instance_requests() id_to_req = {} for r in reqs: id_to_req[r.id] = r active_instance_ids = [] for i in my_req_ids: if i in id_to_req and id_to_req[i].state == "active": active_instance_ids.append(id_to_req[i].instance_id) if len(active_instance_ids) == opts.slaves: print("All %d slaves granted" % opts.slaves) reservations = conn.get_all_reservations(active_instance_ids) slave_nodes = [] for r in reservations: slave_nodes += r.instances break else: print("%d of %d slaves granted, waiting longer" % ( len(active_instance_ids), opts.slaves)) except: print("Canceling spot instance requests") conn.cancel_spot_instance_requests(my_req_ids) # Log a warning if any of these requests actually launched instances: (master_nodes, slave_nodes) = get_existing_cluster( conn, opts, cluster_name, die_on_error=False) running = len(master_nodes) + len(slave_nodes) if running: print(("WARNING: %d instances are still running" % running), file=stderr) sys.exit(0) else: # Launch non-spot instances zones = get_zones(conn, opts) num_zones = len(zones) i = 0 slave_nodes = [] for zone in zones: num_slaves_this_zone = get_partition(opts.slaves, num_zones, i) if num_slaves_this_zone > 0: slave_res = image.run( key_name=opts.key_pair, security_group_ids=[slave_group.id,discovery_group.id] + additional_group_ids, instance_type=opts.instance_type, placement=zone, min_count=num_slaves_this_zone, max_count=num_slaves_this_zone, block_device_map=block_map, subnet_id=opts.subnet_id, placement_group=opts.placement_group, user_data=slave_user_data_content, instance_initiated_shutdown_behavior=opts.instance_initiated_shutdown_behavior, instance_profile_name=opts.instance_profile_name) slave_nodes += slave_res.instances print("Launched {s} slave{plural_s} in {z}, regid = {r}".format( s=num_slaves_this_zone, plural_s=('' if num_slaves_this_zone == 1 else 's'), z=zone, r=slave_res.id)) i += 1 # Launch or resume masters if existing_masters: print("Starting master...") for inst in existing_masters: if inst.state not in ["shutting-down", "terminated"]: inst.start() master_nodes = existing_masters else: master_type = opts.master_instance_type if master_type == "": master_type = opts.instance_type if opts.zone == 'all': opts.zone = random.choice(conn.get_all_zones()).name if opts.spot_price is not None: # Launch spot instance with the requested price print("Requesting master as spot instance with price $%.3f" % (opts.spot_price)) master_reqs = conn.request_spot_instances( price=opts.spot_price, image_id=opts.ami, key_name=opts.key_pair, launch_group="master-group-%s" % cluster_name, security_group_ids=[master_group.id,discovery_group.id] + additional_group_ids, instance_type=master_type, placement=opts.zone, count=1, block_device_map=block_map, subnet_id=opts.subnet_id, placement_group=opts.placement_group, user_data=master_user_data_content, instance_profile_name=opts.instance_profile_name) master_req_id = master_reqs[0].id print("Waiting for spot instances to be granted...") try: while True: time.sleep(10) reqs = conn.get_all_spot_instance_requests() id_to_req = {} for r in reqs: id_to_req[r.id] = r master_instance_ids = [] if master_req_id in id_to_req and id_to_req[master_req_id].state == "active": master_instance_ids.append(id_to_req[master_req_id].instance_id) print("Master granted") reservations = conn.get_all_reservations(master_instance_ids) master_nodes = [] for r in reservations: master_nodes += r.instances break else: print("Master not granted yet, waiting longer") except: print("Canceling spot instance request for master") conn.cancel_spot_instance_requests([master_req_id]) sys.exit(0) else: master_res = image.run( key_name=opts.key_pair, security_group_ids=[master_group.id,discovery_group.id] + additional_group_ids, instance_type=master_type, placement=opts.zone, min_count=1, max_count=1, block_device_map=block_map, subnet_id=opts.subnet_id, placement_group=opts.placement_group, user_data=master_user_data_content, instance_initiated_shutdown_behavior=opts.instance_initiated_shutdown_behavior, instance_profile_name=opts.instance_profile_name) master_nodes = master_res.instances print("Launched master in %s, regid = %s" % (zone, master_res.id)) # This wait time corresponds to SPARK-4983 print("Waiting for AWS to propagate instance metadata...") time.sleep(15) # Give the instances descriptive names and set additional tags additional_tags = {} if opts.additional_tags.strip(): additional_tags = dict( map(str.strip, tag.split(':', 1)) for tag in opts.additional_tags.split(',') ) for master in master_nodes: master.add_tags( dict(additional_tags, Name='{cn}-master-{iid}'.format(cn=cluster_name, iid=master.id)) ) for slave in slave_nodes: slave.add_tags( dict(additional_tags, Name='{cn}-slave-{iid}'.format(cn=cluster_name, iid=slave.id)) ) # Return all the instances return (master_nodes, slave_nodes)
def launch_cluster(conn, opts, num_nodes, cluster_name): if opts.identity_file is None: print("ERROR: Must provide an identity file (-i) for ssh connections.", file=stderr) sys.exit(1) if opts.key_pair is None: print("ERROR: Must provide a key pair name (-k) to use on instances.", file=stderr) sys.exit(1) print("Setting up security groups...") slave_group = get_or_make_group(conn, cluster_name + "-slaves", opts.vpc_id) authorized_address = opts.authorized_address if slave_group.rules == []: # Group was just now created if opts.vpc_id is None: slave_group.authorize(src_group=slave_group) else: slave_group.authorize(ip_protocol='icmp', from_port=-1, to_port=-1, src_group=slave_group) slave_group.authorize(ip_protocol='tcp', from_port=0, to_port=65535, src_group=slave_group) slave_group.authorize(ip_protocol='udp', from_port=0, to_port=65535, src_group=slave_group) slave_group.authorize('tcp', 22, 22, authorized_address) # Check if instances are already running in our groups existing_slaves = get_existing_cluster(conn, opts, cluster_name, die_on_error=False) if existing_slaves: print("ERROR: There are already instances running in group %s" % slave_group.name, file=stderr) sys.exit(1) if opts.ami is None: print("ERROR: AMI is not set, exit") sys.exit(1) # we use group ids to work around https://github.com/boto/boto/issues/350 additional_group_ids = [] if opts.additional_security_group: additional_group_ids = [sg.id for sg in conn.get_all_security_groups() if opts.additional_security_group in (sg.name, sg.id)] print("Launching instances...") try: image = conn.get_all_images(image_ids=[opts.ami])[0] except: print("Could not find AMI " + opts.ami, file=stderr) sys.exit(1) # Create block device mapping so that we can add EBS volumes if asked to. # The first drive is attached as /dev/sds, 2nd as /dev/sdt, ... /dev/sdz block_map = BlockDeviceMapping() if opts.ebs_vol_size > 0: for i in range(opts.ebs_vol_num): device = EBSBlockDeviceType() device.size = opts.ebs_vol_size device.volume_type = opts.ebs_vol_type device.delete_on_termination = True block_map["/dev/sd" + chr(ord('s') + i)] = device # Launch slaves if opts.spot_price is not None: # Launch spot instances with the requested price print("Requesting %d slaves as spot instances with price $%.3f" % (num_nodes, opts.spot_price)) zones = get_zones(conn, opts) num_zones = len(zones) i = 0 my_req_ids = [] for zone in zones: num_slaves_this_zone = get_partition(num_nodes, num_zones, i) slave_reqs = conn.request_spot_instances( price=opts.spot_price, image_id=opts.ami, launch_group="launch-group-%s" % cluster_name, placement=zone, count=num_slaves_this_zone, key_name=opts.key_pair, security_group_ids=[slave_group.id] + additional_group_ids, instance_type=opts.instance_type, block_device_map=block_map, subnet_id=opts.subnet_id, placement_group=opts.placement_group, instance_profile_name=opts.instance_profile_name) my_req_ids += [req.id for req in slave_reqs] i += 1 print("Waiting for spot instances to be granted...") try: while True: time.sleep(10) reqs = conn.get_all_spot_instance_requests() id_to_req = {} for r in reqs: id_to_req[r.id] = r active_instance_ids = [] for i in my_req_ids: if i in id_to_req and id_to_req[i].state == "active": active_instance_ids.append(id_to_req[i].instance_id) if len(active_instance_ids) == num_nodes: print("All %d spot instances granted" % (num_nodes + 1)) reservations = conn.get_all_reservations(active_instance_ids) slave_nodes = [] for r in reservations: slave_nodes += r.instances break else: print("%d of %d slave spot instances granted, waiting longer" % ( len(active_instance_ids), num_nodes)) except: print("Canceling spot instance requests") conn.cancel_spot_instance_requests(my_req_ids) # Log a warning if any of these requests actually launched instances: slave_nodes = get_existing_cluster(conn, opts, cluster_name, die_on_error=False) running = len(slave_nodes) if running: print(("WARNING: %d instances are still running" % running), file=stderr) sys.exit(0) else: print ("WARNING: --spot-price was not set; consider launch slaves as spot instances to save money") # Launch non-spot instances zones = get_zones(conn, opts) num_zones = len(zones) i = 0 slave_nodes = [] for zone in zones: num_slaves_this_zone = get_partition(num_nodes, num_zones, i) if num_slaves_this_zone > 0: slave_res = image.run( key_name=opts.key_pair, security_group_ids=[slave_group.id] + additional_group_ids, instance_type=opts.instance_type, placement=zone, min_count=num_slaves_this_zone, max_count=num_slaves_this_zone, block_device_map=block_map, subnet_id=opts.subnet_id, placement_group=opts.placement_group, instance_initiated_shutdown_behavior=opts.instance_initiated_shutdown_behavior, instance_profile_name=opts.instance_profile_name) slave_nodes += slave_res.instances print("Launched {s} slave{plural_s} in {z}, regid = {r}".format( s=num_slaves_this_zone, plural_s=('' if num_slaves_this_zone == 1 else 's'), z=zone, r=slave_res.id)) i += 1 print("Waiting for AWS to propagate instance metadata...") time.sleep(15) # Give the instances descriptive names and set additional tags additional_tags = {} if opts.additional_tags.strip(): additional_tags = dict( map(str.strip, tag.split(':', 1)) for tag in opts.additional_tags.split(',') ) for slave in slave_nodes: slave.add_tags( dict(additional_tags, Name='{cn}-slave-{iid}'.format(cn=cluster_name, iid=slave.id)) ) # Return all the instances return slave_nodes
def launch_cluster(conn, opts, cluster_name): if opts.identity_file is None: print("ERROR: Must provide an identity file (-i) for ssh connections.", file=stderr) sys.exit(1) if opts.key_pair is None: print("ERROR: Must provide a key pair name (-k) to use on instances.", file=stderr) sys.exit(1) user_data_content = None print("Setting up security groups...") master_group = get_or_make_group(conn, cluster_name + "-master", opts.vpc_id) slave_group = get_or_make_group(conn, cluster_name + "-slaves", opts.vpc_id) authorized_address = opts.authorized_address if master_group.rules == []: # Group was just now created master_group.authorize(src_group=master_group) master_group.authorize(src_group=slave_group) master_group.authorize('tcp', 22, 22, authorized_address) if slave_group.rules == []: # Group was just now created slave_group.authorize(src_group=master_group) slave_group.authorize(src_group=slave_group) slave_group.authorize('tcp', 22, 22, authorized_address) # Check if instances are already running in our groups existing_masters, existing_slaves = get_existing_cluster( conn, opts, cluster_name, die_on_error=False) if existing_slaves or (existing_masters and not opts.use_existing_master): print("ERROR: There are already instances running in group %s or %s" % (master_group.name, slave_group.name), file=stderr) sys.exit(1) # Use the default Ubuntu AMI. if opts.ami is None: if opts.region == "us-east-1": opts.ami = "ami-2d39803a" elif opts.region == "us-west-1": opts.ami = "ami-06116566" elif opts.region == "us-west-2": opts.ami = "ami-9abea4fb" elif opts.region == "eu-west-1": opts.ami = "ami-f95ef58a" elif opts.region == "eu-central-1": opts.ami = "ami-87564feb" elif opts.region == "ap-northeast-1": opts.ami = "ami-a21529cc" elif opts.region == "ap-northeast-2": opts.ami = "ami-09dc1267" elif opts.region == "ap-southeast-1": opts.ami = "ami-25c00c46" elif opts.region == "ap-southeast-2": opts.ami = "ami-6c14310f" elif opts.region == "ap-south-1": opts.ami = "ami-4a90fa25" elif opts.region == "sa-east-1": opts.ami = "ami-0fb83963" else: raise Exception("The specified region is unknown.") # we use group ids to work around https://github.com/boto/boto/issues/350 additional_group_ids = [] if opts.additional_security_group: additional_group_ids = [ sg.id for sg in conn.get_all_security_groups() if opts.additional_security_group in (sg.name, sg.id) ] print("Launching instances...") try: image = conn.get_all_images(image_ids=[opts.ami])[0] except: print("Could not find AMI " + opts.ami, file=stderr) sys.exit(1) # Create block device mapping so that we can add EBS volumes if asked to. # The first drive is attached as /dev/sds, 2nd as /dev/sdt, ... /dev/sdz block_map = BlockDeviceMapping() if opts.ebs_vol_size > 0: for i in range(opts.ebs_vol_num): device = EBSBlockDeviceType() device.size = opts.ebs_vol_size device.volume_type = opts.ebs_vol_type device.delete_on_termination = True block_map["/dev/sd" + chr(ord('s') + i)] = device # AWS ignores the AMI-specified block device mapping for M3 (see SPARK-3342). if opts.instance_type.startswith('m3.'): for i in range(get_num_disks(opts.instance_type)): dev = BlockDeviceType() dev.ephemeral_name = 'ephemeral%d' % i # The first ephemeral drive is /dev/sdb. name = '/dev/sd' + string.ascii_letters[i + 1] block_map[name] = dev # Launch slaves if opts.spot_price is not None: # Launch spot instances with the requested price print("Requesting %d slaves as spot instances with price $%.3f" % (opts.slaves, opts.spot_price)) zones = get_zones(conn, opts) num_zones = len(zones) i = 0 my_req_ids = [] for zone in zones: num_slaves_this_zone = get_partition(opts.slaves, num_zones, i) slave_reqs = conn.request_spot_instances( price=opts.spot_price, image_id=opts.ami, launch_group="launch-group-%s" % cluster_name, placement=zone, count=num_slaves_this_zone, key_name=opts.key_pair, security_group_ids=[slave_group.id] + additional_group_ids, instance_type=opts.instance_type, block_device_map=block_map, subnet_id=opts.subnet_id, placement_group=opts.placement_group, user_data=user_data_content, instance_profile_name=opts.instance_profile_name) my_req_ids += [req.id for req in slave_reqs] i += 1 print("Waiting for spot instances to be granted...") try: while True: time.sleep(10) reqs = conn.get_all_spot_instance_requests() id_to_req = {} for r in reqs: id_to_req[r.id] = r active_instance_ids = [] for i in my_req_ids: if i in id_to_req and id_to_req[i].state == "active": active_instance_ids.append(id_to_req[i].instance_id) if len(active_instance_ids) == opts.slaves: print("All %d slaves granted" % opts.slaves) reservations = conn.get_all_reservations( active_instance_ids) slave_nodes = [] for r in reservations: slave_nodes += r.instances break else: print("%d of %d slaves granted, waiting longer" % (len(active_instance_ids), opts.slaves)) except: print("Canceling spot instance requests") conn.cancel_spot_instance_requests(my_req_ids) # Log a warning if any of these requests actually launched instances: (master_nodes, slave_nodes) = get_existing_cluster(conn, opts, cluster_name, die_on_error=False) running = len(master_nodes) + len(slave_nodes) if running: print(("WARNING: %d instances are still running" % running), file=stderr) sys.exit(0) else: # Launch non-spot instances zones = get_zones(conn, opts) num_zones = len(zones) i = 0 slave_nodes = [] for zone in zones: num_slaves_this_zone = get_partition(opts.slaves, num_zones, i) if num_slaves_this_zone > 0: slave_res = image.run( key_name=opts.key_pair, security_group_ids=[slave_group.id] + additional_group_ids, instance_type=opts.instance_type, placement=zone, min_count=num_slaves_this_zone, max_count=num_slaves_this_zone, block_device_map=block_map, subnet_id=opts.subnet_id, placement_group=opts.placement_group, user_data=user_data_content, instance_initiated_shutdown_behavior=opts. instance_initiated_shutdown_behavior, instance_profile_name=opts.instance_profile_name) slave_nodes += slave_res.instances print( "Launched {s} slave{plural_s} in {z}, regid = {r}".format( s=num_slaves_this_zone, plural_s=('' if num_slaves_this_zone == 1 else 's'), z=zone, r=slave_res.id)) i += 1 # Launch or resume masters if existing_masters: print("Starting master...") for inst in existing_masters: if inst.state not in ["shutting-down", "terminated"]: inst.start() master_nodes = existing_masters else: master_type = opts.master_instance_type if master_type == "": master_type = opts.instance_type if opts.zone == 'all': opts.zone = random.choice(conn.get_all_zones()).name master_res = image.run( key_name=opts.key_pair, security_group_ids=[master_group.id] + additional_group_ids, instance_type=master_type, placement=opts.zone, min_count=1, max_count=1, block_device_map=block_map, subnet_id=opts.subnet_id, placement_group=opts.placement_group, user_data=user_data_content, instance_initiated_shutdown_behavior=opts. instance_initiated_shutdown_behavior, instance_profile_name=opts.instance_profile_name) master_nodes = master_res.instances print("Launched master in %s, regid = %s" % (zone, master_res.id)) # This wait time corresponds to SPARK-4983 print("Waiting for AWS to propagate instance metadata...") time.sleep(15) # Give the instances descriptive names and set additional tags additional_tags = {} if opts.additional_tags.strip(): additional_tags = dict( map(str.strip, tag.split(':', 1)) for tag in opts.additional_tags.split(',')) for master in master_nodes: master.add_tags( dict(additional_tags, Name='{cn}-master-{iid}'.format(cn=cluster_name, iid=master.id))) for slave in slave_nodes: slave.add_tags( dict(additional_tags, Name='{cn}-slave-{iid}'.format(cn=cluster_name, iid=slave.id))) # Return all the instances return (master_nodes, slave_nodes)