def main(args):
    """Remove launch configurations that no autoscaling group refers to.

    Every EC2 region whose name is not in ``args.excluded_regions`` is
    visited. A launch configuration whose name is not the
    ``launch_config_name`` of any autoscaling group in that region is
    reported as abandoned and, unless this is a dry run, passed to
    ``kill_with_fire``.

    Any exception is handed to ``handle_exception`` and the process then
    exits with status 1.
    """
    global verbose
    global dry_run
    (verbose, dry_run) = dry_run_necessaries(args.dry_run, args.verbose)
    region_names = [
        r.name for r in boto.ec2.regions()
        if r.name not in args.excluded_regions
    ]
    for region in region_names:
        try:
            script_name = os.path.basename(__file__)
            print_verbose(script_name, 'info', 'Starting pass on %s' % region)
            as_conn = boto.ec2.autoscale.connect_to_region(region)
            all_launch_configs = as_conn.get_all_launch_configurations()
            as_groups = as_conn.get_all_groups()
            for launch_config in all_launch_configs:
                still_referenced = any(
                    group.launch_config_name == launch_config.name
                    for group in as_groups)
                if still_referenced:
                    continue
                print_verbose(
                    script_name, 'info',
                    "Launch config %s looks to be abandoned."
                    % launch_config.name)
                if dry_run:
                    # Report only; nothing is deleted on a dry run.
                    continue
                print_verbose(script_name, 'info', "DESTROY!")
                kill_with_fire(launch_config)
            print_verbose(script_name, 'info', 'Done with pass on %s' % region)
        except Exception as e:
            handle_exception(e)
            sys.exit(1)
def main(args):
    """Remove launch configurations that no autoscaling group refers to.

    Every EC2 region whose name is not in ``args.excluded_regions`` is
    visited. A launch configuration whose name is not the
    ``launch_config_name`` of any autoscaling group in that region is
    reported as abandoned and, unless this is a dry run, passed to
    ``kill_with_fire``.

    Any exception is handed to ``handle_exception`` and the process then
    exits with status 1.
    """
    global verbose
    global dry_run
    (verbose, dry_run) = dry_run_necessaries(args.dry_run, args.verbose)
    region_names = [
        r.name for r in boto.ec2.regions()
        if r.name not in args.excluded_regions
    ]
    for region in region_names:
        try:
            script_name = os.path.basename(__file__)
            print_verbose(script_name, 'info', 'Starting pass on %s' % region)
            as_conn = boto.ec2.autoscale.connect_to_region(region)
            all_launch_configs = as_conn.get_all_launch_configurations()
            as_groups = as_conn.get_all_groups()
            for launch_config in all_launch_configs:
                still_referenced = any(
                    group.launch_config_name == launch_config.name
                    for group in as_groups)
                if still_referenced:
                    continue
                print_verbose(
                    script_name, 'info',
                    "Launch config %s looks to be abandoned."
                    % launch_config.name)
                if dry_run:
                    # Report only; nothing is deleted on a dry run.
                    continue
                print_verbose(script_name, 'info', "DESTROY!")
                kill_with_fire(launch_config)
            print_verbose(script_name, 'info', 'Done with pass on %s' % region)
        except Exception as e:
            handle_exception(e)
            sys.exit(1)
def get_ondemand_price(launch_config):
    """Return the on-demand USD price for ``launch_config``'s instance type.

    The price list is downloaded from the URL produced by ``get_price_url``.
    The payload is wrapped in a ``callback(...)`` call, so it is unwrapped
    before being decoded with ``demjson`` and searched for the launch
    config's region and instance type.

    Args:
        launch_config: launch configuration; ``connection.region.name``,
            ``image_id`` and ``instance_type`` are read from it.

    Returns:
        float: the 'USD' price from the first value column of the matching
        instance size.

    Any failure (request timeout, HTTP error status, unparsable payload,
    region or instance type missing from the price list) is passed to
    ``handle_exception`` and the process exits with status 1.
    """
    try:
        region = launch_config.connection.region.name
        instance_type = launch_config.instance_type
        ec2_conn = boto.ec2.connect_to_region(region)
        # The returned image is not used; the lookup is kept from the
        # original flow (presumably a check that the AMI can still be
        # resolved - TODO confirm).
        ec2_conn.get_image(launch_config.image_id)
        url = get_price_url(launch_config)
        # Bound the request so a stalled endpoint cannot hang the run, and
        # fail on an HTTP error status instead of parsing an error page.
        resp = requests.get(url, timeout=30)
        resp.raise_for_status()
        # need to remove comments and callback syntax before parsing the
        # broken json
        json_str = str(resp.text.split('callback(')[1])[:-2]
        prices_dict = demjson.decode(json_str)['config']['regions']

        regional_matches = [r for r in prices_dict if r['region'] == region]
        if not regional_matches:
            raise Exception(
                "No on demand pricing found for region %s" % region)
        regional_prices_json = regional_matches[0]['instanceTypes']

        class_matches = [
            r for r in regional_prices_json
            if instance_type in [e['size'] for e in r['sizes']]
        ]
        if not class_matches:
            raise Exception(
                "No on demand pricing found for %s in %s"
                % (instance_type, region))
        instance_class_prices_json = class_matches[0]['sizes']

        # A match is guaranteed here by the class filter above.
        size_entry = [
            e for e in instance_class_prices_json
            if e['size'] == instance_type
        ][0]
        price = float(size_entry['valueColumns'][0]['prices']['USD'])
        print_verbose(
            os.path.basename(__file__), 'info',
            "On demand price for %s in %s is %s"
            % (instance_type, region, price))
        return price
    except Exception as e:
        handle_exception(e)
        sys.exit(1)
def find_best_bid_price(as_group):
    """Work out the spot bid needed to cover the group's minimum AZ count.

    The current spot prices are sorted ascending and the price at position
    ``get_min_azs(as_group) - 1`` is taken as the best bid.

    Returns:
        ``False`` when the rounded best bid is at or above the rounded
        maximum bid, or when the rounded current bid is at or above the
        rounded on-demand price; otherwise the rounded best bid.

    An exception is raised internally when the number of prices differs
    from the number of usable zones. Every exception is handed to
    ``handle_exception`` and the process exits with status 1.
    """
    try:
        this_file = os.path.basename(__file__)
        prices = get_current_spot_prices(as_group)
        print_verbose(this_file, 'info', prices)
        if len(prices) != len(get_usable_zones(as_group)):
            details = (str(prices), str(get_usable_zones(as_group)))
            raise Exception(
                "Different number of AZs found than expected. Prices = %s\nAZs = %s"
                % details)

        cheapest_first = sorted(prices, key=lambda price: price.price)
        best_bid = cheapest_first[int(get_min_azs(as_group)) - 1].price
        print_verbose(this_file, 'info', 'best_bid=', best_bid)
        max_bid = get_max_bid(as_group)
        print_verbose(this_file, 'info', 'max_bid=', max_bid)

        # since ondemand instances are faster to spin up and more
        # available, if demand and max_bid are equal, ondemand should win
        # out.
        if get_rounded_price(best_bid) >= get_rounded_price(max_bid):
            return False
        current_bid = get_rounded_price(get_bid(as_group))
        ondemand_price = get_rounded_price(
            get_ondemand_price(get_launch_config(as_group)))
        if current_bid >= ondemand_price:
            return False
        return get_rounded_price(best_bid)
    except Exception as e:
        handle_exception(e)
        sys.exit(1)
def get_ondemand_price(launch_config):
    """Return the on-demand USD price for ``launch_config``'s instance type.

    The price list is downloaded from the URL produced by ``get_price_url``.
    The payload is wrapped in a ``callback(...)`` call, so it is unwrapped
    before being decoded with ``demjson`` and searched for the launch
    config's region and instance type.

    Args:
        launch_config: launch configuration; ``connection.region.name``,
            ``image_id`` and ``instance_type`` are read from it.

    Returns:
        float: the 'USD' price from the first value column of the matching
        instance size.

    Any failure (request timeout, HTTP error status, unparsable payload,
    region or instance type missing from the price list) is passed to
    ``handle_exception`` and the process exits with status 1.
    """
    try:
        region = launch_config.connection.region.name
        instance_type = launch_config.instance_type
        ec2_conn = boto.ec2.connect_to_region(region)
        # The returned image is not used; the lookup is kept from the
        # original flow (presumably a check that the AMI can still be
        # resolved - TODO confirm).
        ec2_conn.get_image(launch_config.image_id)
        url = get_price_url(launch_config)
        # Bound the request so a stalled endpoint cannot hang the run, and
        # fail on an HTTP error status instead of parsing an error page.
        resp = requests.get(url, timeout=30)
        resp.raise_for_status()
        # need to remove comments and callback syntax before parsing the
        # broken json
        json_str = str(resp.text.split('callback(')[1])[:-2]
        prices_dict = demjson.decode(json_str)['config']['regions']

        regional_matches = [r for r in prices_dict if r['region'] == region]
        if not regional_matches:
            raise Exception(
                "No on demand pricing found for region %s" % region)
        regional_prices_json = regional_matches[0]['instanceTypes']

        class_matches = [
            r for r in regional_prices_json
            if instance_type in [e['size'] for e in r['sizes']]
        ]
        if not class_matches:
            raise Exception(
                "No on demand pricing found for %s in %s"
                % (instance_type, region))
        instance_class_prices_json = class_matches[0]['sizes']

        # A match is guaranteed here by the class filter above.
        size_entry = [
            e for e in instance_class_prices_json
            if e['size'] == instance_type
        ][0]
        price = float(size_entry['valueColumns'][0]['prices']['USD'])
        print_verbose(
            os.path.basename(__file__), 'info',
            "On demand price for %s in %s is %s"
            % (instance_type, region, price))
        return price
    except Exception as e:
        handle_exception(e)
        sys.exit(1)
# modify_price: clone the group's current launch configuration with
# spot_price=new_bid and move the matching autoscaling groups onto the clone.
#
# - The group is reloaded first and its current launch config fetched with
#   get_launch_config().
# - The new LC name is the old name minus its last 13 characters, followed by
#   'ssr' and 10 random uppercase letters/digits. All other LC fields are
#   copied from the old launch config.
# - Every group returned by get_all_groups() whose launch_config_name equals
#   the old LC name gets the new name; the 'LC_name' entry of its
#   'ssr_config' tag is set to the last 155 characters of the new LC name.
# - When new_bid is falsy, 'demand_expiration' is set to
#   now + demand_expiration * minutes_multiplier, so both optional arguments
#   must be supplied in that case (their None defaults cannot be multiplied).
# - The old launch config is deleted at the end.
# - Any exception goes to handle_exception() and the process exits with 1.
#
# NOTE(review): the loop variable reuses the name of the `as_group`
# parameter, so the parameter is shadowed from the loop onwards.
# NOTE(review): create_launch_configuration() and old_launch_config.delete()
# do not appear to be guarded by dry_run - confirm against the indented
# original, since the nesting is not recoverable from this flattened text.
def modify_price(as_group, new_bid, dry_run, minutes_multiplier=None, demand_expiration=None): try: as_group = reload_as_group(as_group) as_conn = boto.ec2.autoscale.connect_to_region( as_group.connection.region.name) old_launch_config = get_launch_config(as_group) new_launch_config_name = old_launch_config.name[ :-13] + 'ssr' + ''.join(random.choice(string.ascii_uppercase + string.digits) for _ in range(10)) launch_config = LaunchConfiguration( image_id=old_launch_config.image_id, key_name=old_launch_config.key_name, security_groups=old_launch_config.security_groups, user_data=old_launch_config.user_data, instance_type=old_launch_config.instance_type, kernel_id=old_launch_config.kernel_id, ramdisk_id=old_launch_config.ramdisk_id, block_device_mappings=old_launch_config.block_device_mappings, instance_monitoring=old_launch_config.instance_monitoring.enabled, instance_profile_name=old_launch_config.instance_profile_name, ebs_optimized=old_launch_config.ebs_optimized, associate_public_ip_address=old_launch_config.associate_public_ip_address, volume_type=old_launch_config.volume_type, delete_on_termination=old_launch_config.delete_on_termination, iops=old_launch_config.iops, use_block_device_types=old_launch_config.use_block_device_types, spot_price=new_bid, # new values name=new_launch_config_name, ) as_conn.create_launch_configuration(launch_config) print_verbose(os.path.basename( __file__), 'info', "Created LC %s with price %s." 
% (launch_config.name, new_bid)) as_groups = [a for a in as_group.connection.get_all_groups( ) if old_launch_config.name == a.launch_config_name] for as_group in as_groups: as_group.launch_config_name = launch_config.name if not dry_run: print_verbose(os.path.basename(__file__), 'info', "Applying new LC to ASG %s" % as_group.name) as_group.update() set_tag_dict_value( as_group, 'ssr_config', 'LC_name', launch_config.name[-155:]) if not new_bid: set_tag_dict_value(as_group, 'ssr_config', 'demand_expiration', int( time.time()) + (demand_expiration * minutes_multiplier)) modify_as_group_azs( as_group, get_usable_zones(as_group), dry_run) print_verbose(os.path.basename(__file__), 'info', "Autoscaling group launch configuration update complete.") print_verbose(os.path.basename(__file__), 'info', "Deleting old launch_config: %s" % old_launch_config) old_launch_config.delete() # XXX is this actually working? except Exception as e: handle_exception(e) sys.exit(1)
def terminate_instance(instance):
    """Terminate ``instance`` via its own connection, retrying on boto errors.

    The termination is requested with ``decrement_capacity=False`` and is
    followed by a 30 second pause. A ``BotoServerError`` is handed to
    ``throttle_response`` and the termination is attempted again; any other
    exception goes to ``handle_exception`` and the process exits with
    status 1.
    """
    while True:
        try:
            instance.connection.terminate_instance(
                instance.instance_id, decrement_capacity=False)
            time.sleep(30)
            return None
        except BotoServerError as e:
            # Let the shared helper react to the error, then go round again.
            throttle_response(e)
        except Exception as e:
            handle_exception(e)
            sys.exit(1)
def terminate_instance(instance):
    """Terminate ``instance`` via its own connection, retrying on boto errors.

    The termination is requested with ``decrement_capacity=False`` and is
    followed by a 30 second pause. A ``BotoServerError`` is handed to
    ``throttle_response`` and the termination is attempted again; any other
    exception goes to ``handle_exception`` and the process exits with
    status 1.
    """
    while True:
        try:
            instance.connection.terminate_instance(
                instance.instance_id, decrement_capacity=False)
            time.sleep(30)
            return None
        except BotoServerError as e:
            # Let the shared helper react to the error, then go round again.
            throttle_response(e)
        except Exception as e:
            handle_exception(e)
            sys.exit(1)
def get_max_bid(as_group):
    """Return the higher of the on-demand price and the group's original bid.

    The two values are compared after ``get_rounded_price``; the unrounded
    value is returned. On a tie the original bid (read from the
    'ssr_config' tag) is returned.

    Any exception is handed to ``handle_exception`` and the process exits
    with status 1.
    """
    try:
        demand_price = get_ondemand_price(get_launch_config(as_group))
        ssr_config = get_tag_dict_value(as_group, 'ssr_config')
        original_bid = ssr_config['original_bid']
        rounded_demand = get_rounded_price(demand_price)
        rounded_original = get_rounded_price(original_bid)
        if rounded_demand <= rounded_original:
            return original_bid
        return demand_price
    except Exception as e:
        handle_exception(e)
        sys.exit(1)
def get_max_bid(as_group):
    """Return the higher of the on-demand price and the group's original bid.

    The two values are compared after ``get_rounded_price``; the unrounded
    value is returned. On a tie the original bid (read from the
    'ssr_config' tag) is returned.

    Any exception is handed to ``handle_exception`` and the process exits
    with status 1.
    """
    try:
        demand_price = get_ondemand_price(get_launch_config(as_group))
        ssr_config = get_tag_dict_value(as_group, 'ssr_config')
        original_bid = ssr_config['original_bid']
        rounded_demand = get_rounded_price(demand_price)
        rounded_original = get_rounded_price(original_bid)
        if rounded_demand <= rounded_original:
            return original_bid
        return demand_price
    except Exception as e:
        handle_exception(e)
        sys.exit(1)
def init_az_status_tag(as_group):
    """Create the group's "AZ_status" tag with a fresh entry per zone.

    Every zone returned by ``get_all_zones()`` for the group's region gets
    an entry keyed by the last character of its name. ``"use"`` is True
    when the zone name is among ``get_potential_azs(as_group)`` and False
    otherwise; ``"health"`` always starts as ``[0, 0, 0]``.

    Returns:
        Whatever ``create_tag`` returns for the new tag.

    Any exception is handed to ``handle_exception`` and the process exits
    with status 1.
    """
    try:
        potential_zones = get_potential_azs(as_group)
        region_name = as_group.connection.region.name
        ec2_conn = boto.ec2.connect_to_region(region_name)
        zone_dict = {}
        for zone in ec2_conn.get_all_zones():
            zone_letter = zone.name[-1]
            zone_dict[zone_letter] = {
                "use": zone.name in potential_zones,
                "health": [0, 0, 0],
            }
        return create_tag(as_group, "AZ_status", zone_dict)
    except Exception as e:
        handle_exception(e)
        sys.exit(1)
def init_ssr_config_tag(as_group, min_healthy_azs):
    """Create the group's 'ssr_config' tag with its initial values.

    The tag records that management is enabled, the group's current bid as
    the original bid, the minimum AZ count, the tail of the launch config
    name and no demand expiration.

    Any exception is handed to ``handle_exception`` and the process exits
    with status 1.
    """
    try:
        original_bid = get_bid(as_group)
        # LC name size can be up to 255 chars (also tag value max length).
        # Final chars should be unique so we cut this short
        lc_name_tail = as_group.launch_config_name[-155:]
        config_dict = {
            'enabled': True,
            'original_bid': original_bid,
            'min_AZs': min_healthy_azs,
            'LC_name': lc_name_tail,
            'demand_expiration': False,
        }
        create_tag(as_group, 'ssr_config', config_dict)
    except Exception as e:
        handle_exception(e)
        sys.exit(1)
def modify_as_group_azs(as_group, healthy_zones, dry_run):
    """Set the autoscaling group's availability zones to ``healthy_zones``.

    The group is reloaded, its ``availability_zones`` replaced, and the
    change pushed with ``update()`` unless ``dry_run`` is true.

    Args:
        as_group: autoscaling group to modify.
        healthy_zones: zone names to assign to the group.
        dry_run: when true, the change is logged but not sent.

    Error handling:
        * A ``BotoServerError`` with error code 'Throttling' causes a one
          second pause and another attempt. The retry is a loop rather than
          recursion, so long throttling periods cannot exhaust the stack.
        * Any other ``BotoServerError`` is reported through
          ``handle_exception`` and the function returns without retrying.
          The handler previously had no branch for this case, so the error
          was not reported.
        * Any other exception is reported and the process exits with 1.
    """
    this_file = os.path.basename(__file__)
    while True:
        try:
            as_group = reload_as_group(as_group)
            as_group.availability_zones = healthy_zones
            print_verbose(this_file, 'info',
                          "Updating with AZs %s" % healthy_zones)
            if not dry_run:
                as_group.update()
            return
        except BotoServerError as e:
            if e.error_code != 'Throttling':
                # Report instead of dropping the error; the caller carries
                # on as it did before.
                handle_exception(e)
                return
            print_verbose(this_file, 'info', 'Pausing for aws throttling...')
            time.sleep(1)
        except Exception as e:
            handle_exception(e)
            sys.exit(1)
def modify_as_group_azs(as_group, healthy_zones, dry_run):
    """Set the autoscaling group's availability zones to ``healthy_zones``.

    The group is reloaded, its ``availability_zones`` replaced, and the
    change pushed with ``update()`` unless ``dry_run`` is true.

    Args:
        as_group: autoscaling group to modify.
        healthy_zones: zone names to assign to the group.
        dry_run: when true, the change is logged but not sent.

    Error handling:
        * A ``BotoServerError`` with error code 'Throttling' causes a one
          second pause and another attempt. The retry is a loop rather than
          recursion, so long throttling periods cannot exhaust the stack.
        * Any other ``BotoServerError`` is reported through
          ``handle_exception`` and the function returns without retrying.
          The handler previously had no branch for this case, so the error
          was not reported.
        * Any other exception is reported and the process exits with 1.
    """
    this_file = os.path.basename(__file__)
    while True:
        try:
            as_group = reload_as_group(as_group)
            as_group.availability_zones = healthy_zones
            print_verbose(this_file, 'info',
                          "Updating with AZs %s" % healthy_zones)
            if not dry_run:
                as_group.update()
            return
        except BotoServerError as e:
            if e.error_code != 'Throttling':
                # Report instead of dropping the error; the caller carries
                # on as it did before.
                handle_exception(e)
                return
            print_verbose(this_file, 'info', 'Pausing for aws throttling...')
            time.sleep(1)
        except Exception as e:
            handle_exception(e)
            sys.exit(1)
# main: per-region pass that records spot-price health for SSR-managed groups.
#
# For every EC2 region whose name is not in args.excluded_regions:
# - groups come from get_ssr_groups(as_conn);
# - for each group, every current spot price is compared with the group's
#   bid and a dict keyed by the last character of the price's AZ name is
#   filled with 1 when price.price > bid, else 0;
# - that dict is passed to update_az_health_list_tag() and the result is
#   collected in health_tags;
# - the collected tags are written with update_tags() only when there are
#   any and dry_run is false.
# An EC2ResponseError is reported through handle_exception(); any other
# exception is reported and main returns 1.
# NOTE(review): nesting is not recoverable from this flattened text; the
# description assumes the tag is appended once per group that has prices.
def main(args): (verbose, dry_run) = dry_run_necessaries(args.dry_run, args.verbose) for region in [ r.name for r in boto.ec2.regions() if r.name not in args.excluded_regions ]: try: print_verbose(os.path.basename(__file__), 'info', 'Starting pass on %s' % region) as_conn = boto.ec2.autoscale.connect_to_region(region) as_groups = get_ssr_groups(as_conn) health_tags = [] for as_group in as_groups: bid = get_bid(as_group) current_prices = get_current_spot_prices(as_group) health_dict = {} if current_prices: print_verbose(os.path.basename(__file__), 'info', "Updating health for %s" % as_group.name) for price in current_prices: # * 1.1: #NOTE: potential feature to require a price buffer here? if price.price > bid: health_dict[price.availability_zone[-1]] = 1 else: health_dict[price.availability_zone[-1]] = 0 health_tags.append( update_az_health_list_tag(as_group, health_dict)) if health_tags and not dry_run: update_tags(as_conn, health_tags) print_verbose(os.path.basename(__file__), 'info', "All tags updated!") print_verbose(os.path.basename(__file__), 'info', 'Done with pass on %s' % region) except EC2ResponseError as e: handle_exception(e) except Exception as e: handle_exception(e) return 1 print_verbose(os.path.basename(__file__), 'info', "All regions complete")
# maximize_elb_azs: make each ELB attached to the group use the group's
# usable availability zones.
#
# For every name in as_group.load_balancers, the first result of
# get_all_load_balancers(name) is compared (both sides sorted) with
# get_usable_zones(as_group). On a mismatch, and only when dry_run is false,
# zones the ELB has but the group cannot use are disabled and the usable
# zones are enabled.
# If that raises an error whose error_code is 'ValidationError' and whose
# message contains 'is constrained and cannot be used together with', the
# message is searched for an "<az> and <az>" pair. The first AZ of the pair
# is reduced to the second character of its last dash-separated piece, and
# that value is passed to mark_asg_az_disabled(); the returned tag is
# written with update_tags().
# Any exception reaching the outer handler goes to handle_exception() and
# the process exits with status 1.
#
# NOTE(review): the inner handler catches bare Exception but reads
# e.error_code and e.message; an exception without those attributes would
# raise AttributeError inside the handler. Errors that do not match the
# condition are not re-raised or reported there - confirm this is intended.
# NOTE(review): in_lb_but_not_asg / in_asg_but_not_lb repeat the set
# differences already tested by the enclosing condition.
def maximize_elb_azs(elb_conn, as_group, dry_run): try: this_file = os.path.basename(__file__) for elb_name in as_group.load_balancers: elb = elb_conn.get_all_load_balancers(elb_name)[0] if not sorted(elb.availability_zones) == sorted(get_usable_zones(as_group)): print_verbose(os.path.basename( __file__), 'info', "AZs for ELB don't include all potential AZs. Removing unusable zones and adding the rest now.") if not dry_run: if len(list(set(elb.availability_zones) - set(get_usable_zones(as_group)))) > 0 or len(list(set(get_usable_zones(as_group)) - set(elb.availability_zones))) > 0: in_lb_but_not_asg = list( set(elb.availability_zones) - set(get_usable_zones(as_group))) in_asg_but_not_lb = list( set(get_usable_zones(as_group)) - set(elb.availability_zones)) if len(in_asg_but_not_lb) > 0 or len(in_lb_but_not_asg) > 0: try: if len(list(set(elb.availability_zones) - set(get_usable_zones(as_group)))) > 0: elb.disable_zones( list(set(elb.availability_zones) - set(get_usable_zones(as_group)))) elb.enable_zones(get_usable_zones(as_group)) except Exception as e: if e.error_code == 'ValidationError' and 'is constrained and cannot be used together with' in e.message: print_verbose( this_file, 'info', 'Conflict found between two AZs. Removing one of them from use.') pattern = re.compile( r'\b\w{2}-\w{4,}-\d\w and \w{2}-\w{4,}-\d\w\b') match = pattern.search(e.message) if match: bad_az = match.group().split()[0].split( '-')[-1][1] print_verbose( this_file, 'info', 'Removing %s from potential AZs as it confilcts with another AZ.' % bad_az) # smarter here would be to figure out # which AZ is a better choice new_tag = mark_asg_az_disabled( as_group, bad_az) update_tags( as_group.connection, [new_tag]) except Exception as e: handle_exception(e) sys.exit(1)
# main: per-region pass that records spot-price health for SSR-managed groups.
#
# For every EC2 region whose name is not in args.excluded_regions:
# - groups come from get_ssr_groups(as_conn);
# - for each group, every current spot price is compared with the group's
#   bid and a dict keyed by the last character of the price's AZ name is
#   filled with 1 when price.price > bid, else 0;
# - that dict is passed to update_az_health_list_tag() and the result is
#   collected in health_tags;
# - the collected tags are written with update_tags() only when there are
#   any and dry_run is false.
# An EC2ResponseError is reported through handle_exception(); any other
# exception is reported and main returns 1.
# NOTE(review): nesting is not recoverable from this flattened text; the
# description assumes the tag is appended once per group that has prices.
def main(args): (verbose, dry_run) = dry_run_necessaries(args.dry_run, args.verbose) for region in [r.name for r in boto.ec2.regions() if r.name not in args.excluded_regions]: try: print_verbose( os.path.basename(__file__), 'info', 'Starting pass on %s' % region) as_conn = boto.ec2.autoscale.connect_to_region(region) as_groups = get_ssr_groups(as_conn) health_tags = [] for as_group in as_groups: bid = get_bid(as_group) current_prices = get_current_spot_prices(as_group) health_dict = {} if current_prices: print_verbose( os.path.basename(__file__), 'info', "Updating health for %s" % as_group.name) for price in current_prices: # * 1.1: #NOTE: potential feature to require a price buffer here? if price.price > bid: health_dict[price.availability_zone[-1]] = 1 else: health_dict[price.availability_zone[-1]] = 0 health_tags.append( update_az_health_list_tag(as_group, health_dict)) if health_tags and not dry_run: update_tags(as_conn, health_tags) print_verbose( os.path.basename(__file__), 'info', "All tags updated!") print_verbose( os.path.basename(__file__), 'info', 'Done with pass on %s' % region) except EC2ResponseError as e: handle_exception(e) except Exception as e: handle_exception(e) return 1 print_verbose(os.path.basename(__file__), 'info', "All regions complete")
def find_best_bid_price(as_group):
    """Work out the spot bid needed to cover the group's minimum AZ count.

    The current spot prices are sorted ascending and the price at position
    ``get_min_azs(as_group) - 1`` is taken as the best bid.

    Returns:
        ``False`` when the rounded best bid is at or above the rounded
        maximum bid, or when the rounded current bid is at or above the
        rounded on-demand price; otherwise the rounded best bid.

    An exception is raised internally when the number of prices differs
    from the number of usable zones. Every exception is handed to
    ``handle_exception`` and the process exits with status 1.
    """
    try:
        this_file = os.path.basename(__file__)
        prices = get_current_spot_prices(as_group)
        print_verbose(this_file, 'info', prices)
        if len(prices) != len(get_usable_zones(as_group)):
            details = (str(prices), str(get_usable_zones(as_group)))
            raise Exception(
                "Different number of AZs found than expected. Prices = %s\nAZs = %s"
                % details)

        cheapest_first = sorted(prices, key=lambda price: price.price)
        best_bid = cheapest_first[int(get_min_azs(as_group)) - 1].price
        print_verbose(this_file, 'info', 'best_bid=', best_bid)
        max_bid = get_max_bid(as_group)
        print_verbose(this_file, 'info', 'max_bid=', max_bid)

        # since ondemand instances are faster to spin up and more
        # available, if demand and max_bid are equal, ondemand should win
        # out.
        if get_rounded_price(best_bid) >= get_rounded_price(max_bid):
            return False
        current_bid = get_rounded_price(get_bid(as_group))
        ondemand_price = get_rounded_price(
            get_ondemand_price(get_launch_config(as_group)))
        if current_bid >= ondemand_price:
            return False
        return get_rounded_price(best_bid)
    except Exception as e:
        handle_exception(e)
        sys.exit(1)
# main: per-region pass that creates or validates the 'ssr_config' and
# 'AZ_status' tags on autoscaling groups.
#
# Candidate groups are the union of (a) groups whose launch configuration
# has a spot_price and (b) groups whose 'ssr_config' tag has
# 'enabled' is True. For each candidate:
# - if args.reset_tags is set, or the group has no 'ssr_config' tag, or the
#   tag value cannot be read, both tags are initialised from scratch;
# - if the tag exists with a falsy 'enabled', nothing is done;
# - if the tag exists with a truthy 'enabled', its keys are verified and its
#   'LC_name' compared with the last 155 characters of the group's
#   launch_config_name. On failure: when the launch config has no
#   spot_price the SSR tags are deleted and the group skipped, otherwise the
#   'ssr_config' tag is re-initialised. The 'AZ_status' tag is re-created
#   when its keys do not match the last character of every zone name;
# - otherwise an Exception is raised.
# EC2ResponseError and BotoServerError are reported through
# handle_exception(); any other exception is reported and main returns 1.
# NOTE(review): branch nesting is not recoverable from this flattened text;
# confirm the AZ_status check placement against the indented original.
def main(args): global verbose global dry_run (verbose, dry_run) = dry_run_necessaries(args.dry_run, args.verbose) for region in [ r.name for r in boto.ec2.regions() if r.name not in args.excluded_regions ]: try: print_verbose(os.path.basename(__file__), 'info', 'Starting pass on %s' % region) ec2_conn = boto.ec2.connect_to_region(region) as_conn = boto.ec2.autoscale.connect_to_region(region) all_groups = as_conn.get_all_groups() spot_lcs = [ e for e in as_conn.get_all_launch_configurations() if e.spot_price ] # these need to be pulled from the same all_groups list or # duplicate objects will be seen as distinct. spot_lc_groups = [ g for g in all_groups if g.launch_config_name in [s.name for s in spot_lcs] ] previously_ssr_managed_groups = [ g for g in all_groups if get_tag_dict_value(g, 'ssr_config') and get_tag_dict_value(g, 'ssr_config')['enabled'] is True ] all_groups = list( set(spot_lc_groups + previously_ssr_managed_groups)) for as_group in all_groups: print_verbose(os.path.basename(__file__), 'info', "Evaluating %s" % as_group.name) # this latter condition can happen when tag value (a dict) # can't be interpreted by ast.literal_eval() if args.reset_tags or not [ t for t in as_group.tags if t.key == 'ssr_config' ] or not get_tag_dict_value(as_group, 'ssr_config'): print_verbose( os.path.basename(__file__), 'info', 'Tags not found or reset tags option flagged. Adding all tags anew now.' ) init_ssr_config_tag(as_group, args.min_healthy_AZs) init_az_status_tag(as_group) elif [ t for t in as_group.tags if t.key == 'ssr_config' and not get_tag_dict_value( as_group, 'ssr_config')['enabled'] ]: print_verbose(os.path.basename(__file__), 'info', 'ssr_config DISABLED. Doing nothing.') elif [ t for t in as_group.tags if t.key == 'ssr_config' and get_tag_dict_value( as_group, 'ssr_config')['enabled'] ]: print_verbose( os.path.basename(__file__), 'info', 'ssr management enabled. Verifying all config values in place.' 
) config_keys = [ 'enabled', 'original_bid', 'LC_name', 'min_AZs', 'demand_expiration' ] if not verify_tag_dict_keys(as_group, 'ssr_config', config_keys) or \ not get_tag_dict_value(as_group, 'ssr_config')['LC_name'] == as_group.launch_config_name[-155:]: # this would indicate a change to the LC outside of ssr # scope. In that case, we need to disable ssr via tag # deletion. if not get_launch_config(as_group).spot_price: del_ssr_tags(as_group) continue else: init_ssr_config_tag(as_group, args.min_healthy_AZs) zones = [z.name[-1] for z in ec2_conn.get_all_zones()] if not verify_tag_dict_keys(as_group, 'AZ_status', zones): init_az_status_tag(as_group) else: raise Exception( "ssr_enabled tag found for %s but isn't a valid value." % (as_group.name, )) print_verbose(os.path.basename(__file__), 'info', 'Done with pass on %s' % region) except EC2ResponseError as e: handle_exception(e) except BotoServerError as e: handle_exception(e) except Exception as e: handle_exception(e) return 1 print_verbose(os.path.basename(__file__), 'info', "All regions complete")
# main: per-region pass that keeps SSR-managed autoscaling groups on spot
# instances where possible and falls back to on-demand otherwise.
#
# For each group from get_ssr_groups(): the group is reloaded, its ELBs (if
# any) are aligned with maximize_elb_azs(), 'demand_expiration' is read from
# the 'ssr_config' tag and the healthy zones are computed with
# get_healthy_zones(as_group, args.min_health_threshold). Then:
# - demand_expiration is set (is not False) and already in the past:
#     * enough healthy zones (>= get_min_azs): switch the group to the
#       healthy zones, restore the original bid with modify_price(), clear
#       demand_expiration, and terminate the group's instances that have no
#       spot_instance_request_id (termination is skipped on a dry run);
#     * otherwise push demand_expiration out by
#       args.demand_expiration * minutes_multiplier (60) seconds.
# - else, the group's zones differ from the healthy zones:
#     * enough healthy zones: switch the group to them;
#     * otherwise ask find_best_bid_price(); a truthy result becomes the new
#       bid, a falsy one moves the group to on-demand (modify_price with
#       None), sets demand_expiration and assigns the usable zones.
# - else: nothing to do.
# An EC2ResponseError is reported through handle_exception(); any other
# exception is reported and main returns 1.
# NOTE(review): branch nesting is reconstructed from flattened text -
# confirm against the indented original before relying on this summary.
def main(args): global verbose global dry_run (verbose, dry_run) = dry_run_necessaries(args.dry_run, args.verbose) for region in [r.name for r in boto.ec2.regions() if r.name not in args.excluded_regions]: try: print_verbose( os.path.basename(__file__), 'info', 'Starting pass on %s' % region) as_conn = boto.ec2.autoscale.connect_to_region(region) as_groups = get_ssr_groups(as_conn) elb_conn = boto.ec2.elb.connect_to_region(as_conn.region.name) minutes_multiplier = 60 for as_group in as_groups: as_group = reload_as_group(as_group) print_verbose( os.path.basename(__file__), 'info', "Checking %s" % as_group.name) if as_group.load_balancers: maximize_elb_azs(elb_conn, as_group, dry_run) demand_expiration = get_tag_dict_value( as_group, 'ssr_config')['demand_expiration'] healthy_zones = get_healthy_zones( as_group, args.min_health_threshold) if demand_expiration is not False: if demand_expiration < int(time.time()): if len(healthy_zones) >= get_min_azs(as_group): print_verbose(os.path.basename( __file__), 'info', 'Woot! 
We can move back to spots at original bid price.') modify_as_group_azs( as_group, healthy_zones, dry_run) modify_price( as_group, get_tag_dict_value(as_group, 'ssr_config')['original_bid'], dry_run) set_tag_dict_value( as_group, 'ssr_config', 'demand_expiration', False) # kill all demand instances that were created ec2_conn = boto.ec2.connect_to_region( as_group.connection.region.name) all_ec2_instances = ec2_conn.get_all_instances() print_verbose(os.path.basename( __file__), 'info', "Looking at %s instances for potential termination" % str(len(as_group.instances))) for instance in as_group.instances: if not [i for i in all_ec2_instances if i.instances[0].id == instance.instance_id][0].instances[0].spot_instance_request_id and \ not dry_run: terminate_instance(instance) else: print_verbose(os.path.basename( __file__), 'info', 'Extending the life of demand instances as we cant fulfill with spots still') set_tag_dict_value(as_group, 'ssr_config', 'demand_expiration', int( time.time()) + (args.demand_expiration * minutes_multiplier)) elif sorted(as_group.availability_zones) != sorted(healthy_zones): as_group = reload_as_group(as_group) print_verbose( os.path.basename(__file__), 'info', "Healthy zones and zones in use dont match") if len(healthy_zones) >= get_min_azs(as_group): print_verbose( os.path.basename(__file__), 'info', 'Modifying zones accordingly.') modify_as_group_azs(as_group, healthy_zones, dry_run) else: print_verbose(os.path.basename( __file__), 'info', "Bid will need to be modified as we can't meet AZ minimum of %s" % str(get_min_azs(as_group))) best_bid = find_best_bid_price(as_group) print_verbose(os.path.basename( __file__), 'info', "Best possible bid given AZ minimum is %s" % str(best_bid)) if best_bid: modify_price(as_group, best_bid, dry_run) else: print_verbose( os.path.basename(__file__), 'info', "Moving to ondemand.") modify_price( as_group, None, dry_run, minutes_multiplier, args.demand_expiration) set_tag_dict_value(as_group, 'ssr_config', 
'demand_expiration', int( time.time()) + (args.demand_expiration * minutes_multiplier)) modify_as_group_azs( as_group, get_usable_zones(as_group), dry_run) else: print_verbose( os.path.basename(__file__), 'info', 'No further actions to take on this ASG.') print_verbose( os.path.basename(__file__), 'info', 'Done with pass on %s' % region) except EC2ResponseError as e: handle_exception(e) except Exception as e: handle_exception(e) return 1 print_verbose(os.path.basename(__file__), 'info', "All regions complete")
# main: per-region pass that cancels stale, unfulfilled spot requests and
# flags the availability zone they were stuck in.
#
# For every EC2 region whose name is not in args.excluded_regions:
# - spot instance requests are fetched with a 'status-code' filter listing
#   the statuses treated as bad;
# - requests with a security group whose name contains 'ElasticMapReduce'
#   are skipped;
# - a request is acted on only when its create_time (parsed with
#   "%Y-%m-%dT%H:%M:%S.000Z") is older than utcnow() minus args.minutes;
# - the matching launch configuration is the one from get_spot_lcs() with
#   the same price, instance type, instance profile name and image id; an
#   Exception is raised unless exactly one matches;
# - the bad AZ is the first character following the region name in
#   request.launch_group; {bad_az: 1} is passed to
#   update_az_health_list_tag() for every group using that launch config;
# - unless args.dry_run is set, the request is cancelled and the collected
#   tags are written with update_tags().
# An EC2ResponseError is reported through handle_exception(); any other
# exception is reported and the process exits with status 1.
# NOTE(review): the locals verbose/dry_run are assigned but the code reads
# args.dry_run directly.
# NOTE(review): nesting is reconstructed from flattened text - confirm
# whether cancel/update_tags run once per request or once per group.
def main(args): (verbose, dry_run) = dry_run_necessaries(args.dry_run, args.verbose) this_file = os.path.basename(__file__) for region in [r.name for r in boto.ec2.regions() if r.name not in args.excluded_regions]: try: ec2_conn = boto.ec2.connect_to_region(region) as_conn = boto.ec2.autoscale.connect_to_region(region) as_groups = get_all_as_groups(as_conn) all_spot_lcs = get_spot_lcs(as_conn) pending_requests = [] bad_statuses = json.loads('''{"status-code": [ "capacity-not-available", "capacity-oversubscribed", "price-too-low", "not-scheduled-yet", "launch-group-constraint", "az-group-constraint", "placement-group-constraint", "constraint-not-fulfillable" ]}''') pending_requests.append( ec2_conn.get_all_spot_instance_requests(filters=bad_statuses)) oldest_time = datetime.utcnow() - timedelta(minutes=args.minutes) # flattening the list of lists here pending_requests = [ item for sublist in pending_requests for item in sublist] health_tags = [] for request in pending_requests: if any('ElasticMapReduce' in sec_group.name for sec_group in request.launch_specification.groups): print_verbose( this_file, 'info', "This request belongs to the ElasticMapReduce group and will not be SSR managed.") continue if oldest_time > datetime.strptime(request.create_time, "%Y-%m-%dT%H:%M:%S.000Z"): print_verbose( this_file, 'info', "Bad request found. Identifying LC and associated ASGs to tag AZ health.") launch_configs = [lc for lc in all_spot_lcs if request.price == lc.spot_price and request.launch_specification.instance_type == lc.instance_type and request.launch_specification.instance_profile['name'] == lc.instance_profile_name and request.launch_specification.image_id == lc.image_id] # This could be made hella specific if we want to go that route if len(launch_configs) != 1: raise Exception( "Only one launch config should be found. 
You may need to run remove_old_launch_configs.py to clear this: %s" % launch_configs) else: launch_config = launch_configs[0] offending_as_groups = [ g for g in as_groups if g.launch_config_name == launch_config.name] bad_az = request.launch_group.split( request.region.name)[1][0] health_dict = {bad_az: 1} for as_group in offending_as_groups: print_verbose( this_file, 'info', "The following AZ will be tagged as an offender: %s." % str(as_group)) health_tags.append( update_az_health_list_tag(as_group, health_dict)) print_verbose( this_file, 'info', "Killing spot request %s." % str(request.id)) if not args.dry_run: request.cancel() update_tags(as_conn, health_tags) else: print_verbose(this_file, 'info', "PSYCH! Dry run.") else: print_verbose(this_file, 'info', "Request %s not older than %s minutes. Continuing..." % ( request.id, str(args.minutes))) print_verbose( this_file, 'info', "Region %s pass complete." % region) except EC2ResponseError as e: handle_exception(e) except Exception as e: handle_exception(e) sys.exit(1) print_verbose(this_file, 'info', "All regions complete")
# main: per-region pass that cancels stale, unfulfilled spot requests and
# flags the availability zone they were stuck in.
#
# For every EC2 region whose name is not in args.excluded_regions:
# - spot instance requests are fetched with a 'status-code' filter listing
#   the statuses treated as bad;
# - requests with a security group whose name contains 'ElasticMapReduce'
#   are skipped;
# - a request is acted on only when its create_time (parsed with
#   "%Y-%m-%dT%H:%M:%S.000Z") is older than utcnow() minus args.minutes;
# - the matching launch configuration is the one from get_spot_lcs() with
#   the same price, instance type, instance profile name and image id; an
#   Exception is raised unless exactly one matches;
# - the bad AZ is the first character following the region name in
#   request.launch_group; {bad_az: 1} is passed to
#   update_az_health_list_tag() for every group using that launch config;
# - unless args.dry_run is set, the request is cancelled and the collected
#   tags are written with update_tags().
# An EC2ResponseError is reported through handle_exception(); any other
# exception is reported and the process exits with status 1.
# NOTE(review): the locals verbose/dry_run are assigned but the code reads
# args.dry_run directly.
# NOTE(review): nesting is reconstructed from flattened text - confirm
# whether cancel/update_tags run once per request or once per group.
def main(args): (verbose, dry_run) = dry_run_necessaries(args.dry_run, args.verbose) this_file = os.path.basename(__file__) for region in [ r.name for r in boto.ec2.regions() if r.name not in args.excluded_regions ]: try: ec2_conn = boto.ec2.connect_to_region(region) as_conn = boto.ec2.autoscale.connect_to_region(region) as_groups = get_all_as_groups(as_conn) all_spot_lcs = get_spot_lcs(as_conn) pending_requests = [] bad_statuses = json.loads('''{"status-code": [ "capacity-not-available", "capacity-oversubscribed", "price-too-low", "not-scheduled-yet", "launch-group-constraint", "az-group-constraint", "placement-group-constraint", "constraint-not-fulfillable" ]}''') pending_requests.append( ec2_conn.get_all_spot_instance_requests(filters=bad_statuses)) oldest_time = datetime.utcnow() - timedelta(minutes=args.minutes) # flattening the list of lists here pending_requests = [ item for sublist in pending_requests for item in sublist ] health_tags = [] for request in pending_requests: if any('ElasticMapReduce' in sec_group.name for sec_group in request.launch_specification.groups): print_verbose( this_file, 'info', "This request belongs to the ElasticMapReduce group and will not be SSR managed." ) continue if oldest_time > datetime.strptime(request.create_time, "%Y-%m-%dT%H:%M:%S.000Z"): print_verbose( this_file, 'info', "Bad request found. Identifying LC and associated ASGs to tag AZ health." ) launch_configs = [ lc for lc in all_spot_lcs if request.price == lc.spot_price and request.launch_specification.instance_type == lc.instance_type and request.launch_specification. instance_profile['name'] == lc.instance_profile_name and request.launch_specification.image_id == lc.image_id ] # This could be made hella specific if we want to go that route if len(launch_configs) != 1: raise Exception( "Only one launch config should be found. 
You may need to run remove_old_launch_configs.py to clear this: %s" % launch_configs) else: launch_config = launch_configs[0] offending_as_groups = [ g for g in as_groups if g.launch_config_name == launch_config.name ] bad_az = request.launch_group.split( request.region.name)[1][0] health_dict = {bad_az: 1} for as_group in offending_as_groups: print_verbose( this_file, 'info', "The following AZ will be tagged as an offender: %s." % str(as_group)) health_tags.append( update_az_health_list_tag(as_group, health_dict)) print_verbose(this_file, 'info', "Killing spot request %s." % str(request.id)) if not args.dry_run: request.cancel() update_tags(as_conn, health_tags) else: print_verbose(this_file, 'info', "PSYCH! Dry run.") else: print_verbose( this_file, 'info', "Request %s not older than %s minutes. Continuing..." % (request.id, str(args.minutes))) print_verbose(this_file, 'info', "Region %s pass complete." % region) except EC2ResponseError as e: handle_exception(e) except Exception as e: handle_exception(e) sys.exit(1) print_verbose(this_file, 'info', "All regions complete")
# modify_price: clone the group's current launch configuration with
# spot_price=new_bid and move the matching autoscaling groups onto the clone.
#
# - The group is reloaded first and its current launch config fetched with
#   get_launch_config().
# - The new LC name is the old name minus its last 13 characters, followed by
#   'ssr' and 10 random uppercase letters/digits. All other LC fields are
#   copied from the old launch config.
# - Every group returned by get_all_groups() whose launch_config_name equals
#   the old LC name gets the new name; the 'LC_name' entry of its
#   'ssr_config' tag is set to the last 155 characters of the new LC name.
# - When new_bid is falsy, 'demand_expiration' is set to
#   now + demand_expiration * minutes_multiplier, so both optional arguments
#   must be supplied in that case (their None defaults cannot be multiplied).
# - The old launch config is deleted at the end.
# - Any exception goes to handle_exception() and the process exits with 1.
#
# NOTE(review): the loop variable reuses the name of the `as_group`
# parameter, so the parameter is shadowed from the loop onwards.
# NOTE(review): create_launch_configuration() and old_launch_config.delete()
# do not appear to be guarded by dry_run - confirm against the indented
# original, since the nesting is not recoverable from this flattened text.
def modify_price(as_group, new_bid, dry_run, minutes_multiplier=None, demand_expiration=None): try: as_group = reload_as_group(as_group) as_conn = boto.ec2.autoscale.connect_to_region( as_group.connection.region.name) old_launch_config = get_launch_config(as_group) new_launch_config_name = old_launch_config.name[:-13] + 'ssr' + ''.join( random.choice(string.ascii_uppercase + string.digits) for _ in range(10)) launch_config = LaunchConfiguration( image_id=old_launch_config.image_id, key_name=old_launch_config.key_name, security_groups=old_launch_config.security_groups, user_data=old_launch_config.user_data, instance_type=old_launch_config.instance_type, kernel_id=old_launch_config.kernel_id, ramdisk_id=old_launch_config.ramdisk_id, block_device_mappings=old_launch_config.block_device_mappings, instance_monitoring=old_launch_config.instance_monitoring.enabled, instance_profile_name=old_launch_config.instance_profile_name, ebs_optimized=old_launch_config.ebs_optimized, associate_public_ip_address=old_launch_config. associate_public_ip_address, volume_type=old_launch_config.volume_type, delete_on_termination=old_launch_config.delete_on_termination, iops=old_launch_config.iops, use_block_device_types=old_launch_config.use_block_device_types, spot_price=new_bid, # new values name=new_launch_config_name, ) as_conn.create_launch_configuration(launch_config) print_verbose( os.path.basename(__file__), 'info', "Created LC %s with price %s." 
% (launch_config.name, new_bid)) as_groups = [ a for a in as_group.connection.get_all_groups() if old_launch_config.name == a.launch_config_name ] for as_group in as_groups: as_group.launch_config_name = launch_config.name if not dry_run: print_verbose(os.path.basename(__file__), 'info', "Applying new LC to ASG %s" % as_group.name) as_group.update() set_tag_dict_value(as_group, 'ssr_config', 'LC_name', launch_config.name[-155:]) if not new_bid: set_tag_dict_value( as_group, 'ssr_config', 'demand_expiration', int(time.time()) + (demand_expiration * minutes_multiplier)) modify_as_group_azs(as_group, get_usable_zones(as_group), dry_run) print_verbose( os.path.basename(__file__), 'info', "Autoscaling group launch configuration update complete.") print_verbose(os.path.basename(__file__), 'info', "Deleting old launch_config: %s" % old_launch_config) old_launch_config.delete() # XXX is this actually working? except Exception as e: handle_exception(e) sys.exit(1)
# main: per-region pass that keeps SSR-managed autoscaling groups on spot
# instances where possible and falls back to on-demand otherwise.
#
# For each group from get_ssr_groups(): the group is reloaded, its ELBs (if
# any) are aligned with maximize_elb_azs(), 'demand_expiration' is read from
# the 'ssr_config' tag and the healthy zones are computed with
# get_healthy_zones(as_group, args.min_health_threshold). Then:
# - demand_expiration is set (is not False) and already in the past:
#     * enough healthy zones (>= get_min_azs): switch the group to the
#       healthy zones, restore the original bid with modify_price(), clear
#       demand_expiration, and terminate the group's instances that have no
#       spot_instance_request_id (termination is skipped on a dry run);
#     * otherwise push demand_expiration out by
#       args.demand_expiration * minutes_multiplier (60) seconds.
# - else, the group's zones differ from the healthy zones:
#     * enough healthy zones: switch the group to them;
#     * otherwise ask find_best_bid_price(); a truthy result becomes the new
#       bid, a falsy one moves the group to on-demand (modify_price with
#       None), sets demand_expiration and assigns the usable zones.
# - else: nothing to do.
# An EC2ResponseError is reported through handle_exception(); any other
# exception is reported and main returns 1.
# NOTE(review): branch nesting is reconstructed from flattened text -
# confirm against the indented original before relying on this summary.
def main(args): global verbose global dry_run (verbose, dry_run) = dry_run_necessaries(args.dry_run, args.verbose) for region in [ r.name for r in boto.ec2.regions() if r.name not in args.excluded_regions ]: try: print_verbose(os.path.basename(__file__), 'info', 'Starting pass on %s' % region) as_conn = boto.ec2.autoscale.connect_to_region(region) as_groups = get_ssr_groups(as_conn) elb_conn = boto.ec2.elb.connect_to_region(as_conn.region.name) minutes_multiplier = 60 for as_group in as_groups: as_group = reload_as_group(as_group) print_verbose(os.path.basename(__file__), 'info', "Checking %s" % as_group.name) if as_group.load_balancers: maximize_elb_azs(elb_conn, as_group, dry_run) demand_expiration = get_tag_dict_value( as_group, 'ssr_config')['demand_expiration'] healthy_zones = get_healthy_zones(as_group, args.min_health_threshold) if demand_expiration is not False: if demand_expiration < int(time.time()): if len(healthy_zones) >= get_min_azs(as_group): print_verbose( os.path.basename(__file__), 'info', 'Woot! We can move back to spots at original bid price.' 
) modify_as_group_azs(as_group, healthy_zones, dry_run) modify_price( as_group, get_tag_dict_value( as_group, 'ssr_config')['original_bid'], dry_run) set_tag_dict_value(as_group, 'ssr_config', 'demand_expiration', False) # kill all demand instances that were created ec2_conn = boto.ec2.connect_to_region( as_group.connection.region.name) all_ec2_instances = ec2_conn.get_all_instances() print_verbose( os.path.basename(__file__), 'info', "Looking at %s instances for potential termination" % str(len(as_group.instances))) for instance in as_group.instances: if not [i for i in all_ec2_instances if i.instances[0].id == instance.instance_id][0].instances[0].spot_instance_request_id and \ not dry_run: terminate_instance(instance) else: print_verbose( os.path.basename(__file__), 'info', 'Extending the life of demand instances as we cant fulfill with spots still' ) set_tag_dict_value( as_group, 'ssr_config', 'demand_expiration', int(time.time()) + (args.demand_expiration * minutes_multiplier)) elif sorted( as_group.availability_zones) != sorted(healthy_zones): as_group = reload_as_group(as_group) print_verbose(os.path.basename(__file__), 'info', "Healthy zones and zones in use dont match") if len(healthy_zones) >= get_min_azs(as_group): print_verbose(os.path.basename(__file__), 'info', 'Modifying zones accordingly.') modify_as_group_azs(as_group, healthy_zones, dry_run) else: print_verbose( os.path.basename(__file__), 'info', "Bid will need to be modified as we can't meet AZ minimum of %s" % str(get_min_azs(as_group))) best_bid = find_best_bid_price(as_group) print_verbose( os.path.basename(__file__), 'info', "Best possible bid given AZ minimum is %s" % str(best_bid)) if best_bid: modify_price(as_group, best_bid, dry_run) else: print_verbose(os.path.basename(__file__), 'info', "Moving to ondemand.") modify_price(as_group, None, dry_run, minutes_multiplier, args.demand_expiration) set_tag_dict_value( as_group, 'ssr_config', 'demand_expiration', int(time.time()) + 
(args.demand_expiration * minutes_multiplier)) modify_as_group_azs(as_group, get_usable_zones(as_group), dry_run) else: print_verbose(os.path.basename(__file__), 'info', 'No further actions to take on this ASG.') print_verbose(os.path.basename(__file__), 'info', 'Done with pass on %s' % region) except EC2ResponseError as e: handle_exception(e) except Exception as e: handle_exception(e) return 1 print_verbose(os.path.basename(__file__), 'info', "All regions complete")
def maximize_elb_azs(elb_conn, as_group, dry_run):
    """Align the AZs of every ELB attached to as_group with its usable zones.

    For each load balancer named in ``as_group.load_balancers`` the ELB's
    availability zones are compared with ``get_usable_zones(as_group)``.
    When they differ (and ``dry_run`` is false), zones the ELB has but the
    ASG cannot use are disabled and all usable zones are enabled.

    If the ELB API rejects the change with a ``ValidationError`` saying two
    zones are "constrained and cannot be used together", the first zone
    named in the error message is marked disabled on the ASG via
    ``mark_asg_az_disabled`` / ``update_tags``.

    Args:
        elb_conn: connection object providing ``get_all_load_balancers``.
        as_group: autoscaling group whose ``load_balancers`` are processed.
        dry_run: when true, only report the mismatch; change nothing.

    Any exception that escapes the per-ELB handling is passed to
    ``handle_exception`` and the process exits with status 1.
    """
    try:
        this_file = os.path.basename(__file__)
        for elb_name in as_group.load_balancers:
            elb = elb_conn.get_all_load_balancers(elb_name)[0]
            # Resolve the usable zones once per ELB. This is done inside the
            # loop because a conflict handled below updates the ASG's tags.
            usable_zones = get_usable_zones(as_group)
            if sorted(elb.availability_zones) == sorted(usable_zones):
                continue
            print_verbose(
                this_file, 'info',
                "AZs for ELB don't include all potential AZs. Removing unusable zones and adding the rest now."
            )
            if dry_run:
                continue

            in_lb_but_not_asg = list(
                set(elb.availability_zones) - set(usable_zones))
            in_asg_but_not_lb = list(
                set(usable_zones) - set(elb.availability_zones))
            if not in_lb_but_not_asg and not in_asg_but_not_lb:
                continue

            try:
                if in_lb_but_not_asg:
                    elb.disable_zones(in_lb_but_not_asg)
                elb.enable_zones(usable_zones)
            except Exception as e:
                if not hasattr(e, 'error_code'):
                    # Not a service error carrying an error code: re-raise so
                    # the outer handler reports the real failure instead of
                    # an AttributeError raised while inspecting it.
                    raise
                message = getattr(e, 'message', None) or ''
                if e.error_code == 'ValidationError' and \
                        'is constrained and cannot be used together with' in message:
                    print_verbose(
                        this_file, 'info',
                        'Conflict found between two AZs. Removing one of them from use.'
                    )
                    pattern = re.compile(
                        r'\b\w{2}-\w{4,}-\d\w and \w{2}-\w{4,}-\d\w\b')
                    match = pattern.search(message)
                    if match:
                        # First zone named in the message, e.g.
                        # "us-east-1a" -> "1a" -> "a".
                        bad_az = match.group().split()[0].split('-')[-1][1]
                        print_verbose(
                            this_file, 'info',
                            'Removing %s from potential AZs as it confilcts with another AZ.'
                            % bad_az)
                        # smarter here would be to figure out
                        # which AZ is a better choice
                        new_tag = mark_asg_az_disabled(as_group, bad_az)
                        update_tags(as_group.connection, [new_tag])
                else:
                    # Other service errors stay non-fatal, but are no longer
                    # dropped without a trace.
                    print_verbose(
                        this_file, 'info',
                        'Ignoring error while updating AZs for ELB %s: %s'
                        % (elb_name, e))
    except Exception as e:
        handle_exception(e)
        sys.exit(1)
def main(args):
    """Initialise and validate ssr tags on spot-backed autoscaling groups.

    For every EC2 region not listed in ``args.excluded_regions`` this
    collects the autoscaling groups whose launch configuration has a spot
    price, plus those whose ``ssr_config`` tag has ``enabled`` set to True,
    and then for each group:

    * (re)creates the ``ssr_config`` and ``AZ_status`` tags when
      ``args.reset_tags`` is set, the tag is missing, or its value cannot be
      read by ``get_tag_dict_value``;
    * does nothing when the tag says ssr is disabled;
    * otherwise verifies the expected keys and the recorded launch config
      name, deleting the ssr tags if the launch config no longer has a spot
      price, and re-initialising ``AZ_status`` if it lacks a key for any
      zone letter of the region.

    Args:
        args: parsed options; reads ``dry_run``, ``verbose``,
            ``excluded_regions``, ``reset_tags`` and ``min_healthy_AZs``.

    Returns:
        1 if an unexpected (non-boto) exception occurred, otherwise None.
        ``EC2ResponseError`` / ``BotoServerError`` are handed to
        ``handle_exception`` and processing moves on to the next region.
    """
    global verbose
    global dry_run
    (verbose, dry_run) = dry_run_necessaries(args.dry_run, args.verbose)
    this_file = os.path.basename(__file__)
    config_keys = [
        'enabled', 'original_bid', 'LC_name', 'min_AZs', 'demand_expiration'
    ]
    regions = [
        r.name for r in boto.ec2.regions()
        if r.name not in args.excluded_regions
    ]
    for region in regions:
        try:
            print_verbose(this_file, 'info', 'Starting pass on %s' % region)
            ec2_conn = boto.ec2.connect_to_region(region)
            as_conn = boto.ec2.autoscale.connect_to_region(region)
            all_groups = as_conn.get_all_groups()
            # Build the spot launch-config names once as a set, rather than
            # rebuilding a list for every group.
            spot_lc_names = set(
                lc.name for lc in as_conn.get_all_launch_configurations()
                if lc.spot_price)
            # these need to be pulled from the same all_groups list or
            # duplicate objects will be seen as distinct.
            spot_lc_groups = [
                g for g in all_groups if g.launch_config_name in spot_lc_names
            ]
            previously_ssr_managed_groups = []
            for group in all_groups:
                group_config = get_tag_dict_value(group, 'ssr_config')
                if group_config and group_config['enabled'] is True:
                    previously_ssr_managed_groups.append(group)
            all_groups = list(
                set(spot_lc_groups + previously_ssr_managed_groups))

            # Zone letters for this region; fetched lazily, at most once.
            zones = None
            for as_group in all_groups:
                print_verbose(this_file, 'info',
                              "Evaluating %s" % as_group.name)
                needs_init = args.reset_tags or not any(
                    t.key == 'ssr_config' for t in as_group.tags)
                ssr_config = None
                if not needs_init:
                    ssr_config = get_tag_dict_value(as_group, 'ssr_config')
                # this latter condition can happen when tag value (a dict)
                # can't be interpreted by ast.literal_eval()
                if needs_init or not ssr_config:
                    print_verbose(
                        this_file, 'info',
                        'Tags not found or reset tags option flagged. Adding all tags anew now.'
                    )
                    init_ssr_config_tag(as_group, args.min_healthy_AZs)
                    init_az_status_tag(as_group)
                    continue

                if not ssr_config['enabled']:
                    print_verbose(this_file, 'info',
                                  'ssr_config DISABLED. Doing nothing.')
                    continue

                print_verbose(
                    this_file, 'info',
                    'ssr management enabled. Verifying all config values in place.'
                )
                # The tag is re-read after verify_tag_dict_keys, as the
                # original code did, rather than reusing the cached value.
                if not verify_tag_dict_keys(as_group, 'ssr_config', config_keys) or \
                        not get_tag_dict_value(as_group, 'ssr_config')['LC_name'] == as_group.launch_config_name[-155:]:
                    # this would indicate a change to the LC outside of ssr
                    # scope. In that case, we need to disable ssr via tag
                    # deletion.
                    if not get_launch_config(as_group).spot_price:
                        del_ssr_tags(as_group)
                        continue
                    init_ssr_config_tag(as_group, args.min_healthy_AZs)
                if zones is None:
                    zones = [z.name[-1] for z in ec2_conn.get_all_zones()]
                if not verify_tag_dict_keys(as_group, 'AZ_status', zones):
                    init_az_status_tag(as_group)
            print_verbose(this_file, 'info', 'Done with pass on %s' % region)
        except (EC2ResponseError, BotoServerError) as e:
            # Service errors are reported; the remaining regions still run.
            handle_exception(e)
        except Exception as e:
            handle_exception(e)
            return 1
    print_verbose(this_file, 'info', "All regions complete")