def _retrieve_max_cluster_size(sqs_config, asg_name, fallback): try: _, _, max_size = get_asg_settings(sqs_config.region, sqs_config.proxy_config, asg_name) return max_size except Exception: return fallback
def _poll_scheduler_status(config, asg_name, scheduler_module, instance_properties):
    """
    Verify scheduler status and ask the ASG new nodes, if required.

    :param config: JobwatcherConfig object
    :param asg_name: ASG name
    :param scheduler_module: scheduler module
    :param instance_properties: instance properties
    """
    while True:
        # Nodes the scheduler still needs; a negative value is its error signal.
        needed = scheduler_module.get_required_nodes(instance_properties)
        if needed < 0:
            log.critical("Error detecting number of required nodes. The cluster will not scale up.")
        elif needed == 0:
            log.info("There are no pending jobs. Noop.")
        else:
            busy = scheduler_module.get_busy_nodes(instance_properties)
            log.info("%d nodes requested, %d nodes running", needed, busy)

            # Current desired capacity and upper bound from the ASG.
            _, desired, ceiling = get_asg_settings(config.region, config.proxy_config, asg_name, log)

            target = busy + needed
            if target <= desired:
                # The ASG already covers busy + needed nodes.
                log.info("%d nodes required, %d nodes in asg. Noop" % (target, desired))
            else:
                if target > ceiling:
                    log.info(
                        "The number of required nodes %d is greater than max %d. Requesting max %d."
                        % (target, ceiling, ceiling)
                    )
                else:
                    log.info(
                        "Setting desired to %d nodes, requesting %d more nodes from asg."
                        % (target, target - desired)
                    )
                # Never request beyond the ASG maximum.
                boto3.client(
                    'autoscaling', region_name=config.region, config=config.proxy_config
                ).update_auto_scaling_group(
                    AutoScalingGroupName=asg_name, DesiredCapacity=min(target, ceiling)
                )
        time.sleep(60)
def _poll_scheduler_status(config, asg_name, scheduler_module, instance_properties):
    """
    Verify scheduler status and ask the ASG new nodes, if required.

    :param config: JobwatcherConfig object
    :param asg_name: ASG name
    :param scheduler_module: scheduler module
    :param instance_properties: instance properties
    """
    # Endless polling loop: one scaling evaluation per minute.
    while True:
        # Get number of nodes requested
        pending = scheduler_module.get_required_nodes(instance_properties)
        if pending < 0:
            # Negative value is the scheduler module's error signal.
            log.critical("Error detecting number of required nodes. The cluster will not scale up.")
        elif pending == 0:
            log.info("There are no pending jobs. Noop.")
        else:
            # Get current number of nodes
            running = scheduler_module.get_busy_nodes(instance_properties)
            log.info("%d nodes requested, %d nodes running", pending, running)
            # get current limits
            _, current_desired, max_size = get_asg_settings(config.region, config.proxy_config, asg_name, log)
            # Check to make sure requested number of instances is within ASG limits
            required = running + pending
            if required <= current_desired:
                # ASG already sized to cover busy + pending nodes; nothing to do.
                log.info("%d nodes required, %d nodes in asg. Noop" % (required, current_desired))
            else:
                if required > max_size:
                    log.info(
                        "The number of required nodes %d is greater than max %d. Requesting max %d."
                        % (required, max_size, max_size)
                    )
                else:
                    log.info(
                        "Setting desired to %d nodes, requesting %d more nodes from asg."
                        % (required, required - current_desired)
                    )
                # Cap the request at the ASG max size.
                requested = min(required, max_size)
                # update ASG
                asg_client = boto3.client('autoscaling', region_name=config.region, config=config.proxy_config)
                asg_client.update_auto_scaling_group(AutoScalingGroupName=asg_name, DesiredCapacity=requested)
        time.sleep(60)
def _poll_scheduler_status(config, asg_name, scheduler_module):
    """
    Verify scheduler status and ask the ASG new nodes, if required.

    :param config: JobwatcherConfig object
    :param asg_name: ASG name
    :param scheduler_module: scheduler module
    """
    instance_type = None
    instance_properties = None
    update_instance_properties_timer = 0
    while True:
        # Get instance properties, refreshing them periodically so that a
        # changed compute instance type is eventually picked up.
        if not instance_properties or update_instance_properties_timer >= UPDATE_INSTANCE_PROPERTIES_INTERVAL:
            # Fixed: use the module logger `log` (as everywhere else in this
            # function) instead of the root logger via `logging.info`.
            log.info("Refreshing compute instance properties")
            update_instance_properties_timer = 0
            new_instance_type = get_compute_instance_type(
                config.region, config.proxy_config, config.stack_name, fallback=instance_type)
            if new_instance_type != instance_type:
                instance_type = new_instance_type
                instance_properties = get_instance_properties(config.region, config.proxy_config, instance_type)
        update_instance_properties_timer += LOOP_TIME

        # get current limits
        _, current_desired, max_size = get_asg_settings(config.region, config.proxy_config, asg_name)

        # Get number of nodes requested; negative means detection failed.
        pending = scheduler_module.get_required_nodes(instance_properties, max_size)
        if pending < 0:
            log.critical("Error detecting number of required nodes. The cluster will not scale up.")
        elif pending == 0:
            log.info("There are no pending jobs or the requirements on pending jobs cannot be satisfied. Noop.")
        else:
            # Get current number of nodes
            running = scheduler_module.get_busy_nodes()
            log.info("%d nodes requested, %d nodes busy or unavailable", pending, running)

            # Check to make sure requested number of instances is within ASG limits
            required = running + pending
            if required <= current_desired:
                log.info("%d nodes required, %d nodes in asg. Noop" % (required, current_desired))
            else:
                if required > max_size:
                    log.info(
                        "The number of required nodes %d is greater than max %d. Requesting max %d."
                        % (required, max_size, max_size))
                else:
                    log.info(
                        "Setting desired to %d nodes, requesting %d more nodes from asg."
                        % (required, required - current_desired))
                # Cap the request at the ASG max size before updating it.
                requested = min(required, max_size)
                # update ASG
                asg_client = boto3.client("autoscaling", region_name=config.region, config=config.proxy_config)
                asg_client.update_auto_scaling_group(AutoScalingGroupName=asg_name, DesiredCapacity=requested)
        time.sleep(LOOP_TIME)
def _poll_instance_status(config, scheduler_module, asg_name, hostname, instance_id, instance_type):
    """
    Verify instance/scheduler status and self-terminate the instance.

    The instance will be terminated if not required and exceeded the configured scaledown_idletime.
    :param config: NodewatcherConfig object
    :param scheduler_module: scheduler module
    :param asg_name: ASG name
    :param hostname: current hostname
    :param instance_id: current instance id
    :param instance_type: current instance type
    """
    _wait_for_stack_ready(config.stack_name, config.region, config.proxy_config)
    _terminate_if_down(scheduler_module, config, asg_name, instance_id, INITIAL_TERMINATE_TIMEOUT)
    idletime = _init_idletime()
    instance_properties = get_instance_properties(config.region, config.proxy_config, instance_type)
    start_time = None
    while True:
        # Sleep only for the remainder of the loop interval so iterations
        # stay roughly LOOP_TIME apart regardless of the work done.
        sleep_remaining_loop_time(LOOP_TIME, start_time)
        start_time = datetime.now()

        max_cluster_size = _refresh_cluster_properties(config.region, config.proxy_config, asg_name)

        _store_idletime(idletime)
        _terminate_if_down(scheduler_module, config, asg_name, instance_id, TERMINATE_TIMEOUT)
        has_jobs = _has_jobs(scheduler_module, hostname)
        if has_jobs:
            log.info("Instance has active jobs.")
            idletime = 0
        else:
            has_pending_jobs, error = scheduler_module.has_pending_jobs(instance_properties, max_cluster_size)
            if error:
                # In case of failure _terminate_if_down will take care of removing the node
                log.warning("Encountered an error while polling queue for pending jobs. Considering node as busy")
                continue
            elif has_pending_jobs:
                log.info("Queue has pending jobs. Not terminating instance")
                idletime = 0
                continue

            try:
                min_size, desired_capacity, max_size = get_asg_settings(config.region, config.proxy_config, asg_name)
            except Exception as e:
                # Fixed: use the module logger `log` (as everywhere else in
                # this function) instead of the root logger via `logging`.
                log.error("Failed when retrieving ASG settings with exception %s", e)
                continue

            if desired_capacity <= min_size:
                # Shrinking below min size is never allowed; stay alive.
                log.info("Not terminating due to min cluster size reached")
                idletime = 0
            else:
                idletime += 1
                log.info("Instance had no job for the past %s minute(s)", idletime)
                if idletime >= config.scaledown_idletime:
                    _lock_and_terminate(config.region, config.proxy_config, scheduler_module, hostname, instance_id)
                    # _lock_and_terminate exits if termination is successful
                    # set idletime to 0 if termination is aborted
                    idletime = 0
def _poll_instance_status(config, scheduler_module, asg_name, hostname, instance_id, instance_type):
    """
    Verify instance/scheduler status and self-terminate the instance.

    The instance will be terminated if not required and exceeded the configured scaledown_idletime.
    :param config: NodewatcherConfig object
    :param scheduler_module: scheduler module
    :param asg_name: ASG name
    :param hostname: current hostname
    :param instance_id: current instance id
    :param instance_type: current instance type
    """
    _wait_for_stack_ready(config.stack_name, config.region, config.proxy_config)
    _terminate_if_down(scheduler_module, config, asg_name, instance_id, INITIAL_TERMINATE_TIMEOUT)
    idletime = _init_idletime()
    instance_properties = get_instance_properties(config.region, config.proxy_config, instance_type)
    # Endless polling loop: one self-termination evaluation per minute.
    while True:
        time.sleep(60)
        # Persist idletime so it survives a daemon restart (see _init_idletime).
        _store_idletime(idletime)
        _terminate_if_down(scheduler_module, config, asg_name, instance_id, TERMINATE_TIMEOUT)
        has_jobs = _has_jobs(scheduler_module, hostname)
        if has_jobs:
            log.info("Instance has active jobs.")
            idletime = 0
        else:
            asg_conn = boto3.client("autoscaling", region_name=config.region, config=config.proxy_config)
            if _maintain_size(asg_name, asg_conn):
                # Terminating would shrink the ASG below its min size.
                log.info("Not terminating due to min cluster size reached")
                idletime = 0
            else:
                _, _, max_size = get_asg_settings(config.region, config.proxy_config, asg_name)
                has_pending_jobs, error = scheduler_module.hasPendingJobs(instance_properties, max_size)
                if error:
                    # Best-effort check: on error fall through and keep counting idletime.
                    log.warning("Encountered an error while polling queue for pending jobs. Skipping pending jobs check")
                elif has_pending_jobs:
                    log.info("Queue has pending jobs. Not terminating instance")
                    idletime = 0
                    continue
                idletime += 1
                log.info("Instance had no job for the past %s minute(s)", idletime)
                if idletime >= config.scaledown_idletime:
                    # Lock the host in the scheduler so no new job lands while
                    # we decide whether to terminate, then re-check for jobs
                    # that may have started in the meantime.
                    _lock_host(scheduler_module, hostname)
                    has_jobs = _has_jobs(scheduler_module, hostname)
                    if has_jobs:
                        log.info("Instance has active jobs.")
                        idletime = 0
                        # Abort: unlock and go back to polling.
                        _lock_host(scheduler_module, hostname, unlock=True)
                        continue
                    # Re-check min size right before terminating, since the ASG
                    # may have shrunk while this node was idle.
                    if _maintain_size(asg_name, asg_conn):
                        log.info("Not terminating due to min cluster size reached")
                        idletime = 0
                    else:
                        _self_terminate(asg_conn, instance_id)
                    # Reached only if termination did not happen; unlock the host.
                    _lock_host(scheduler_module, hostname, unlock=True)