def api_add_job(self, args):
    """Create a ProductionJob from an API request and register it.

    Required keys in args: dns_basename, task_configuration,
    deployment_layout, deployment_recipe, recipe_options, persistent.
    Optional: linked_job.  Returns a human-readable status string.
    """
    required_fields = ['dns_basename',
                       'task_configuration',
                       'deployment_layout',
                       'deployment_recipe',
                       'recipe_options',
                       'persistent']
    error = self._api_check(args, required_fields)
    if error:
        return error

    new_job = ProductionJob(
        self,
        args['dns_basename'],
        args['task_configuration'],
        args['deployment_layout'],
        args['deployment_recipe'],
        args['recipe_options'],
        args['persistent'],
        args.get('linked_job'))

    # A linked job must resolve to an existing job before we accept it.
    if args.get('linked_job'):
        new_job.find_linked_job()
        if not new_job.linked_job_object:
            return "Couldn't find linked job!"

    if not self.state.add_job(new_job):
        return "Error adding job, see logs"

    ClusterEventManager.handle(
        "Added a job: %s" % new_job.get_name())
    return "Job Added"
def api_update_logging_level(self, args):
    """Apply the requested logging level to every registered logger.

    The 'level' arg defaults to 20 (logging.INFO) when absent.
    """
    new_level = args.get('level', 20)
    for log in self.state.loggers:
        log.setLevel(new_level)
    ClusterEventManager.handle(
        "Updated logging level to %s" % new_level)
    return "Level set to %s" % new_level
def ensure_on_linked_job(self, state, sitter):
    """
    1. Ensure the linked job exists, if not bail out
    2. Ensure that this job is running on each machine
    that the linked job is on.  If not, create a job filler
    for those machines and this job.

    Note: As a linked job we should never create a job filler
    that spawns new machines.  We should always just be
    populating existing machines.

    Returns True in all cases (even on a missing linked job) so that
    the caller's per-cycle job loop is never halted by this job.
    """
    linked_job = self.find_linked_job()
    if not linked_job:
        logger.warn(
            "Couldn't find linked job (%s) for %s!" % (
                self.linked_job, str(self)))
        # Returning False stops all other jobs this cycle, which
        # we don't want to do.
        return True

    # Map of zone -> machines currently assigned to jobs; we only ever
    # fill onto machines the linked job already occupies.
    job_fill_machines = state.get_job_machines()
    for zone in linked_job.get_shared_fate_zones():
        # Collect machines in this zone that are NOT yet running a task
        # with this job's name.
        machines_to_fill = []
        machines = job_fill_machines.get(zone, [])
        for machine in machines:
            task_names = [
                task['name'] for task in machine.get_running_tasks()]
            if not self.name in task_names:
                machines_to_fill.append(machine)

        # Count work still pending in our own active fillers so we
        # don't double-fill the same machines.
        current_fillers = self.fillers[zone]
        currently_spawning = 0
        for filler in current_fillers:
            currently_spawning += filler.num_remaining()

        # Also check the linked job for active job fillers
        # we don't want to start a filler here if the linked job
        # is also actively filling, it should be sequential.
        current_fillers = linked_job.fillers[zone]
        for filler in current_fillers:
            currently_spawning += filler.num_remaining()

        # Only start a new filler when nothing is in flight anywhere
        # and there is at least one machine missing this job.
        if not currently_spawning and len(machines_to_fill) > 0:
            ClusterEventManager.handle(
                "New JobFiller for Linked Job: %s, %s, %s, %s" % (
                    machines_to_fill, zone, str(self), self.linked_job))
            # Filler targets exactly the existing machines; it never
            # spawns new ones (see docstring note).
            filler = JobFiller(len(machines_to_fill), self,
                               zone, machines_to_fill)
            filler.start()
            self.fillers[zone].append(filler)

    return True
def run(self):
    """Drive the job-filling state machine to completion.

    Steps through the numeric filler states until the terminal state
    (8) is reached, retrying failed steps up to 10 times unless
    fail_on_error is set.  On completion, records the end time and
    removes this filler from its job's per-zone filler list.

    Returns:
        True on success, False when filling was abandoned.
    """
    logger.info("Starting JobFiller")
    release_attempts = 1
    # State codes: 0=create resources, 1=ensure DNS, 2=deploy
    # monitoring code, 3=deploy job code, 4=launch tasks,
    # 5=add to monitoring, 6=ensure basename DNS,
    # 7=reboot dependent jobs, 8=done.
    while self.state.get_state() != 8:
        state = self.state.get_state()
        logger.info(
            "Running State: %s, attempt #%s" % (
                str(self.state), release_attempts))
        try:
            if state == 0:
                self.run_create_resources()
            elif state == 1:
                self.ensure_dns()
            elif state == 2:
                self.deploy_monitoring_code()
            elif state == 3:
                self.deploy_job_code()
            elif state == 4:
                self.launch_tasks()
            elif state == 5:
                self.add_to_monitoring()
            elif state == 6:
                self.ensure_dns(do_basename=True)
            elif state == 7:
                self.reboot_dependent_jobs()
        except Exception:
            # Fix: was a bare `except:` that also swallowed
            # SystemExit/KeyboardInterrupt, and it emitted the
            # traceback twice (print_exc to stderr plus
            # logger.error).  logger.exception logs it once.
            release_attempts += 1
            logger.exception(
                "Job Filler: exception in state %s" % state)
            if release_attempts > 10 or self.fail_on_error:
                logger.info("Job Filler: Failed")
                ClusterEventManager.handle(
                    "Failed Filling: %s" % str(self))
                if self.post_callback:
                    # NOTE(review): this passes `self` but the success
                    # path below calls post_callback(success=True)
                    # without it -- confirm the callback's expected
                    # signature; one of the two looks wrong.
                    self.post_callback(self, success=False)
                return False

    ClusterEventManager.handle(
        "Completed Filling: %s" % str(self))
    logger.info("Job Filler: Done!")
    self.end_time = datetime.now()
    # Detach this finished filler from its job's zone list, if present.
    if self in self.job.fillers.get(self.zone, []):
        self.job.fillers[self.zone].remove(self)
    if self.post_callback:
        self.post_callback(success=True)
    return True
def api_remove_job(self, args):
    """Remove the job(s) matching args['name'].

    Returns a status string listing what was removed, or
    "Job Not Found" when nothing matched.
    """
    check = self._api_check(args, ['name'])
    if check:
        return check

    jobs = self.state.remove_job(args['name'])
    if jobs:
        removed = ', '.join(jobs)
        # Fix: the cluster event used to fire unconditionally, logging
        # "Removed jobs: " even when nothing matched; only emit it when
        # a removal actually happened.
        ClusterEventManager.handle(
            "Removed jobs: %s" % removed)
        return "Removed: %s" % removed

    return "Job Not Found"
def api_update_job(self, args):
    """Kick off an update of an existing job, identified by 'job_name'."""
    missing = self._api_check(args, ['job_name'])
    if missing:
        return missing

    name = args['job_name']
    if not self.state.update_job(name):
        return "Error updating job: %s doesn't exist" % name

    ClusterEventManager.handle(
        'Update %s started' % name)
    return "Job update initiated"
def api_enforce_idle(self, args):
    """Set the global cap on idle machines per zone.

    Really naive right now, a global # of max idle per zone.
    Could do a lot more here.
    """
    check = self._api_check(args, ['idle_count_per_zone'])
    if check:
        return check

    try:
        limit = int(args['idle_count_per_zone'])
    except (TypeError, ValueError):
        # Fix: narrowed from a bare `except:`; only a failed integer
        # conversion means the caller supplied a bad limit.  Also
        # convert once instead of twice.
        return "Invalid limit"

    self.state.max_idle_per_zone = limit
    ClusterEventManager.handle(
        "Enforce Idle Limit at %s" % limit)
    return "Limit set"
def decomission_machine(self, machine):
    """Tear a machine down: drop it from state, decomission it via its
    zone provider, and scrub every DNS record pointing at its IP."""
    self.state.remove_machine(machine)

    zone = machine.config.shared_fate_zone
    provider = self.state.get_zone_provider(zone)
    if not provider:
        logger.warn(
            "No provider found for %s?" % zone)
        return

    ClusterEventManager.handle(
        "Decomissioning %s" % str(machine))

    if not provider.decomission(machine):
        # If we can't decomission it then perhaps its locked
        # and we should leave well enough alone at this point,
        # just remove it from monitoring etc.
        ClusterEventManager.handle(
            "Provider doesn't allow decomissioning of %s" % str(machine))
        return

    ip = machine.config.ip
    hostname = machine.config.dns_name
    if hostname:
        self.dns_provider.remove_record(data=ip, hostName=hostname)
        # Strip off the leading number, e.g.
        # 12.bar.mydomain.com -> bar.mydomain.com
        root_name = '.'.join(hostname.split('.')[1:])
        self.dns_provider.remove_record(data=ip, hostName=root_name)

    # Now look for other dangling records pointing to this machine
    # and delete those too.
    for record in self.dns_provider.get_records():
        if record['value'] == ip:
            logger.info(
                "Removing %s from %s" % (ip, record['record']))
            self.dns_provider.remove_record(data=ip,
                                            hostName=record['record'])

    ClusterEventManager.handle(
        "Decomissioning of %s complete!" % str(machine))
def refill(self, state, sitter):
    """Ensure this job is fully staffed in every shared fate zone.

    Waits for machine monitors and the fill calculator to settle,
    prunes fillers that finished more than five minutes ago, then
    either piggybacks on a linked job's machines or starts a JobFiller
    for each zone with a shortfall.

    Returns:
        False when the refill must wait for the next calculation pass;
        True otherwise (or whatever ensure_on_linked_job returns for
        linked jobs).
    """
    self.sitter = sitter
    new_machines = False
    while self.sitter.machines_in_queue():
        new_machines = True
        # We want to ensure any machines recently added to monitoring
        # have had a chance to load their data, incase they are
        # running this job
        logger.info("Waiting for machine monitors to load machine data "
                    "before filling jobs")
        time.sleep(0.5)

    if new_machines:
        # If we had to wait for new machines that means that
        # there are new machines, and we need to recalculate
        # job fill before it is safe to do refill.  The next
        # pass should be OK.
        logger.info("Waiting for next jobfill to be calculated before "
                    "doing a refill")
        return False

    while self.name not in state.job_fill:
        # 1) Assume this job has already been added to state.jobs
        # 2) Want to ensure calculator has run at least once to find out
        # if this job already exists throughout the cluster
        logger.info(
            "Waiting for calculator thread to kick in before "
            "filling jobs")
        time.sleep(0.5)

    # Clear out finished fillers after 5 minutes
    now = datetime.now()
    for zone, fillers in self.fillers.items():
        # Fix: iterate over a copy -- the original removed elements
        # from the same list it was walking, which silently skips the
        # element that follows each removal.
        for filler in list(fillers):
            if (filler.is_done() and
                    now - filler.end_time > timedelta(minutes=5)):
                logger.info(
                    "Removing a filler from %s for %s" % (
                        zone, self.name))
                fillers.remove(filler)

    # If we have a linked job then bypass all the normal logic
    # and just piggyback on those machines
    if self.linked_job:
        return self.ensure_on_linked_job(state, sitter)

    #!MACHINEASSUMPTION!
    # Step 1: Ensure we have enough machines in each SFZ
    # Step 1a: Check for idle machines and reserve as we find them
    for zone in self.get_shared_fate_zones():
        idle_available = state.get_idle_machines(zone)
        total_required = self.get_num_required_machines_in_zone(zone)
        idle_required = total_required - state.job_fill[self.name][zone]

        # Machines already being brought up by in-flight fillers count
        # against what we still need.
        currently_spawning = 0
        for filler in self.fillers[zone]:
            currently_spawning += filler.num_remaining()
        self.currently_spawning[zone] = currently_spawning
        idle_required -= currently_spawning

        # !MACHINEASSUMPTION! Ideally we're counting resources here
        # not machines
        required_new_machine_count = max(
            (idle_required - len(idle_available)), 0)

        # Only log at INFO when there is actually something to do.
        do_log = logger.debug
        if idle_required > 0:
            do_log = logger.info

        do_log(
            ("Calculated job requirements for %s in %s: "
             % (self.name, zone)) +
            "Currently Active: %s " % (state.job_fill[self.name][zone]) +
            "Idle Required: %s, Total New: %s " % (
                idle_required, required_new_machine_count) +
            "Currently Spawning: %s " % (currently_spawning) +
            "idle-available: %s " % (len(idle_available)) +
            "total_required: %s " % (total_required)
        )

        usable_machines = []
        if required_new_machine_count <= 0:
            # idle_available > idle_required, so use just as many
            # as we need
            usable_machines = idle_available[:idle_required]
        else:
            # Otherwise take all the available idle ones, and
            # we'll make more
            usable_machines.extend(idle_available)

        if idle_required > 0:
            ClusterEventManager.handle(
                "New JobFiller: %s, %s, %s, %s" % (
                    idle_required, zone, str(self), usable_machines))
            filler = JobFiller(idle_required, self, zone,
                               usable_machines)
            self.fillers[zone].append(filler)
            filler.start()

    return True
def get_live_data(self):
    """Assemble a snapshot of cluster state for the live-status view.

    Returns a dict containing recent events, machine/monitor/job
    details, system load averages, and the liveness of the expected
    worker threads.  Most values are pre-stringified for direct
    display/serialization.
    """
    data = {}
    data['events'] = ClusterEventManager.get_events()

    state = self.harness.state
    # job_fill: job -> zone -> count; job_machine_fill: job -> zone -> machines
    job_fill, job_machine_fill = state.current_jobs.get_job_fill()
    # NOTE(review): idle_machines is populated here but never used
    # below (data['idle_machines'] re-queries state instead) -- looks
    # like dead code; confirm before removing.
    idle_machines = []
    zoned_idle_machines = state.get_machines(idle=True)
    for zone, machines in zoned_idle_machines.iteritems():
        idle_machines.extend(machines)

    data['providers'] = state.get_providers().keys()
    data['machines_by_zone'] = str(state.get_machines())
    data['job_fill'] = str(job_fill)
    data['idle_machines'] = str(state.get_machines(idle=True))
    data['unreachable_machines'] = [
        str(m) for m in state.get_machines(unreachable=True)]

    # Flatten per-monitor stats and per-machine serializations.
    monitors = []
    machines = []
    for monitor, thread in state.monitors:
        monitor_data = {}
        monitor_data['monitored_machines'] = [
            repr(m) for m in monitor.monitored_machines]
        monitor_data['add_queue'] = [repr(m) for m in monitor.add_queue]
        monitor_data['pull_failures'] = dict([
            (str(k), v) for k, v in monitor.pull_failures.iteritems()])
        monitor_data['failure_threshold'] = monitor.failure_threshold
        monitor_data['number'] = monitor.number
        monitors.append(monitor_data)

        # Snapshot pull_failures so per-machine lookups below see a
        # stable copy keyed by the machine objects themselves.
        pull_failures = dict(monitor.pull_failures)
        for machine in monitor.monitored_machines:
            machine_data = machine.serialize()
            machine_data['pull_failures'] = pull_failures.get(machine, 0)
            # A machine is idle iff it appears in the idle list for its
            # own shared fate zone.
            machine_data['idle'] = machine in (zoned_idle_machines.get(
                machine_data['config']['shared_fate_zone'], []))
            machines.append(machine_data)

    data['machines'] = machines
    data['monitors'] = monitors

    # Serialize every regular and repair job, including its fillers.
    jobs = []
    check_jobs = state.jobs.values() + state.repair_jobs.values()
    for job in check_jobs:
        job_data = {}
        job_data['name'] = job.name
        job_data['dns_basename'] = job.dns_basename
        job_data['task_configuration'] = job.task_configuration
        job_data['deployment_layout'] = job.deployment_layout
        job_data['deployment_recipe'] = job.deployment_recipe
        job_data['recipe_options'] = job.recipe_options
        job_data['linked_job'] = job.linked_job
        fillers = []
        for filler_list in job.fillers.values():
            for filler in filler_list:
                filler_data = {}
                filler_data['zone'] = filler.zone
                filler_data['num_cores'] = filler.num_cores
                filler_data['machine_states'] = [
                    (m.hostname, str(m.state)) for m in filler.machines]
                filler_data['state'] = str(filler.state)
                fillers.append(filler_data)
        job_data['fillers'] = fillers
        job_data['fill'] = job_fill.get(job.name, {})
        # Stringify machines in-place in the per-zone fill mapping.
        fill_machines = job_machine_fill.get(job.name, {})
        for zone in fill_machines.keys():
            fill_machines[zone] = [str(m) for m in fill_machines[zone]]
        job_data['fill_machines'] = fill_machines
        job_data['spawning'] = job.currently_spawning
        jobs.append(job_data)

    data['jobs'] = jobs

    # 1/5/15-minute system load averages.
    load = os.getloadavg()
    data['load_one_min'] = load[0]
    data['load_five_min'] = load[1]
    data['load_fifteen_min'] = load[2]

    py_threads = threading.enumerate()
    alive_thread_names = [t.getName() for t in py_threads]

    # Expected threads: the fixed set plus one monitor per worker.
    threads = {}
    std_threads = ['MainThread', 'Calculator', 'HTTPServer']
    for i in range(self.harness.worker_thread_count):
        std_threads.append("Monitoring-%s" % i)

    # Mark each expected thread alive/dead; whatever names remain are
    # unexpected extras reported separately.
    for name in std_threads:
        if name in alive_thread_names:
            threads[name] = True
            alive_thread_names.remove(name)
        else:
            threads[name] = False

    other_threads = {}
    for name in alive_thread_names:
        other_threads[name] = True

    data['std_threads'] = std_threads
    data['threads'] = threads
    data['other_threads'] = other_threads

    return data