示例#1
0
    def api_add_job(self, args):
        """Create a ProductionJob from API arguments and register it.

        All job-definition fields must be present in ``args``; returns a
        human-readable status string for the API caller.
        """
        required_fields = [
            'dns_basename',
            'task_configuration',
            'deployment_layout',
            'deployment_recipe',
            'recipe_options',
            'persistent',
        ]
        error = self._api_check(args, required_fields)
        if error:
            return error

        new_job = ProductionJob(
            self,
            args['dns_basename'],
            args['task_configuration'],
            args['deployment_layout'],
            args['deployment_recipe'],
            args['recipe_options'],
            args['persistent'],
            args.get('linked_job'))

        # A linked job must resolve to an existing job before we accept it.
        if args.get('linked_job'):
            new_job.find_linked_job()
            if not new_job.linked_job_object:
                return "Couldn't find linked job!"

        if not self.state.add_job(new_job):
            return "Error adding job, see logs"

        ClusterEventManager.handle(
            "Added a job: %s" % new_job.get_name())

        return "Job Added"
示例#2
0
    def api_update_logging_level(self, args):
        """Apply a new logging level to every logger tracked in state.

        ``args['level']`` defaults to 20 (logging.INFO) when absent.
        """
        new_level = args.get('level', 20)  # 20 == logging.INFO

        for state_logger in self.state.loggers:
            state_logger.setLevel(new_level)

        ClusterEventManager.handle(
            "Updated logging level to %s" % new_level)

        return "Level set to %s" % new_level
示例#3
0
    def ensure_on_linked_job(self, state, sitter):
        """
        1. Ensure the linked job exists, if not bail out
        2. Ensure that this job is running on each machine
        that the linked job is on.  If not, create a job filler for
        those machines and this job.
        Note: As a linked job we should never create a job filler
        that spawns new machines.  We should always just be populating
        existing machines.
        """
        linked_job = self.find_linked_job()

        if not linked_job:
            # logger.warning is the non-deprecated spelling of warn
            logger.warning(
                "Couldn't find linked job (%s) for %s!" % (
                self.linked_job, str(self)))
            # Returning False stops all other jobs this cycle, which
            # we don't want to do.
            return True

        job_fill_machines = state.get_job_machines()
        for zone in linked_job.get_shared_fate_zones():
            machines_to_fill = []
            machines = job_fill_machines.get(zone, [])

            # Machines that run the linked job but not yet this one.
            for machine in machines:
                task_names = [
                    task['name'] for task in machine.get_running_tasks()]

                if self.name not in task_names:
                    machines_to_fill.append(machine)

            currently_spawning = 0
            for filler in self.fillers[zone]:
                currently_spawning += filler.num_remaining()

            # Also check the linked job for active job fillers
            # we don't want to start a filler here if the linked job
            # is also actively filling, it should be sequential.
            for filler in linked_job.fillers[zone]:
                currently_spawning += filler.num_remaining()

            if not currently_spawning and len(machines_to_fill) > 0:
                ClusterEventManager.handle(
                    "New JobFiller for Linked Job: %s, %s, %s, %s" % (
                        machines_to_fill, zone, str(self), self.linked_job))

                filler = JobFiller(len(machines_to_fill), self,
                                   zone, machines_to_fill)
                filler.start()
                self.fillers[zone].append(filler)

        return True
示例#4
0
    def run(self):
        """Drive the filler state machine until completion (state 8) or
        until a step has failed too many times.

        Returns True on success, False when filling was abandoned after
        repeated failures (or immediately when fail_on_error is set).
        """
        logger.info("Starting JobFiller")
        release_attempts = 1
        # State 8 is the terminal "done" state.
        while self.state.get_state() != 8:
            state = self.state.get_state()
            logger.info(
                "Running State: %s, attempt #%s" % (
                str(self.state), release_attempts))

            try:
                if state == 0:
                    self.run_create_resources()
                elif state == 1:
                    self.ensure_dns()
                elif state == 2:
                    self.deploy_monitoring_code()
                elif state == 3:
                    self.deploy_job_code()
                elif state == 4:
                    self.launch_tasks()
                elif state == 5:
                    self.add_to_monitoring()
                elif state == 6:
                    self.ensure_dns(do_basename=True)
                elif state == 7:
                    self.reboot_dependent_jobs()
            except Exception:
                # Narrowed from a bare "except:" which would also swallow
                # KeyboardInterrupt/SystemExit.  Log once via the logger
                # (the original also printed the traceback to stderr,
                # duplicating the same output).
                release_attempts += 1
                import traceback
                logger.error(traceback.format_exc())

                if release_attempts > 10 or self.fail_on_error:
                    logger.info("Job Filler: Failed")
                    ClusterEventManager.handle(
                        "Failed Filling: %s" % str(self))

                    if self.post_callback:
                        self.post_callback(self, success=False)

                    return False

        ClusterEventManager.handle(
            "Completed Filling: %s" % str(self))
        logger.info("Job Filler: Done!")
        self.end_time = datetime.now()

        if self in self.job.fillers.get(self.zone, []):
            self.job.fillers[self.zone].remove(self)

        if self.post_callback:
            # NOTE(review): the failure path above calls
            # post_callback(self, success=False) but this success path
            # passes no filler argument -- confirm which signature the
            # registered callbacks actually expect.
            self.post_callback(success=True)

        return True
示例#5
0
    def api_remove_job(self, args):
        """Remove the named job from cluster state.

        Returns a status string naming the removed jobs, or
        "Job Not Found" when nothing matched.
        """
        check = self._api_check(args, ['name'])

        if check:
            return check

        jobs = self.state.remove_job(args['name'])
        if not jobs:
            return "Job Not Found"

        # Only record an event when something was actually removed; the
        # original emitted "Removed jobs: " even when the name matched
        # nothing.
        removed = ', '.join(jobs)
        ClusterEventManager.handle(
            "Removed jobs: %s" % removed)
        return "Removed: %s" % removed
示例#6
0
    def api_update_job(self, args):
        """Kick off an update for an existing job, identified by name."""
        missing = self._api_check(args, ['job_name'])
        if missing:
            return missing

        job_name = args['job_name']
        if self.state.update_job(job_name):
            ClusterEventManager.handle(
                'Update %s started' % job_name)
            return "Job update initiated"

        return "Error updating job: %s doesn't exist" % job_name
示例#7
0
    def api_enforce_idle(self, args):
        """Set a global cap on idle machines per zone.

        Really naive right now, a global # of max idle per zone.
        Could do a lot more here.
        """
        check = self._api_check(args, ['idle_count_per_zone'])

        if check:
            return check

        try:
            # Convert once and reuse; the original converted twice and
            # used a bare except that would hide unrelated errors.
            limit = int(args['idle_count_per_zone'])
        except (TypeError, ValueError):
            return "Invalid limit"

        self.state.max_idle_per_zone = limit

        ClusterEventManager.handle(
            "Enforce Idle Limit at %s" % limit)
        return "Limit set"
示例#8
0
    def decomission_machine(self, machine):
        """Fully retire a machine: drop it from state, decomission it via
        its zone provider, and clean up every DNS record pointing at it.
        """
        self.state.remove_machine(machine)

        zone = machine.config.shared_fate_zone
        provider = self.state.get_zone_provider(zone)
        if not provider:
            logger.warn(
                "No provider found for %s?" % zone)
            return

        ClusterEventManager.handle(
            "Decomissioning %s" % str(machine))

        if not provider.decomission(machine):
            # If we can't decomission it then perhaps its locked
            # and we should leave well enough alone at this point,
            # just remove it from monitoring etc.
            ClusterEventManager.handle(
                "Provider doesn't allow decomissioning of %s" % str(machine))
            return

        ip = machine.config.ip
        dns_name = machine.config.dns_name
        if dns_name:
            self.dns_provider.remove_record(data=ip, hostName=dns_name)

            # Strip off the leading number, e.g.
            # 12.bar.mydomain.com -> bar.mydomain.com
            root_name = '.'.join(dns_name.split('.')[1:])

            self.dns_provider.remove_record(data=ip, hostName=root_name)

        # Now look for other dangling records pointing to this machine
        # and delete those too.
        for record in self.dns_provider.get_records():
            if record['value'] == ip:
                logger.info(
                    "Removing %s from %s" % (
                    ip, record['record']))
                self.dns_provider.remove_record(data=ip,
                                                hostName=record['record'])

        ClusterEventManager.handle(
            "Decomissioning of %s complete!" % str(machine))
示例#9
0
    def refill(self, state, sitter):
        """Ensure this job is fully deployed in every shared-fate zone,
        starting JobFillers for any shortfall.

        Returns False when freshly-added machines mean we must wait for
        the next job-fill calculation; otherwise True (or the result of
        the linked-job path).
        """
        self.sitter = sitter

        new_machines = False
        while self.sitter.machines_in_queue():
            new_machines = True
            # We want to ensure any machines recently added to monitoring
            # have had a chance to load their data, incase they are
            # running this job
            logger.info("Waiting for machine monitors to load machine data "
                        "before filling jobs")
            time.sleep(0.5)

        if new_machines:
            # If we had to wait for new machines that means that
            # there are new machines, and we need to recalculate
            # job fill before it is safe to do refill.  The next
            # pass should be OK.
            logger.info("Waiting for next jobfill to be calculated before "
                        "doing a refill")

            return False

        while self.name not in state.job_fill:
            # 1) Assume this job has already been added to state.jobs
            # 2) Want to ensure calculator has run at least once to find out
            #    if this job already exists throughout the cluster
            logger.info(
                "Waiting for calculator thread to kick in before "
                "filling jobs")
            time.sleep(0.5)

        # Clear out finished fillers after 5 minutes.  Iterate over a
        # copy of each list: the original removed elements from the list
        # it was iterating, which silently skips the element following
        # every removal.
        now = datetime.now()
        for zone, fillers in self.fillers.items():
            for filler in list(fillers):
                if (filler.is_done() and
                        now - filler.end_time > timedelta(minutes=5)):
                    logger.info(
                        "Removing a filler from %s for %s" % (
                        zone, self.name))
                    self.fillers[zone].remove(filler)

        # If we have a linked job then bypass all the normal logic
        # and just piggyback on those machines
        if self.linked_job:
            return self.ensure_on_linked_job(state, sitter)

        #!MACHINEASSUMPTION!
        # Step 1: Ensure we have enough machines in each SFZ
        # Step 1a: Check for idle machines and reserve as we find them
        for zone in self.get_shared_fate_zones():
            idle_available = state.get_idle_machines(zone)
            total_required = self.get_num_required_machines_in_zone(zone)
            idle_required = total_required - state.job_fill[self.name][zone]

            currently_spawning = 0
            for filler in self.fillers[zone]:
                currently_spawning += filler.num_remaining()

            self.currently_spawning[zone] = currently_spawning

            idle_required -= currently_spawning

            # !MACHINEASSUMPTION! Ideally we're counting resources here
            # not machines
            required_new_machine_count = max(
                (idle_required - len(idle_available)), 0)

            # Only promote the log line to info when there is work to do.
            do_log = logger.info if idle_required > 0 else logger.debug

            do_log(
                ("Calculated job requirements for %s in %s: " % (self.name,
                                                                 zone)) +
                "Currently Active: %s " % (state.job_fill[self.name][zone]) +
                "Idle Required: %s, Total New: %s " % (
                    idle_required,
                    required_new_machine_count) +
                "Currently Spawning: %s " % (currently_spawning) +
                "idle-available: %s " % (len(idle_available)) +
                "total_required: %s " % (total_required)
            )

            if required_new_machine_count <= 0:
                # idle_available > idle_required, so use just as many
                # as we need
                usable_machines = idle_available[:idle_required]
            else:
                # Otherwise take all the available idle ones, and
                # we'll make more
                usable_machines = list(idle_available)

            if idle_required > 0:
                ClusterEventManager.handle(
                    "New JobFiller: %s, %s, %s, %s" % (
                        idle_required, zone, str(self), usable_machines))

                filler = JobFiller(idle_required, self,
                                   zone, usable_machines)
                self.fillers[zone].append(filler)
                filler.start()

        return True
示例#10
0
    def get_live_data(self):
        """Build the dashboard/API snapshot of cluster state.

        Returns a dict covering events, machines, monitors, jobs, host
        load averages and thread liveness.  (Python 2 code: iteritems,
        getName and dict.values() concatenation are deliberate.)
        """
        data = {}

        data['events'] = ClusterEventManager.get_events()

        state = self.harness.state
        job_fill, job_machine_fill = state.current_jobs.get_job_fill()
        # Snapshot idle machines once; the original called
        # get_machines(idle=True) twice and also built a flat
        # idle_machines list that was never used.
        zoned_idle_machines = state.get_machines(idle=True)

        data['providers'] = state.get_providers().keys()
        data['machines_by_zone'] = str(state.get_machines())
        data['job_fill'] = str(job_fill)
        data['idle_machines'] = str(zoned_idle_machines)
        data['unreachable_machines'] = [
            str(m) for m in state.get_machines(unreachable=True)]

        monitors = []
        machines = []

        for monitor, thread in state.monitors:
            monitor_data = {}
            monitor_data['monitored_machines'] = [
                repr(m) for m in monitor.monitored_machines]
            monitor_data['add_queue'] = [repr(m) for m in monitor.add_queue]
            monitor_data['pull_failures'] = dict([
                (str(k), v) for k, v in monitor.pull_failures.iteritems()])
            monitor_data['failure_threshold'] = monitor.failure_threshold
            monitor_data['number'] = monitor.number
            monitors.append(monitor_data)

            # Copy so per-machine lookups below see a stable dict even if
            # the monitor thread mutates pull_failures concurrently.
            pull_failures = dict(monitor.pull_failures)

            for machine in monitor.monitored_machines:
                machine_data = machine.serialize()
                machine_data['pull_failures'] = pull_failures.get(machine, 0)
                machine_data['idle'] = machine in (zoned_idle_machines.get(
                    machine_data['config']['shared_fate_zone'], []))

                machines.append(machine_data)

        data['machines'] = machines
        data['monitors'] = monitors

        jobs = []
        check_jobs = state.jobs.values() + state.repair_jobs.values()
        for job in check_jobs:
            job_data = {}
            job_data['name'] = job.name
            job_data['dns_basename'] = job.dns_basename
            job_data['task_configuration'] = job.task_configuration
            job_data['deployment_layout'] = job.deployment_layout
            job_data['deployment_recipe'] = job.deployment_recipe
            job_data['recipe_options'] = job.recipe_options
            job_data['linked_job'] = job.linked_job
            fillers = []
            for filler_list in job.fillers.values():
                for filler in filler_list:
                    filler_data = {}
                    filler_data['zone'] = filler.zone
                    filler_data['num_cores'] = filler.num_cores
                    filler_data['machine_states'] = [
                        (m.hostname, str(m.state)) for m in filler.machines]
                    filler_data['state'] = str(filler.state)
                    fillers.append(filler_data)

            job_data['fillers'] = fillers
            job_data['fill'] = job_fill.get(job.name, {})

            # Stringify machine objects zone by zone for serialization.
            fill_machines = job_machine_fill.get(job.name, {})
            for zone in fill_machines.keys():
                fill_machines[zone] = [str(m) for m in fill_machines[zone]]

            job_data['fill_machines'] = fill_machines

            job_data['spawning'] = job.currently_spawning
            jobs.append(job_data)

        data['jobs'] = jobs

        # Host load averages (1/5/15 minute).
        load = os.getloadavg()
        data['load_one_min'] = load[0]
        data['load_five_min'] = load[1]
        data['load_fifteen_min'] = load[2]

        # Report which of the expected threads are still alive.
        py_threads = threading.enumerate()
        alive_thread_names = [t.getName() for t in py_threads]
        threads = {}

        std_threads = ['MainThread', 'Calculator', 'HTTPServer']
        for i in range(self.harness.worker_thread_count):
            std_threads.append("Monitoring-%s" % i)

        for name in std_threads:
            if name in alive_thread_names:
                threads[name] = True
                alive_thread_names.remove(name)
            else:
                threads[name] = False

        # Anything left over is a thread we did not expect.
        other_threads = {}
        for name in alive_thread_names:
            other_threads[name] = True

        data['std_threads'] = std_threads
        data['threads'] = threads
        data['other_threads'] = other_threads
        return data