class JobRunner(Greenlet):
    """Run a single test job: provision the blueprint, run the test suite
    over ssh on the controller VM, and publish the results to Github.
    """

    # Message templates used by publish_results().
    commit_ok = 'This commit passed all tests in environment "%(environment)s".'
    issue_title = 'CI Test Error for commit %(commit)s'
    issue_body = textwrap.dedent("""\
        Test suite error for commit %(commit)s for environment "%(environment)s".

        The output of the test suite is:

        %(output)s
        """)

    def __init__(self, scheduler, job):
        """Create a runner for *job* owned by *scheduler*.

        Raises ValueError if the job references an unsupported controller,
        repository or results type.
        """
        super(JobRunner, self).__init__()
        self.scheduler = scheduler
        self.job = job
        self.client = RavelloClient(self.api_url, self.api_user,
                                    self.api_password)
        self.logger = logging.getLogger('ciserver.JobRunner')
        # Of course this could be made extensible to support multiple
        # controller types, repositories and result types.
        ctrl = self.job['environment']['controller']
        if ctrl['type'] != 'ssh':
            # BUG FIX: was misspelled 'Unkown'.
            raise ValueError('Unknown controller type: %s' % ctrl['type'])
        repo = self.job['project']['repo']
        if repo['type'] != 'git':
            raise ValueError('Unknown repository type: %s' % repo['type'])
        results = self.job['project']['results']
        if results['type'] != 'github':
            # BUG FIX: message said 'repository' due to a copy-paste error.
            raise ValueError('Unknown results type: %s' % results['type'])

    @classmethod
    def set_api_parameters(cls, url, user, password):
        """Set the Ravello API endpoint and credentials for all runners."""
        cls.api_url = url
        cls.api_user = user
        cls.api_password = password

    def provision_blueprint(self):
        """Provision a Blueprint. Return the application instance ID."""
        # We don't really deploy the new Blueprint. As a POC hack we look for
        # a running app instance that was created from the Blueprint that we're
        # interested in.
        name = self.job['environment']['blueprint']
        apps = self.client.get_applications_metadata()
        for app in apps:
            if app.blueprintName == name:
                break
        else:
            raise JobFailed('Blueprint not found: %s' % name)
        self.appid = app.id

    def run_tests(self):
        """Run the tests via the controller.

        Sets self.status (test exit code) and self.output (test output).
        Raises JobFailed if the controller VM cannot be found or the ssh
        session times out.
        """
        jobid = self.job['id']
        logger = self.logger
        ctrl = self.job['environment']['controller']
        vms = self.client.meta_vms(self.appid).vms
        for vm in vms:
            if vm.name.lower() == ctrl['host'].lower():
                break
        else:
            # BUG FIX: referenced an undefined name 'name' here, which
            # turned this failure path into a NameError.
            raise JobFailed('Controller VM not found: %s' % ctrl['host'])
        host = vm.vmDynamicMD.fullyQualifiedDomainName
        repo = self.job['project']['repo']
        logger.debug('[job %s] Running tests via "ssh" controller', jobid)
        logger.debug('[job %s] Controller node = %s', jobid, host)
        ssh = winpexpect.spawn('ssh-agent sh', timeout=30)
        try:
            # Forward the repository key.
            keyfile = os.path.join(self.scheduler.directory, 'keys',
                                   repo['key'])
            ssh.expect('[$#]')
            ssh.send('ssh-add %s\n' % keyfile)
            ssh.expect('[$#]')
            ssh.send('ssh -A ravello@%s\n' % host)
            # Install a more distinctive prompt that hopefully does not occur
            # in the output of any command we run.
            ssh.expect('[#$]')
            prompt = 'CITestHost: '
            ssh.send('PS1="%s"\n' % prompt)
            ssh.expect(prompt)  # echo
            ssh.expect(prompt)  # prompt
            # Check out the commit in a temporary directory
            url = repo['url']
            logger.debug('[job %s] cloning source code from %s', jobid, url)
            dirname = os.urandom(8).encode('hex')
            ssh.send('mkdir %s\n' % dirname)
            ssh.expect(prompt)
            ssh.send('cd %s\n' % dirname)
            ssh.expect(prompt)
            ssh.send('git clone %s\n' % url)
            ssh.expect(prompt)
            ssh.send('cd *\n')
            ssh.expect(prompt)
            commit = self.job['commit']['id']
            logger.debug('[job %s] checkout commit %s', jobid, commit)
            ssh.send('git checkout %s\n' % commit)
            ssh.expect(prompt)
            # And run the tests!
            command = ctrl['command']
            logger.debug('[job %s] running test command "%s"', jobid, command)
            ssh.send('%s\n' % command)
            # The test run itself gets a much longer timeout.
            ssh.settimeout(600)
            ssh.expect('\r\n')  # echo
            ssh.expect(prompt)
            output = ssh.before[:ssh.before.rfind('\r\n')]  # strip prompt
            ssh.settimeout(30)
            ssh.send('echo $?\n')
            ssh.expect('\r\n')  # echo
            ssh.expect('\r\n')  # end of output
            status = int(ssh.before)
            ssh.expect(prompt)  # end of output
            # Clean up the temporary checkout and close both shells.
            ssh.send('cd ../..\n')
            ssh.expect(prompt)
            ssh.send('rm -rf %s\n' % dirname)
            ssh.expect(prompt)
            ssh.send(ssh.cchar('VEOF'))  # exit ssh
            ssh.expect('[$#]')
            ssh.send(ssh.cchar('VEOF'))  # exit ssh-agent
            ssh.wait(10)
        except (TIMEOUT, EOF):
            logger.debug('[job %s] failed to run the test', jobid)
            ssh.terminate()
            raise JobFailed('Failed to run test job')
        logger.debug('[job %s] test return code: %s', jobid, status)
        self.status = status
        self.output = output

    def publish_results(self):
        """Store the output of the test job.

        On success, comment on the commit; on failure, open an issue with
        the captured test output.
        """
        env = self.job['environment']
        commit = self.job['commit']
        results = self.job['project']['results']
        subst = {'commit': commit['id'], 'environment': env['blueprint']}
        # Indent the output so Github renders it as a code block.
        subst['output'] = '    ' + self.output.replace('\r\n', '\r\n    ')
        client = GithubClient(results['username'], results['repository'],
                              results['token'])
        client.connect()
        if self.status == 0:
            message = self.commit_ok % subst
            client.add_comment_to_commit(commit['id'], message)
        else:
            title = self.issue_title % subst
            body = self.issue_body % subst
            client.add_issue(title, body)
        client.close()

    def _run(self):
        """Greenlet entry point: run the job and report the outcome."""
        logger = self.logger
        jobid = self.job['id']
        logger.debug('[job %s] Running tests for project %s', jobid,
                     self.job['project']['name'])
        try:
            self.provision_blueprint()
            self.run_tests()
            self.publish_results()
        except JobFailed as e:
            # BUG FIX: the handler used e[0] and e[1], but JobFailed is
            # raised everywhere in this module with a single argument, so
            # e[1] always raised IndexError inside the handler. Fall back
            # to an empty detail string when no detail was supplied.
            message = e.args[0] if e.args else 'Job failed'
            detail = e.args[1] if len(e.args) > 1 else ''
            self.logger.debug('[job %s] Failed with: %s', jobid, message)
            self.scheduler.job_done(self.job, 'FAILED', message, detail)
        except Exception:
            lines = ['An uncaught exception occurred\n']
            lines += traceback.format_exception(*sys.exc_info())
            detail = ''.join(lines)
            self.logger.debug('[job %s] Uncaught exception', jobid)
            self.logger.debug(detail)
            self.scheduler.job_done(self.job, 'FAILED', 'Uncaught exception',
                                    detail)
        else:
            self.logger.debug('[job %s] Completed successfully', jobid)
            self.scheduler.job_done(self.job, 'OK',
                                    'Job completed successfully')
class Scheduler(Greenlet):
    """Job scheduler.

    Keeps a persistent queue of test jobs and starts a JobRunner for each
    job as soon as it fits under the per-project, per-blueprint and global
    VM caps.
    """

    # Keep max_instances_per_blueprint to 1 until the "do not really
    # provision blueprint" hack/optimization is removed.
    max_vms_per_project = 10
    max_total_vms = 100
    max_instances_per_blueprint = 1

    def __init__(self, directory):
        """Create a scheduler that persists its state under *directory*."""
        super(Scheduler, self).__init__()
        self.directory = directory
        self.next_id = 1000
        # deque: events are consumed from the left in _process_events().
        self.event_queue = collections.deque()
        self.have_events = Event()
        self.projects = {}
        self.job_queue = collections.deque()
        self.running = {}
        self.project_usage = {}
        self.blueprint_usage = {}
        self.total_vms = 0
        self.logger = logging.getLogger('ciserver.Scheduler')
        self.client = RavelloClient(self.api_url, self.api_user,
                                    self.api_password)
        self.load_projects()
        self.load_job_queue()

    @classmethod
    def set_api_parameters(cls, url, user, password):
        """Set the Ravello API endpoint and credentials for all schedulers."""
        cls.api_url = url
        cls.api_user = user
        cls.api_password = password

    def load_projects(self):
        """Load all project definitions from the 'projects' directory."""
        dirname = os.path.join(self.directory, 'projects')
        for fname in os.listdir(dirname):
            if not fname.endswith('.js'):
                continue
            absname = os.path.join(dirname, fname)
            try:
                project = Project.load(absname)
            except (IOError, TypeError, ValueError):
                self.logger.debug('Could not load project: %s', fname)
                continue
            self.projects[project['name']] = project
        self.logger.debug('Loaded %s projects', len(self.projects))

    def load_job_queue(self):
        """Load persisted jobs and re-queue the ones that have not run yet."""
        dirname = os.path.join(self.directory, 'jobs')
        for fname in sorted(os.listdir(dirname)):
            if not fname.endswith('.js'):
                continue
            absname = os.path.join(dirname, fname)
            try:
                job = TestJob.load(absname)
            except (IOError, TypeError, ValueError):
                self.logger.debug('Could not load job: %s', fname)
                continue
            if job['status'] == 'NEW':
                self.job_queue.append(job)
            # Keep next_id ahead of every job id seen on disk.
            self.next_id = max(self.next_id, job['id']+1)
        self.logger.debug('Loaded %s jobs', len(self.job_queue))

    def schedule_run(self, project, request):
        """Queue a test job for every (environment, commit) combination.

        Return True if the project is known, False otherwise.
        """
        if project not in self.projects:
            return False
        project = self.projects[project]
        for env in project['environments']:
            for commit in request['commits']:
                job = TestJob()
                job['project'] = project
                job['request'] = request
                job['environment'] = env
                job['commit'] = commit
                self.add_job(job)
        return True

    def add_job(self, job):
        """Post an 'AddJob' event to the scheduler loop."""
        self.event_queue.append(('AddJob', (job,)))
        self.have_events.set()

    def job_done(self, job, result, message, detail=''):
        """Post a 'JobDone' event to the scheduler loop."""
        self.event_queue.append(('JobDone', (job, result, message, detail)))
        self.have_events.set()

    def _job_filename(self, job):
        """Return the file name under which *job* is persisted."""
        return os.path.join(self.directory, 'jobs', '%010d.js' % job['id'])

    def _get_blueprint_allocation(self, name):
        """Return the number of started VMs for blueprint *name*, or -1 if
        no application instance exists for it."""
        apps = self.client.get_applications_metadata()
        for app in apps:
            if app.blueprintName == name:
                return app.numStartedVms
        return -1

    def _process_events(self):
        """Drain the event queue, handling 'AddJob' and 'JobDone' events."""
        logger = self.logger
        while self.event_queue:
            event, args = self.event_queue.popleft()
            logger.debug('Handling event %s', event)
            if event == 'AddJob':
                job = args[0]
                job['id'] = self.next_id
                self.next_id += 1
                vms = self._get_blueprint_allocation(
                            job['environment']['blueprint'])
                if vms == -1:
                    logger.error('Could not add job %s', job['id'])
                    continue
                job['vms'] = vms
                job['status'] = 'NEW'
                fname = self._job_filename(job)
                try:
                    job.save(fname)
                except IOError:
                    logger.error('Could not add job %s', job['id'])
                    continue
                self.job_queue.append(job)
            elif event == 'JobDone':
                job, result, message, detail = args
                job['status'] = 'DONE'
                job['result'] = {'result': result, 'message': message,
                                 'detail': detail}
                fname = self._job_filename(job)
                try:
                    job.save(fname)
                except IOError:
                    logger.error('Could not update job %s', job['id'])
                assert job['id'] in self.running
                del self.running[job['id']]
                # Release the job's resource allocations.  BUG FIX: the
                # global VM count was never decremented here, so the
                # scheduler would eventually hit max_total_vms permanently.
                self.project_usage[job['project']['name']] -= job['vms']
                self.blueprint_usage[job['environment']['blueprint']] -= 1
                self.total_vms -= job['vms']
        logger.debug('Done processing events')

    def _run_jobs(self):
        """A very simple scheduler that can enforce per project caps on VMs
        and blueprints, and a cap on total VMs.
        """
        logger = self.logger
        while True:
            if not self.job_queue:
                logger.debug('Job queue empty')
                break
            logger.debug('Trying to schedule job from job queue')
            # Find the first job whose project is below its cap and
            # whose blueprint is runnable.
            for ix, job in enumerate(self.job_queue):
                project = job['project']['name']
                if self.project_usage.get(project, 0) + job['vms'] \
                        > self.max_vms_per_project:
                    continue
                blueprint = job['environment']['blueprint']
                if self.blueprint_usage.get(blueprint, 0) + 1 \
                        > self.max_instances_per_blueprint:
                    continue
                break
            else:
                # BUG FIX: without this 'else' the loop fell through with
                # 'job' bound to the last queued job even though it exceeded
                # its caps, and ran it anyway. Wait for a JobDone event to
                # free up capacity instead.
                logger.debug('No queued job fits under its caps')
                break
            # If we are running against the global cap just wait. Don't be
            # smart and try to run other vms as that could starve the current
            # candidate.
            logger.debug('Job candidate: %s', job['id'])
            if self.total_vms + job['vms'] > self.max_total_vms:
                logger.debug('Would run over global cap, not any jobs')
                break
            logger.debug('Still below global cap, running job %s', job['id'])
            # Update allocations
            self.project_usage[project] = \
                    self.project_usage.get(project, 0) + job['vms']
            self.blueprint_usage[blueprint] = \
                    self.blueprint_usage.get(blueprint, 0) + 1
            self.total_vms += job['vms']
            del self.job_queue[ix]
            self.running[job['id']] = job
            # And finally run the job
            runner = JobRunner(self, job)
            runner.start()
            logger.debug('Created JobRunner to run job')

    def _run(self):
        """Greenlet main loop: wait for events, process them, run jobs."""
        self.have_events.set()
        self.logger.debug('Entering _run() loop')
        while True:
            self.have_events.wait()
            self.have_events.clear()
            self._process_events()
            self._run_jobs()
        self.logger.debug('_run() loop exited')