def handle_canceling(self):
    for job in TestJob.objects.filter(status=TestJob.CANCELING, is_pipeline=True):
        worker_host = job.lookup_worker if job.dynamic_connection else job.actual_device.worker_host
        if not worker_host:
            self.logger.warning("[%d] Invalid worker information", job.id)
            # shouldn't happen
            fail_job(job, 'invalid worker information', TestJob.CANCELED)
            continue
        self.logger.info("[%d] CANCEL => %s", job.id, worker_host.hostname)
        self.controler.send_multipart([str(worker_host.hostname),
                                       'CANCEL', str(job.id)])
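# A minimal sketch of the fail_job() helper called above, assuming it mirrors
# the inlined failure handling in the oldest process_jobs() revision at the
# end of this file (mark the job, record the failure comment, release the
# device). The real LAVA helper may differ; this is illustrative only.
def fail_job(job, fail_msg=None, job_status=TestJob.INCOMPLETE):
    if job.actual_device is not None:
        device = job.actual_device
        device.state_transition_to(Device.IDLE, message=fail_msg, job=job)
        device.status = Device.IDLE
        device.current_job = None
        device.save()
    job.status = job_status
    job.failure_comment = fail_msg
    job.save()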
def process_jobs(self, options):
    for job in TestJob.objects.filter(
            Q(status=TestJob.SUBMITTED) & Q(is_pipeline=True) & ~Q(actual_device=None))\
            .order_by('-health_check', '-priority', 'submit_time', 'target_group', 'id'):
        device = None
        worker_host = None

        device = select_device(job, self.dispatchers)
        if not device:
            # e.g. one or more jobs in the MultiNode group do not yet have Reserved devices.
            continue
        # selecting device can change the job
        job.refresh_from_db()

        self.logger.info("[%d] Assigning %s device", job.id, device)
        if job.actual_device is None:  # health checks
            device = job.requested_device
            if not device.worker_host:
                msg = "Infrastructure error: Invalid worker information"
                self.logger.error("[%d] %s", job.id, msg)
                fail_job(job, msg, TestJob.INCOMPLETE)
                continue

        # Launch the job
        create_job(job, device)
        self.logger.info("[%d] START => %s (%s)",
                         job.id, device.worker_host.hostname,
                         device.hostname)
        worker_host = device.worker_host
        try:
            # Load job definition to get the variables for template
            # rendering
            job_def = yaml.load(job.definition)
            job_ctx = job_def.get('context', {})

            # Load env.yaml, env-dut.yaml and dispatcher configuration
            # All three are optional
            env_str = load_optional_yaml_file(options['env'])
            env_dut_str = load_optional_yaml_file(options['env_dut'])

            # Load device configuration
            if device:
                device_configuration = device.load_configuration(job_ctx)
                dispatcher_config_file = os.path.join(
                    options['dispatchers_config'],
                    "%s.yaml" % worker_host.hostname)
                dispatcher_config = load_optional_yaml_file(dispatcher_config_file)

                self.controler.send_multipart(
                    [str(worker_host.hostname),
                     'START', str(job.id),
                     self.export_definition(job),
                     str(device_configuration),
                     dispatcher_config,
                     env_str, env_dut_str])

            if job.is_multinode:
                # All secondary connections must be made from a dispatcher local to the one host device
                # to allow for local firewalls etc. So the secondary connection is started on the
                # remote worker of the "nominated" host.
                # This job will not be a dynamic_connection, this is the parent.
                device = None
                device_configuration = None
                # to get this far, the rest of the multinode group must also be ready
                # so start the dynamic connections
                parent = job

                for group_job in job.sub_jobs_list:
                    if group_job == parent or not group_job.dynamic_connection:
                        continue

                    worker_host = parent.actual_device.worker_host
                    dispatcher_config_file = os.path.join(
                        options['dispatchers_config'],
                        "%s.yaml" % worker_host.hostname)
                    dispatcher_config = load_optional_yaml_file(dispatcher_config_file)

                    # inherit only enough configuration for dynamic_connection operation
                    device_configuration = parent.actual_device.load_configuration(job_ctx)
                    self.logger.info(
                        "[%d] Trimming dynamic connection device configuration.",
                        group_job.id)
                    device_configuration = parent.actual_device.minimise_configuration(
                        device_configuration)

                    self.logger.info("[%d] START => %s (connection)",
                                     group_job.id, worker_host.hostname)
                    self.controler.send_multipart(
                        [str(worker_host.hostname),
                         'START', str(group_job.id),
                         self.export_definition(group_job),
                         str(device_configuration),
                         dispatcher_config,
                         env_str, env_dut_str])
            continue

        except jinja2.TemplateNotFound as exc:
            self.logger.error("[%d] Template not found: '%s'",
                              job.id, exc.message)
            msg = "Infrastructure error: Template not found: '%s'" % \
                  exc.message
        except jinja2.TemplateSyntaxError as exc:
            self.logger.error(
                "[%d] Template syntax error in '%s', line %d: %s",
                job.id, exc.name, exc.lineno, exc.message)
            msg = "Infrastructure error: Template syntax error in '%s', line %d: %s" % \
                  (exc.name, exc.lineno, exc.message)
        except IOError as exc:
            self.logger.error("[%d] Unable to read '%s': %s",
                              job.id, exc.filename, exc.strerror)
            msg = "Infrastructure error: cannot open '%s': %s" % \
                  (exc.filename, exc.strerror)
        except yaml.YAMLError as exc:
            self.logger.error("[%d] Unable to parse job definition: %s",
                              job.id, exc)
            msg = "Infrastructure error: cannot parse job definition: %s" % \
                  exc

        self.logger.error("[%d] INCOMPLETE job", job.id)
        fail_job(job=job, fail_msg=msg, job_status=TestJob.INCOMPLETE)
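# A sketch of the load_optional_yaml_file() helper used above, assuming that
# "optional" means a missing file is not an error: the content is returned as
# a raw string for the slave to parse, and a missing file yields ''. Other
# I/O errors propagate so the IOError handler in process_jobs() can fail the
# job. This is an assumption, not the canonical implementation.
import errno


def load_optional_yaml_file(filename):
    try:
        with open(filename, 'r') as f_in:
            return f_in.read()
    except IOError as exc:
        if exc.errno == errno.ENOENT:
            return ''
        raise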
def controler_socket(self):
    msg = self.controler.recv_multipart()
    # This is way too verbose for production and should only be activated
    # by (and for) developers
    # self.logger.debug("[CC] Receiving: %s", msg)

    # 1: the hostname (see ZMQ documentation)
    hostname = msg[0]
    # 2: the action
    action = msg[1]

    # Handle the actions
    if action == 'HELLO' or action == 'HELLO_RETRY':
        self.logger.info("%s => %s", hostname, action)

        # Check the protocol version
        try:
            slave_version = int(msg[2])
        except (IndexError, ValueError):
            self.logger.error("Invalid message from <%s> '%s'", hostname, msg)
            return False
        if slave_version != PROTOCOL_VERSION:
            self.logger.error(
                "<%s> using protocol v%d while master is using v%d",
                hostname, slave_version, PROTOCOL_VERSION)
            return False

        self.controler.send_multipart([hostname, 'HELLO_OK'])
        # If the dispatcher is known and sent a HELLO, it means that
        # the slave has restarted
        if hostname in self.dispatchers:
            if action == 'HELLO':
                self.logger.warning("Dispatcher <%s> has RESTARTED", hostname)
            else:
                # Assume the HELLO command was received, and the
                # action succeeded.
                self.logger.warning("Dispatcher <%s> was not confirmed", hostname)
        else:
            # No dispatcher, treat HELLO and HELLO_RETRY as a normal HELLO
            # message.
            self.logger.warning("New dispatcher <%s>", hostname)
            self.dispatchers[hostname] = SlaveDispatcher(hostname, online=True)

        if action == 'HELLO':
            # FIXME: slaves need to be allowed to restart cleanly without affecting jobs
            # as well as handling unexpected crashes.
            self._cancel_slave_dispatcher_jobs(hostname)

        # Mark the dispatcher as alive
        self.dispatchers[hostname].alive()

    elif action == 'PING':
        self.logger.debug("%s => PING", hostname)
        # Send back a signal
        self.controler.send_multipart([hostname, 'PONG'])
        self.dispatcher_alive(hostname)

    elif action == 'END':
        try:
            job_id = int(msg[2])
            job_status = int(msg[3])
            error_msg = msg[4]
            description = msg[5]
        except (IndexError, ValueError):
            self.logger.error("Invalid message from <%s> '%s'", hostname, msg)
            return False
        if job_status:
            status = TestJob.INCOMPLETE
            self.logger.info("[%d] %s => END with error %d",
                             job_id, hostname, job_status)
            self.logger.error("[%d] Error: %s", job_id, error_msg)
        else:
            status = TestJob.COMPLETE
            self.logger.info("[%d] %s => END", job_id, hostname)

        # Find the corresponding job and update the status
        try:
            # Save the description
            job = TestJob.objects.get(id=job_id)
            filename = os.path.join(job.output_dir, 'description.yaml')
            try:
                with open(filename, 'w') as f_description:
                    f_description.write(lzma.decompress(description))
            except (IOError, lzma.error) as exc:
                self.logger.error("[%d] Unable to dump 'description.yaml'",
                                  job_id)
                self.logger.exception(exc)
            parse_job_description(job)

            # Update status.
            with transaction.atomic():
                job = TestJob.objects.select_for_update().get(id=job_id)
                if job.status == TestJob.CANCELING:
                    cancel_job(job)
                fail_job(job, fail_msg=error_msg, job_status=status)

        except TestJob.DoesNotExist:
            self.logger.error("[%d] Unknown job", job_id)
        # ACK even if the job is unknown to let the dispatcher
        # forget about it
        self.controler.send_multipart([hostname, 'END_OK', str(job_id)])
        self.dispatcher_alive(hostname)

    elif action == 'START_OK':
        try:
            job_id = int(msg[2])
        except (IndexError, ValueError):
            self.logger.error("Invalid message from <%s> '%s'", hostname, msg)
            return False
        self.logger.info("[%d] %s => START_OK", job_id, hostname)
        try:
            with transaction.atomic():
                job = TestJob.objects.select_for_update() \
                                     .get(id=job_id)
                start_job(job)
        except TestJob.DoesNotExist:
            self.logger.error("[%d] Unknown job", job_id)
        self.dispatcher_alive(hostname)

    else:
        self.logger.error("<%s> sent unknown action=%s, args=(%s)",
                          hostname, action, msg[1:])
    return True
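# A minimal sketch of the SlaveDispatcher record stored in self.dispatchers,
# assuming it only needs to track liveness. The constructor signature
# (hostname, online=True) and the alive() call are taken from
# controler_socket() above; the timestamp bookkeeping is an assumption.
import time


class SlaveDispatcher(object):

    def __init__(self, hostname, online=False):
        self.hostname = hostname
        self.online = online
        # last time this slave was seen (HELLO, PING, END, START_OK)
        self.last_msg = time.time() if online else 0

    def alive(self):
        self.online = True
        self.last_msg = time.time()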
def process_jobs(self, options):
    for job in TestJob.objects.filter(
            Q(status=TestJob.SUBMITTED) & Q(is_pipeline=True) & ~Q(actual_device=None))\
            .order_by('-health_check', '-priority', 'submit_time', 'target_group', 'id'):
        if job.dynamic_connection:
            # A secondary connection must be made from a dispatcher local to the host device
            # to allow for local firewalls etc. So the secondary connection is started on the
            # remote worker of the "nominated" host.
            # FIXME:
            device = None
            worker_host = job.lookup_worker
            self.logger.info("[%d] START => %s (connection)",
                             job.id, worker_host.hostname)
        else:
            device = select_device(job, self.dispatchers)
            if not device:
                continue
            # selecting device can change the job
            job = TestJob.objects.get(id=job.id)
            self.logger.info("[%d] Assigning %s device", job.id, device)
            if job.actual_device is None:
                device = job.requested_device
                if not device.worker_host:
                    msg = "Infrastructure error: Invalid worker information"
                    self.logger.error("[%d] %s", job.id, msg)
                    fail_job(job, msg, TestJob.INCOMPLETE)
                    continue

                # Launch the job
                create_job(job, device)
                self.logger.info("[%d] START => %s (%s)",
                                 job.id, device.worker_host.hostname,
                                 device.hostname)
                worker_host = device.worker_host
            else:
                device = job.actual_device
                if not device.worker_host:
                    msg = "Infrastructure error: Invalid worker information"
                    self.logger.error("[%d] %s", job.id, msg)
                    fail_job(job, msg, TestJob.INCOMPLETE)
                    continue
                self.logger.info("[%d] START => %s (%s) (retrying)",
                                 job.id, device.worker_host.hostname,
                                 device.hostname)
                worker_host = device.worker_host
        try:
            # Load job definition to get the variables for template
            # rendering
            job_def = yaml.load(job.definition)
            job_ctx = job_def.get('context', {})

            # Load device configuration
            device_configuration = '' \
                if job.dynamic_connection else device.load_device_configuration(job_ctx)

            # Load env.yaml, env-dut.yaml and dispatcher configuration
            # All three are optional
            env_str = load_optional_yaml_file(options['env'])
            env_dut_str = load_optional_yaml_file(options['env_dut'])
            dispatcher_config_file = os.path.join(options['dispatchers_config'],
                                                  "%s.yaml" % worker_host.hostname)
            dispatcher_config = load_optional_yaml_file(dispatcher_config_file)

            if job.is_multinode:
                for group_job in job.sub_jobs_list:
                    if group_job.dynamic_connection:
                        # to get this far, the rest of the multinode group must also be ready
                        # so start the dynamic connections
                        # FIXME: rationalise and streamline
                        self.controler.send_multipart(
                            [str(worker_host.hostname),
                             'START', str(group_job.id),
                             self.export_definition(group_job),
                             str(device_configuration),
                             dispatcher_config,
                             env_str, env_dut_str])

            self.controler.send_multipart(
                [str(worker_host.hostname),
                 'START', str(job.id),
                 self.export_definition(job),
                 str(device_configuration),
                 dispatcher_config,
                 env_str, env_dut_str])
            continue

        except jinja2.TemplateNotFound as exc:
            self.logger.error("[%d] Template not found: '%s'",
                              job.id, exc.message)
            msg = "Infrastructure error: Template not found: '%s'" % \
                  exc.message
        except jinja2.TemplateSyntaxError as exc:
            self.logger.error("[%d] Template syntax error in '%s', line %d: %s",
                              job.id, exc.name, exc.lineno, exc.message)
            msg = "Infrastructure error: Template syntax error in '%s', line %d: %s" % \
                  (exc.name, exc.lineno, exc.message)
        except IOError as exc:
            self.logger.error("[%d] Unable to read '%s': %s",
                              job.id, exc.filename, exc.strerror)
            msg = "Infrastructure error: cannot open '%s': %s" % \
                  (exc.filename, exc.strerror)
        except yaml.YAMLError as exc:
            self.logger.error("[%d] Unable to parse job definition: %s",
                              job.id, exc)
            msg = "Infrastructure error: cannot parse job definition: %s" % \
                  exc

        self.logger.error("[%d] INCOMPLETE job", job.id)
        fail_job(job=job, fail_msg=msg, job_status=TestJob.INCOMPLETE)
def controler_socket(self):
    msg = self.controler.recv_multipart()
    # This is way too verbose for production and should only be activated
    # by (and for) developers
    # self.logger.debug("[CC] Receiving: %s", msg)

    # 1: the hostname (see ZMQ documentation)
    hostname = msg[0]
    # 2: the action
    action = msg[1]

    # Handle the actions
    if action == 'HELLO' or action == 'HELLO_RETRY':
        self.logger.info("%s => %s", hostname, action)

        # Check the protocol version
        try:
            slave_version = int(msg[2])
        except (IndexError, ValueError):
            self.logger.error("Invalid message from <%s> '%s'", hostname, msg)
            return False
        if slave_version != PROTOCOL_VERSION:
            self.logger.error("<%s> using protocol v%d while master is using v%d",
                              hostname, slave_version, PROTOCOL_VERSION)
            return False

        self.controler.send_multipart([hostname, 'HELLO_OK'])
        # If the dispatcher is known and sent a HELLO, it means that
        # the slave has restarted
        if hostname in self.dispatchers:
            if action == 'HELLO':
                self.logger.warning("Dispatcher <%s> has RESTARTED", hostname)
            else:
                # Assume the HELLO command was received, and the
                # action succeeded.
                self.logger.warning("Dispatcher <%s> was not confirmed", hostname)
        else:
            # No dispatcher, treat HELLO and HELLO_RETRY as a normal HELLO
            # message.
            self.logger.warning("New dispatcher <%s>", hostname)
            self.dispatchers[hostname] = SlaveDispatcher(hostname, online=True)

        if action == 'HELLO':
            # FIXME: slaves need to be allowed to restart cleanly without affecting jobs
            # as well as handling unexpected crashes.
            self._cancel_slave_dispatcher_jobs(hostname)

        # Mark the dispatcher as alive
        self.dispatchers[hostname].alive()

    elif action == 'PING':
        self.logger.debug("%s => PING", hostname)
        # Send back a signal
        self.controler.send_multipart([hostname, 'PONG'])
        self.dispatcher_alive(hostname)

    elif action == 'END':
        try:
            job_id = int(msg[2])
            job_status = int(msg[3])
            error_msg = msg[4]
            description = msg[5]
        except (IndexError, ValueError):
            self.logger.error("Invalid message from <%s> '%s'", hostname, msg)
            return False
        if job_status:
            status = TestJob.INCOMPLETE
            self.logger.info("[%d] %s => END with error %d",
                             job_id, hostname, job_status)
            self.logger.error("[%d] Error: %s", job_id, error_msg)
        else:
            status = TestJob.COMPLETE
            self.logger.info("[%d] %s => END", job_id, hostname)

        # Find the corresponding job and update the status
        try:
            with transaction.atomic():
                job = TestJob.objects.select_for_update().get(id=job_id)
                if job.status == TestJob.CANCELING:
                    cancel_job(job)
                fail_job(job, fail_msg=error_msg, job_status=status)

                # Save the description
                filename = os.path.join(job.output_dir, 'description.yaml')
                try:
                    with open(filename, 'w') as f_description:
                        f_description.write(lzma.decompress(description))
                except (IOError, lzma.error) as exc:
                    self.logger.error("[%d] Unable to dump 'description.yaml'",
                                      job_id)
                    self.logger.exception(exc)
                parse_job_description(job)
        except TestJob.DoesNotExist:
            self.logger.error("[%d] Unknown job", job_id)
        # ACK even if the job is unknown to let the dispatcher
        # forget about it
        self.controler.send_multipart([hostname, 'END_OK', str(job_id)])
        self.dispatcher_alive(hostname)

    elif action == 'START_OK':
        try:
            job_id = int(msg[2])
        except (IndexError, ValueError):
            self.logger.error("Invalid message from <%s> '%s'", hostname, msg)
            return False
        self.logger.info("[%d] %s => START_OK", job_id, hostname)
        try:
            with transaction.atomic():
                job = TestJob.objects.select_for_update() \
                                     .get(id=job_id)
                start_job(job)
        except TestJob.DoesNotExist:
            self.logger.error("[%d] Unknown job", job_id)
        self.dispatcher_alive(hostname)

    else:
        self.logger.error("<%s> sent unknown action=%s, args=(%s)",
                          hostname, action, msg[1:])
    return True
def process_jobs(self, options):
    for job in TestJob.objects.filter(
            Q(status=TestJob.SUBMITTED) & Q(is_pipeline=True) & ~Q(actual_device=None))\
            .order_by('-health_check', '-priority', 'submit_time', 'target_group', 'id'):
        if job.dynamic_connection:
            # A secondary connection must be made from a dispatcher local to the host device
            # to allow for local firewalls etc. So the secondary connection is started on the
            # remote worker of the "nominated" host.
            # FIXME:
            device = None
            worker_host = job.lookup_worker
            self.logger.info("[%d] START => %s (connection)",
                             job.id, worker_host.hostname)
        else:
            device = select_device(job, self.dispatchers)
            if not device:
                return False
            # selecting device can change the job
            job = TestJob.objects.get(id=job.id)
            self.logger.info("[%d] Assigning %s device", job.id, device)
            if job.actual_device is None:
                device = job.requested_device
                if not device.worker_host:
                    msg = "Infrastructure error: Invalid worker information"
                    self.logger.error("[%d] %s", job.id, msg)
                    fail_job(job, msg, TestJob.INCOMPLETE)
                    return False

                # Launch the job
                create_job(job, device)
                self.logger.info("[%d] START => %s (%s)",
                                 job.id, device.worker_host.hostname,
                                 device.hostname)
                worker_host = device.worker_host
            else:
                device = job.actual_device
                if not device.worker_host:
                    msg = "Infrastructure error: Invalid worker information"
                    self.logger.error("[%d] %s", job.id, msg)
                    fail_job(job, msg, TestJob.INCOMPLETE)
                    return False
                self.logger.info("[%d] START => %s (%s) (retrying)",
                                 job.id, device.worker_host.hostname,
                                 device.hostname)
                worker_host = device.worker_host
        try:
            # Load job definition to get the variables for template
            # rendering
            job_def = yaml.load(job.definition)
            job_ctx = job_def.get('context', {})

            # Load device configuration
            device_configuration = None \
                if job.dynamic_connection else device.load_device_configuration(job_ctx)

            env_str = get_env_string(options['env'])
            env_dut_str = get_env_string(options['env_dut'])

            if job.is_multinode:
                for group_job in job.sub_jobs_list:
                    if group_job.dynamic_connection:
                        # to get this far, the rest of the multinode group must also be ready
                        # so start the dynamic connections
                        # FIXME: rationalise and streamline
                        self.controler.send_multipart(
                            [str(worker_host.hostname),
                             'START', str(group_job.id),
                             self.export_definition(group_job),
                             str(device_configuration),
                             env_str, env_dut_str])

            self.controler.send_multipart(
                [str(worker_host.hostname),
                 'START', str(job.id),
                 self.export_definition(job),
                 str(device_configuration),
                 env_str, env_dut_str])

        except (jinja2.TemplateError, IOError, yaml.YAMLError) as exc:
            if isinstance(exc, jinja2.TemplateNotFound):
                self.logger.error("Template not found: '%s'", exc.message)
                msg = "Infrastructure error: Template not found: '%s'" % \
                      exc.message
            elif isinstance(exc, jinja2.TemplateSyntaxError):
                self.logger.error(
                    "Template syntax error in '%s', line %d: %s",
                    exc.name, exc.lineno, exc.message)
                msg = "Infrastructure error: Template syntax error in '%s', line %d: %s" % \
                      (exc.name, exc.lineno, exc.message)
            elif isinstance(exc, IOError):
                self.logger.error("Unable to read '%s': %s",
                                  options['env'], exc.strerror)
                msg = "Infrastructure error: cannot open '%s': %s" % \
                      (options['env'], exc.strerror)
            elif isinstance(exc, yaml.YAMLError):
                self.logger.error("Unable to parse job definition: %s", exc)
                msg = "Infrastructure error: cannot parse job definition: %s" % \
                      exc
            else:
                self.logger.exception(exc)
                msg = "Infrastructure error: %s" % exc.message

            self.logger.error("[%d] INCOMPLETE job", job.id)
            fail_job(job=job, fail_msg=msg, job_status=TestJob.INCOMPLETE)
    return True
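# A sketch of the get_env_string() helper used in this revision and the one
# below, assuming it simply returns the content of the configured env file,
# or an empty string when no file is present. This is an assumption; the
# newer revisions above replace it with load_optional_yaml_file().
import os


def get_env_string(filename):
    if not filename or not os.path.exists(filename):
        return ''
    with open(filename, 'r') as f_in:
        return f_in.read()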
def process_jobs(self, options):
    for job in TestJob.objects.filter(
            Q(status=TestJob.SUBMITTED) & Q(is_pipeline=True) & ~Q(actual_device=None))\
            .order_by('-health_check', '-priority', 'submit_time', 'target_group', 'id'):
        if job.dynamic_connection:
            # A secondary connection must be made from a dispatcher local to the host device
            # to allow for local firewalls etc. So the secondary connection is started on the
            # remote worker of the "nominated" host.
            # FIXME:
            worker_host = job.lookup_worker
            self.logger.info("[%d] START => %s (connection)",
                             job.id, worker_host.hostname)
        else:
            device = select_device(job, self.dispatchers)
            if not device:
                return False
            # selecting device can change the job
            job = TestJob.objects.get(id=job.id)
            self.logger.info("[%d] Assigning %s device", job.id, device)
            if job.actual_device is None:
                device = job.requested_device
                if not device.worker_host:
                    msg = "Infrastructure error: Invalid worker information"
                    self.logger.error("[%d] %s", job.id, msg)
                    fail_job(job, msg, TestJob.INCOMPLETE)
                    return False

                # Launch the job
                create_job(job, device)
                self.logger.info("[%d] START => %s (%s)",
                                 job.id, device.worker_host.hostname,
                                 device.hostname)
                worker_host = device.worker_host
            else:
                device = job.actual_device
                if not device.worker_host:
                    msg = "Infrastructure error: Invalid worker information"
                    self.logger.error("[%d] %s", job.id, msg)
                    fail_job(job, msg, TestJob.INCOMPLETE)
                    return False
                self.logger.info("[%d] START => %s (%s) (retrying)",
                                 job.id, device.worker_host.hostname,
                                 device.hostname)
                worker_host = device.worker_host
        try:
            # Load job definition to get the variables for template
            # rendering
            job_def = yaml.load(job.definition)
            job_ctx = job_def.get('context', {})

            # Load device configuration
            device_configuration = None \
                if job.dynamic_connection else device.load_device_configuration(job_ctx)

            if job.is_multinode:
                for group_job in job.sub_jobs_list:
                    if group_job.dynamic_connection:
                        # to get this far, the rest of the multinode group must also be ready
                        # so start the dynamic connections
                        # FIXME: rationalise and streamline
                        self.controler.send_multipart(
                            [str(worker_host.hostname),
                             'START', str(group_job.id),
                             self.export_definition(group_job),
                             str(device_configuration),
                             get_env_string(options['env']),
                             get_env_string(options['env_dut'])])

            self.controler.send_multipart(
                [str(worker_host.hostname),
                 'START', str(job.id),
                 self.export_definition(job),
                 str(device_configuration),
                 get_env_string(options['env']),
                 get_env_string(options['env_dut'])])

        except (jinja2.TemplateError, IOError, yaml.YAMLError) as exc:
            if isinstance(exc, jinja2.TemplateNotFound):
                self.logger.error("Template not found: '%s'", exc.message)
                msg = "Infrastructure error: Template not found: '%s'" % \
                      exc.message
            elif isinstance(exc, jinja2.TemplateSyntaxError):
                self.logger.error("Template syntax error in '%s', line %d: %s",
                                  exc.name, exc.lineno, exc.message)
                msg = "Infrastructure error: Template syntax error in '%s', line %d: %s" % \
                      (exc.name, exc.lineno, exc.message)
            elif isinstance(exc, IOError):
                self.logger.error("Unable to read '%s': %s",
                                  options['env'], exc.strerror)
                msg = "Infrastructure error: cannot open '%s': %s" % \
                      (options['env'], exc.strerror)
            elif isinstance(exc, yaml.YAMLError):
                self.logger.error("Unable to parse job definition: %s", exc)
                msg = "Infrastructure error: cannot parse job definition: %s" % \
                      exc
            else:
                self.logger.exception(exc)
                msg = "Infrastructure error: %s" % exc.message

            self.logger.error("[%d] INCOMPLETE job", job.id)
            job.status = TestJob.INCOMPLETE
            if job.dynamic_connection:
                job.failure_comment = msg
                job.save()
            else:
                new_status = Device.IDLE
                device.state_transition_to(
                    new_status, message=msg, job=job)
                device.status = new_status
                device.current_job = None
                job.failure_comment = msg
                job.save()
                device.save()
    return True
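# A hedged sketch of the select_device() contract relied on by every
# process_jobs() revision above: given a SUBMITTED pipeline job and the dict
# of known dispatchers, return a schedulable Device or None, e.g. when the
# worker is offline or other jobs in a MultiNode group are not yet Reserved.
# The body is illustrative only; the real scheduler logic is far richer.
def select_device(job, dispatchers):
    device = job.actual_device or job.requested_device
    if device is None or not device.worker_host:
        return None
    if device.worker_host.hostname not in dispatchers:
        # the worker for this device is not currently connected
        return None
    if not dispatchers[device.worker_host.hostname].online:
        return None
    return device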