def recreate_virtual_device():
  """Recreate the virtual device from its base image.

  Reads instance identity (BOT_NAME, GCE_PROJECT, GCE_ZONE) and retry
  parameters (FAIL_RETRIES, FAIL_WAIT) from the environment, then retries
  instance recreation + remote reconnect up to the retry limit.

  Returns:
    True if the instance was recreated and the remote connection was
    re-established; False if all retries were exhausted.
  """
  device_name = environment.get_value('BOT_NAME')
  failure_wait_interval = environment.get_value('FAIL_WAIT')
  project = environment.get_value('GCE_PROJECT')
  retry_limit = environment.get_value('FAIL_RETRIES')
  zone = environment.get_value('GCE_ZONE')

  # This is needed to populate the initial /data partition. We use a separate
  # disk for /data since the one in the provided images is only 2GB.
  preimage_metadata_value = environment.get_value('GCE_DATA_PREIMAGE_METADATA')
  if preimage_metadata_value:
    additional_metadata = {GCE_PREIMAGE_METADATA_KEY: preimage_metadata_value}
  else:
    additional_metadata = None

  # Use range (not the Python 2-only xrange) for consistency with the rest of
  # this file (see needs_update) and Python 3 compatibility.
  for _ in range(retry_limit):
    if compute_engine.recreate_instance_with_disks(
        device_name,
        project,
        zone,
        additional_metadata=additional_metadata,
        wait_for_completion=True):
      # Instance recreation succeeded. Try reconnecting after some wait.
      time.sleep(REMOTE_RECREATE_TIMEOUT)

      if connect_remote(reconnect=True, num_retries=REMOTE_CONNECT_RETRIES * 2):
        # We were able to successfully reconnect to device after recreation.
        return True

    # Randomized backoff before the next attempt to avoid thundering herd.
    time.sleep(utils.random_number(1, failure_wait_interval))

  logs.log_error('Failed to reimage device.')
  return False
def task_loop(): """Executes tasks indefinitely.""" # Defer heavy task imports to prevent issues with multiprocessing.Process from bot.tasks import commands clean_exit = False while True: stacktrace = '' exception_occurred = False task = None # This caches the current environment on first run. Don't move this. environment.reset_environment() try: # Run regular updates. update_task.run() update_task.track_revision() task = tasks.get_task() if not task: continue with _Monitor(task): with task.lease(): # Execute the command and delete the task. commands.process_command(task) except SystemExit as e: exception_occurred = True clean_exit = (e.code == 0) if not clean_exit and not isinstance(e, untrusted.HostException): logs.log_error('SystemExit occurred while working on task.') stacktrace = traceback.format_exc() except commands.AlreadyRunningError: exception_occurred = False except Exception: logs.log_error('Error occurred while working on task.') exception_occurred = True stacktrace = traceback.format_exc() if exception_occurred: # Prevent looping too quickly. See: crbug.com/644830 failure_wait_interval = environment.get_value('FAIL_WAIT') time.sleep(utils.random_number(1, failure_wait_interval)) break task_payload = task.payload() if task else None return stacktrace, clean_exit, task_payload
def needs_update(revision_file, revision):
  """Check a revision file against the provided revision to see if an update
  is required.

  Args:
    revision_file: Path to the file holding the currently-deployed revision.
    revision: The candidate revision to compare against.

  Returns:
    True if the data bundle should be updated (file missing, or provided
    revision is newer/different); False if it is up to date or the file could
    not be read despite retries.
  """
  failure_wait_interval = environment.get_value('FAIL_WAIT')
  file_exists = False
  retry_limit = environment.get_value('FAIL_RETRIES')

  for _ in range(retry_limit):
    # NFS can sometimes return a wrong result on file existence, so redo
    # this check a couple of times to be sure.
    if not os.path.exists(revision_file):
      file_exists = False
      time.sleep(15)
      continue

    # Found the file, now try to read its contents.
    file_exists = True

    try:
      # Context manager ensures the handle is closed even if read() raises
      # (the previous open/read/close sequence leaked the handle on error).
      with open(revision_file, 'r') as file_handle:
        current_revision = file_handle.read()
    except Exception:
      # Narrowed from a bare `except:`, which would also have swallowed
      # KeyboardInterrupt and SystemExit.
      logs.log_error(
          'Error occurred while reading revision file %s.' % revision_file)
      time.sleep(utils.random_number(1, failure_wait_interval))
      continue

    if current_revision.isdigit():
      return int(revision) > int(current_revision)

    # Non-numeric revisions: any difference means an update is needed.
    # NOTE(review): a trailing newline in the file would make isdigit() fail
    # and force this string comparison — assumed the writer emits no newline.
    return str(revision) != str(current_revision)

  # If there is no revision file or if we have lost track of its revision,
  # then we do need to update the data bundle.
  if not file_exists:
    return True

  # An error has occurred and we have failed to read revision file
  # despite several retries. So, don't bother updating the data
  # bundle as it will probably fail as well.
  logs.log_error('Failed to read revision file, exiting.')
  return False
def update_task_status(task_name, status, expiry_interval=None):
  """Updates status for a task. Used to ensure that a single instance of a
  task is running at any given time.

  Args:
    task_name: Name/key of the task whose status entity is updated.
    status: New status value (a data_types.TaskState member).
    expiry_interval: Seconds after which a STARTED status is considered
        stale; defaults to the TASK_LEASE_SECONDS environment value.

  Returns:
    True if the status was written; False if another bot holds an unexpired
    STARTED status for the same task.
  """
  bot_name = environment.get_value('BOT_NAME')
  failure_wait_interval = environment.get_value('FAIL_WAIT')

  # If we didn't get an expiry interval, default to our task lease interval.
  if expiry_interval is None:
    expiry_interval = environment.get_value('TASK_LEASE_SECONDS')
    if expiry_interval is None:
      logs.log_error('expiry_interval is None and TASK_LEASE_SECONDS not set.')

  def _try_update_status():
    """Try update metadata."""
    task_status = get_task_status(task_name, create_if_needed=True)

    # If another bot is already working on this task, bail out with error.
    if (status == data_types.TaskState.STARTED and
        task_status.status == data_types.TaskState.STARTED and
        not dates.time_has_expired(
            task_status.time, seconds=expiry_interval - 1)):
      return False

    task_status.bot_name = bot_name
    task_status.status = status
    task_status.time = utils.utcnow()
    task_status.put()
    return True

  # It is important that we do not continue until the metadata is updated.
  # This can lead to task loss, or can cause issues with multiple bots
  # attempting to run the task at the same time.
  while True:
    try:
      # retries=0: retry manually below rather than via ndb's own retry, so
      # every failure gets logged and backed off.
      return ndb.transaction(_try_update_status, retries=0)
    except Exception:
      # We need to update the status under all circumstances.
      # Failing to update 'completed' status causes another bot
      # that picked up this job to bail out.
      logs.log_error('Unable to update %s task metadata. Retrying.' % task_name)
      time.sleep(utils.random_number(1, failure_wait_interval))
def process_command(task):
  """Figures out what to do with the given task and executes the command.

  Validates the task's job and platform, handles platform/queue mismatches by
  re-queueing, applies job/fuzzer environment overrides, then runs the task
  command with cleanup before and after.
  """
  logs.log("Executing command '%s'" % task.payload())
  if not task.payload().strip():
    logs.log_error('Empty task received.')
    return

  # Parse task payload.
  task_name = task.command
  task_argument = task.argument
  job_name = task.job

  environment.set_value('TASK_NAME', task_name)
  environment.set_value('TASK_ARGUMENT', task_argument)
  environment.set_value('JOB_NAME', job_name)
  if job_name != 'none':
    job = data_types.Job.query(data_types.Job.name == job_name).get()
    # Job might be removed. In that case, we don't want an exception
    # raised and causing this task to be retried by another bot.
    if not job:
      logs.log_error("Job '%s' not found." % job_name)
      return

    if not job.platform:
      error_string = "No platform set for job '%s'" % job_name
      logs.log_error(error_string)
      raise errors.BadStateError(error_string)

    # A misconfiguration led to this point. Clean up the job if necessary.
    job_queue_suffix = tasks.queue_suffix_for_platform(job.platform)
    bot_queue_suffix = tasks.default_queue_suffix()

    if job_queue_suffix != bot_queue_suffix:
      # This happens rarely, store this as a hard exception.
      logs.log_error(
          'Wrong platform for job %s: job queue [%s], bot queue [%s].' %
          (job_name, job_queue_suffix, bot_queue_suffix))

      # Try to recreate the job in the correct task queue.
      new_queue = (
          tasks.high_end_queue() if task.high_end else tasks.regular_queue())
      new_queue += job_queue_suffix

      # Command override is continuously run by a bot. If we keep failing
      # and recreating the task, it will just DoS the entire task queue.
      # So, we don't create any new tasks in that case since it needs
      # manual intervention to fix the override anyway.
      if not task.is_command_override:
        try:
          tasks.add_task(task_name, task_argument, job_name, new_queue)
        except Exception:
          # This can happen on trying to publish on a non-existent topic, e.g.
          # a topic for a high-end bot on another platform. In this case, just
          # give up.
          logs.log_error('Failed to fix platform and re-add task.')

      # Add a wait interval to avoid overflowing task creation.
      failure_wait_interval = environment.get_value('FAIL_WAIT')
      time.sleep(failure_wait_interval)
      return

    if task_name != 'fuzz':
      # Make sure that our platform id matches that of the testcase (for
      # non-fuzz tasks).
      testcase = data_handler.get_entity_by_type_and_id(
          data_types.Testcase, task_argument)
      if testcase:
        current_platform_id = environment.get_platform_id()
        testcase_platform_id = testcase.platform_id

        # This indicates we are trying to run this job on the wrong platform.
        # This can happen when you have different type of devices (e.g
        # android) on the same platform group. In this case, we just recreate
        # the task.
        if (task_name != 'variant' and testcase_platform_id and
            not utils.fields_match(
                testcase_platform_id, current_platform_id)):
          logs.log(
              'Testcase %d platform (%s) does not match with ours (%s), exiting'
              % (testcase.key.id(), testcase_platform_id, current_platform_id))
          tasks.add_task(
              task_name,
              task_argument,
              job_name,
              wait_time=utils.random_number(1, TASK_RETRY_WAIT_LIMIT))
          return

    # Some fuzzers contain additional environment variables that should be
    # set for them. Append these for tests generated by these fuzzers and for
    # the fuzz command itself.
    fuzzer_name = None
    if task_name == 'fuzz':
      fuzzer_name = task_argument
    elif testcase:
      fuzzer_name = testcase.fuzzer_name

    # Get job's environment string.
    environment_string = job.get_environment_string()

    if task_name == 'minimize':
      # Let jobs specify a different job and fuzzer to minimize with.
      job_environment = job.get_environment()
      minimize_job_override = job_environment.get('MINIMIZE_JOB_OVERRIDE')
      if minimize_job_override:
        minimize_job = data_types.Job.query(
            data_types.Job.name == minimize_job_override).get()
        if minimize_job:
          environment.set_value('JOB_NAME', minimize_job_override)
          environment_string = minimize_job.get_environment_string()
          environment_string += '\nORIGINAL_JOB_NAME = %s\n' % job_name
          job_name = minimize_job_override
        else:
          logs.log_error(
              'Job for minimization not found: %s.' % minimize_job_override)
          # Fallback to using own job for minimization.

      minimize_fuzzer_override = job_environment.get('MINIMIZE_FUZZER_OVERRIDE')
      fuzzer_name = minimize_fuzzer_override or fuzzer_name

    if fuzzer_name and not environment.is_engine_fuzzer_job(job_name):
      fuzzer = data_types.Fuzzer.query(
          data_types.Fuzzer.name == fuzzer_name).get()
      additional_default_variables = ''
      additional_variables_for_job = ''
      if (fuzzer and hasattr(fuzzer, 'additional_environment_string') and
          fuzzer.additional_environment_string):
        for line in fuzzer.additional_environment_string.splitlines():
          # Job specific values may be defined in fuzzer additional
          # environment variable name strings in the form
          # job_name:VAR_NAME = VALUE.
          if '=' in line and ':' in line.split('=', 1)[0]:
            fuzzer_job_name, environment_definition = line.split(':', 1)
            if fuzzer_job_name == job_name:
              additional_variables_for_job += '\n%s' % environment_definition
            # Job-prefixed lines never contribute to the defaults.
            continue

          additional_default_variables += '\n%s' % line

      environment_string += additional_default_variables
      environment_string += additional_variables_for_job

    # Update environment for the job.
    update_environment_for_job(environment_string)

  # Match the cpu architecture with the ones required in the job definition.
  # If they don't match, then bail out and recreate task.
  if not is_supported_cpu_arch_for_job():
    logs.log(
        'Unsupported cpu architecture specified in job definition, exiting.')
    tasks.add_task(
        task_name,
        task_argument,
        job_name,
        wait_time=utils.random_number(1, TASK_RETRY_WAIT_LIMIT))
    return

  # Initial cleanup.
  cleanup_task_state()

  start_web_server_if_needed()

  try:
    run_command(task_name, task_argument, job_name)
  finally:
    # Final clean up.
    cleanup_task_state()