def run(self, round_number): """Run the testcase once.""" app_directory = environment.get_value('APP_DIR') warmup_timeout = environment.get_value('WARMUP_TIMEOUT') run_timeout = warmup_timeout if round_number == 1 else self._test_timeout if self._is_black_box: return_code, crash_time, output = process_handler.run_process( self._command, timeout=run_timeout, gestures=self._gestures, current_working_directory=app_directory) else: result = engine_reproduce(self._engine_impl, self._fuzz_target.binary, self._testcase_path, self._arguments, run_timeout) return_code = result.return_code crash_time = result.time_executed log_header = engine_common.get_log_header( result.command, environment.get_value('BOT_NAME'), result.time_executed) output = log_header + '\n' + result.output process_handler.terminate_stale_application_instances() crash_result = CrashResult(return_code, crash_time, output) if not crash_result.is_crash(): logs.log('No crash occurred (round {round_number}).'.format( round_number=round_number), output=output) return crash_result
def _pre_run_cleanup(self): """Common cleanup before running a testcase.""" # Cleanup any existing application instances and user profile directories. # Cleaning up temp user profile directories. Should be done before calling # |get_command_line_for_application| call since that creates dependencies in # the profile folder. process_handler.terminate_stale_application_instances() shell.clear_temp_directory()
def beat(previous_state, log_filename): """Run a cycle of heartbeat checks to ensure bot is running.""" # Handle case when run_bot.py script is stuck. If yes, kill its process. task_end_time = tasks.get_task_end_time() if psutil and task_end_time and dates.time_has_expired( task_end_time, seconds=tasks.TASK_COMPLETION_BUFFER): # Get absolute path to |run_bot| script. We use this to identify unique # instances of bot running on a particular host. startup_scripts_directory = environment.get_startup_scripts_directory() bot_file_path = os.path.join(startup_scripts_directory, 'run_bot') for process in psutil.process_iter(): try: command_line = ' '.join(process.cmdline()) except (psutil.AccessDenied, psutil.NoSuchProcess, OSError): sys.exc_clear() continue # Find the process running the main bot script. if bot_file_path not in command_line: continue process_id = process.pid logs.log('Killing stale bot (pid %d) which seems to have stuck.' % process_id) try: process_handler.terminate_root_and_child_processes(process_id) except Exception: logs.log_error('Failed to terminate stale bot processes.') # Minor cleanup to avoid disk space issues on bot restart. process_handler.terminate_stale_application_instances() shell.clear_temp_directory() shell.clear_testcase_directories() # Concerned stale processes should be killed. Now, delete the stale task. tasks.track_task_end() # Figure out when the log file was last modified. try: current_state = str(os.path.getmtime(log_filename)) except Exception: current_state = None logs.log('Old state %s, current state %s.' % (previous_state, current_state)) # Only update the heartbeat if the log file was modified. if current_state and current_state != previous_state: # Try updating the heartbeat. If an error occurs, just # wait and return None. if not data_handler.update_heartbeat(): return None # Heartbeat is successfully updated. return current_state
def store_testcase_dependencies_from_bundled_testcase_archive( metadata, testcase, testcase_file_path): """Store testcase dependencies from a bundled testcase archive (with multiple testcases).""" # Nothing to do if this is not a bundled testcase archive with # multiple testcase. if not metadata.bundled: return 1 # Cleanup, before re-running the app to get resource list. process_handler.terminate_stale_application_instances() shell.clear_temp_directory() # Increase the test timeout to warmup timeout. This is needed # since we use warmup timeout in test_for_crash_with_retries. warmup_timeout = environment.get_value('WARMUP_TIMEOUT') environment.set_value('TEST_TIMEOUT', warmup_timeout) env_copy = environment.copy() crash_queue = process_handler.get_queue() thread_index = 0 thread = process_handler.get_process()( target=testcase_manager.run_testcase_and_return_result_in_queue, args=(crash_queue, thread_index, testcase_file_path, testcase.gestures, env_copy)) thread.start() thread.join(warmup_timeout) if thread.is_alive(): try: thread.terminate() except: logs.log_error('Process termination failed or was not needed.') # We should be able to reproduce this reproducible crash. If not, # something wrong happened and we would just try to redo task. if not crash_queue.empty(): crash = crash_queue.get() fuzzed_key, archived, absolute_path, archive_filename = ( setup.archive_testcase_and_dependencies_in_gcs(crash.resource_list, testcase_file_path)) if archived: testcase.archive_state = data_types.ArchiveStatus.FUZZED else: testcase.archive_state = 0 testcase.fuzzed_keys = fuzzed_key metadata.blobstore_key = fuzzed_key testcase.absolute_path = absolute_path if archive_filename: testcase.archive_filename = archive_filename metadata.filename = os.path.basename(archive_filename) else: metadata.filename = os.path.basename(testcase_file_path) metadata.bundled = False metadata.put() else: logs.log_error('Could not get crash data from queue. Retrying task.') tasks.add_task('analyze', testcase.key.id(), testcase.job_type) return None return 1
def test_terminate_stale_application_instances(self): """Test terminating stale application instances.""" # TODO(ochang): Improve this test once we use Docker. process_handler.terminate_stale_application_instances()
def TerminateStaleApplicationInstances(self, request, context): # pylint: disable=unused-argument process_handler.terminate_stale_application_instances() return untrusted_runner_pb2.TerminateStaleApplicationInstancesResponse( )
def check_for_bad_build(job_type, crash_revision): """Return true if the build is bad, i.e. crashes on startup.""" # Check the bad build check flag to see if we want do this. if not environment.get_value('BAD_BUILD_CHECK'): return False # Create a blank command line with no file to run and no http. command = get_command_line_for_application(file_to_run='', needs_http=False) # When checking for bad builds, we use the default window size. # We don't want to pick a custom size since it can potentially cause a # startup crash and cause a build to be detected incorrectly as bad. default_window_argument = environment.get_value('WINDOW_ARG', '') if default_window_argument: command = command.replace(' %s' % default_window_argument, '') # TSAN is slow, and boots slow on first startup. Increase the warmup # timeout for this case. if environment.tool_matches('TSAN', job_type): fast_warmup_timeout = environment.get_value('WARMUP_TIMEOUT') else: fast_warmup_timeout = environment.get_value('FAST_WARMUP_TIMEOUT') # Initialize helper variables. is_bad_build = False build_run_console_output = '' app_directory = environment.get_value('APP_DIR') # Exit all running instances. process_handler.terminate_stale_application_instances() # Check if the build is bad. return_code, crash_time, output = process_handler.run_process( command, timeout=fast_warmup_timeout, current_working_directory=app_directory) crash_result = CrashResult(return_code, crash_time, output) # 1. Need to account for startup crashes with no crash state. E.g. failed to # load shared library. So, ignore state for comparison. # 2. Ignore leaks as they don't block a build from reporting regular crashes # and also don't impact regression range calculations. if (crash_result.is_crash(ignore_state=True) and not crash_result.should_ignore() and not crash_result.get_type() in ['Direct-leak', 'Indirect-leak']): is_bad_build = True build_run_console_output = utils.get_crash_stacktrace_output( command, crash_result.get_stacktrace(symbolized=True), crash_result.get_stacktrace(symbolized=False)) logs.log('Bad build for %s detected at r%d.' % (job_type, crash_revision), output=build_run_console_output) # Exit all running instances. process_handler.terminate_stale_application_instances() # Any of the conditions below indicate that bot is in a bad state and it is # not caused by the build itself. In that case, just exit. build_state = data_handler.get_build_state(job_type, crash_revision) if is_bad_build and utils.sub_string_exists_in(BAD_STATE_HINTS, output): logs.log_fatal_and_exit( 'Bad bot environment detected, exiting.', output=build_run_console_output, snapshot=process_handler.get_runtime_snapshot()) # If none of the other bots have added information about this build, # then add it now. if (build_state == data_types.BuildState.UNMARKED and not crash_result.should_ignore()): data_handler.add_build_metadata(job_type, crash_revision, is_bad_build, build_run_console_output) return is_bad_build
def execute_task(testcase_id, job_type): """Execute a symbolize command.""" # Locate the testcase associated with the id. testcase = data_handler.get_testcase_by_id(testcase_id) # We should atleast have a symbolized debug or release build. if not build_manager.has_symbolized_builds(): return data_handler.update_testcase_comment(testcase, data_types.TaskState.STARTED) # Setup testcase and its dependencies. file_list, _, testcase_file_path = setup.setup_testcase(testcase, job_type) if not file_list: return # Initialize variables. build_fail_wait = environment.get_value("FAIL_WAIT") old_crash_stacktrace = data_handler.get_stacktrace(testcase) sym_crash_type = testcase.crash_type sym_crash_address = testcase.crash_address sym_crash_state = testcase.crash_state sym_redzone = DEFAULT_REDZONE warmup_timeout = environment.get_value("WARMUP_TIMEOUT") # Decide which build revision to use. if testcase.crash_stacktrace == "Pending": # This usually happen when someone clicked the 'Update stacktrace from # trunk' button on the testcase details page. In this case, we are forced # to use trunk. No revision -> trunk build. build_revision = None else: build_revision = testcase.crash_revision # Set up a custom or regular build based on revision. build_manager.setup_build(build_revision) # Get crash revision used in setting up build. crash_revision = environment.get_value("APP_REVISION") if not build_manager.check_app_path(): testcase = data_handler.get_testcase_by_id(testcase_id) data_handler.update_testcase_comment(testcase, data_types.TaskState.ERROR, "Build setup failed") tasks.add_task( "symbolize", testcase_id, job_type, wait_time=build_fail_wait) return # ASAN tool settings (if the tool is used). # See if we can get better stacks with higher redzone sizes. # A UAF might actually turn out to be OOB read/write with a bigger redzone. if environment.tool_matches("ASAN", job_type) and testcase.security_flag: redzone = MAX_REDZONE while redzone >= MIN_REDZONE: environment.reset_current_memory_tool_options( redzone_size=testcase.redzone, disable_ubsan=testcase.disable_ubsan) process_handler.terminate_stale_application_instances() command = testcase_manager.get_command_line_for_application( testcase_file_path, needs_http=testcase.http_flag) return_code, crash_time, output = process_handler.run_process( command, timeout=warmup_timeout, gestures=testcase.gestures) crash_result = CrashResult(return_code, crash_time, output) if crash_result.is_crash() and "AddressSanitizer" in output: state = crash_result.get_symbolized_data() security_flag = crash_result.is_security_issue() if (not crash_analyzer.ignore_stacktrace(state.crash_stacktrace) and security_flag == testcase.security_flag and state.crash_type == testcase.crash_type and (state.crash_type != sym_crash_type or state.crash_state != sym_crash_state)): logs.log("Changing crash parameters.\nOld : %s, %s, %s" % (sym_crash_type, sym_crash_address, sym_crash_state)) sym_crash_type = state.crash_type sym_crash_address = state.crash_address sym_crash_state = state.crash_state sym_redzone = redzone old_crash_stacktrace = state.crash_stacktrace logs.log("\nNew : %s, %s, %s" % (sym_crash_type, sym_crash_address, sym_crash_state)) break redzone /= 2 # We should have atleast a symbolized debug or a release build. symbolized_builds = build_manager.setup_symbolized_builds(crash_revision) if not symbolized_builds or ( not build_manager.check_app_path() and not build_manager.check_app_path("APP_PATH_DEBUG")): testcase = data_handler.get_testcase_by_id(testcase_id) data_handler.update_testcase_comment(testcase, data_types.TaskState.ERROR, "Build setup failed") tasks.add_task( "symbolize", testcase_id, job_type, wait_time=build_fail_wait) return # Increase malloc_context_size to get all stack frames. Default is 30. environment.reset_current_memory_tool_options( redzone_size=sym_redzone, malloc_context_size=STACK_FRAME_COUNT, symbolize_inline_frames=True, disable_ubsan=testcase.disable_ubsan, ) # TSAN tool settings (if the tool is used). if environment.tool_matches("TSAN", job_type): environment.set_tsan_max_history_size() # Do the symbolization if supported by this application. result, sym_crash_stacktrace = get_symbolized_stacktraces( testcase_file_path, testcase, old_crash_stacktrace, sym_crash_state) # Update crash parameters. testcase = data_handler.get_testcase_by_id(testcase_id) testcase.crash_type = sym_crash_type testcase.crash_address = sym_crash_address testcase.crash_state = sym_crash_state testcase.crash_stacktrace = data_handler.filter_stacktrace( sym_crash_stacktrace) if not result: data_handler.update_testcase_comment( testcase, data_types.TaskState.ERROR, "Unable to reproduce crash, skipping " "stacktrace update", ) else: # Switch build url to use the less-optimized symbolized build with better # stacktrace. build_url = environment.get_value("BUILD_URL") if build_url: testcase.set_metadata("build_url", build_url, update_testcase=False) data_handler.update_testcase_comment(testcase, data_types.TaskState.FINISHED) testcase.symbolized = True testcase.crash_revision = crash_revision testcase.put() # We might have updated the crash state. See if we need to marked as duplicate # based on other testcases. data_handler.handle_duplicate_entry(testcase) task_creation.create_blame_task_if_needed(testcase) # Switch current directory before builds cleanup. root_directory = environment.get_value("ROOT_DIR") os.chdir(root_directory) # Cleanup symbolized builds which are space-heavy. symbolized_builds.delete()
def get_symbolized_stacktraces(testcase_file_path, testcase, old_crash_stacktrace, expected_state): """Use the symbolized builds to generate an updated stacktrace.""" # Initialize variables. app_path = environment.get_value("APP_PATH") app_path_debug = environment.get_value("APP_PATH_DEBUG") long_test_timeout = environment.get_value("WARMUP_TIMEOUT") retry_limit = environment.get_value("FAIL_RETRIES") symbolized = False debug_build_stacktrace = "" release_build_stacktrace = old_crash_stacktrace # Symbolize using the debug build first so that the debug build stacktrace # comes after the more important release build stacktrace. if app_path_debug: for _ in range(retry_limit): process_handler.terminate_stale_application_instances() command = testcase_manager.get_command_line_for_application( testcase_file_path, app_path=app_path_debug, needs_http=testcase.http_flag, ) return_code, crash_time, output = process_handler.run_process( command, timeout=long_test_timeout, gestures=testcase.gestures) crash_result = CrashResult(return_code, crash_time, output) if crash_result.is_crash(): state = crash_result.get_symbolized_data() if crash_analyzer.ignore_stacktrace(state.crash_stacktrace): continue unsymbolized_crash_stacktrace = crash_result.get_stacktrace( symbolized=False) debug_build_stacktrace = utils.get_crash_stacktrace_output( command, state.crash_stacktrace, unsymbolized_crash_stacktrace, build_type="debug", ) symbolized = True break # Symbolize using the release build. if app_path: for _ in range(retry_limit): process_handler.terminate_stale_application_instances() command = testcase_manager.get_command_line_for_application( testcase_file_path, app_path=app_path, needs_http=testcase.http_flag) return_code, crash_time, output = process_handler.run_process( command, timeout=long_test_timeout, gestures=testcase.gestures) crash_result = CrashResult(return_code, crash_time, output) if crash_result.is_crash(): state = crash_result.get_symbolized_data() if crash_analyzer.ignore_stacktrace(state.crash_stacktrace): continue if state.crash_state != expected_state: continue # Release stack's security flag has to match the symbolized release # stack's security flag. security_flag = crash_result.is_security_issue() if security_flag != testcase.security_flag: continue unsymbolized_crash_stacktrace = crash_result.get_stacktrace( symbolized=False) release_build_stacktrace = utils.get_crash_stacktrace_output( command, state.crash_stacktrace, unsymbolized_crash_stacktrace, build_type="release", ) symbolized = True break stacktrace = release_build_stacktrace if debug_build_stacktrace: stacktrace += "\n\n" + debug_build_stacktrace return symbolized, stacktrace
def check_for_bad_build(job_type, crash_revision): """Return true if the build is bad, i.e. crashes on startup.""" # Check the bad build check flag to see if we want do this. if not environment.get_value('BAD_BUILD_CHECK'): return False # Do not detect leaks while checking for bad builds. environment.reset_current_memory_tool_options(leaks=False) # Create a blank command line with no file to run and no http. command = get_command_line_for_application(file_to_run='', needs_http=False) # When checking for bad builds, we use the default window size. # We don't want to pick a custom size since it can potentially cause a # startup crash and cause a build to be detected incorrectly as bad. default_window_argument = environment.get_value('WINDOW_ARG', '') if default_window_argument: command = command.replace(' %s' % default_window_argument, '') # Warmup timeout. fast_warmup_timeout = environment.get_value('FAST_WARMUP_TIMEOUT') # TSAN is slow, and boots slow on first startup. Increase the warmup # timeout for this case. if environment.tool_matches('TSAN', job_type): fast_warmup_timeout = environment.get_value('WARMUP_TIMEOUT') # Initialize helper variables. is_bad_build = False build_run_console_output = '' output = '' app_directory = environment.get_value('APP_DIR') # Check if the build is bad. process_handler.terminate_stale_application_instances() exit_code, _, output = process_handler.run_process( command, timeout=fast_warmup_timeout, current_working_directory=app_directory) output = utils.decode_to_unicode(output) if crash_analyzer.is_crash(exit_code, output): is_bad_build = True build_run_console_output = ( '%s\n\n%s\n\n%s' % (command, stack_symbolizer.symbolize_stacktrace(output), output)) logs.log('Bad build for %s detected at r%d.' % (job_type, crash_revision), output=build_run_console_output) # Exit all running instances. process_handler.terminate_stale_application_instances() # Any of the conditions below indicate that bot is in a bad state and it is # not caused by the build itself. In that case, just exit. build_state = data_handler.get_build_state(job_type, crash_revision) if (is_bad_build and ('cannot open display' in output or 'logging service has stopped' in output or 'Maximum number of clients reached' in output)): logs.log_fatal_and_exit('Bad bot environment detected, exiting.', output=build_run_console_output) # If none of the other bots have added information about this build, # then add it now. if build_state == data_types.BuildState.UNMARKED: data_handler.add_build_metadata(job_type, crash_revision, is_bad_build, build_run_console_output) # Reset memory tool options. environment.reset_current_memory_tool_options() return is_bad_build
def test_for_reproducibility(testcase_path, expected_state, expected_security_flag, test_timeout, http_flag, gestures): """Test to see if a crash is fully reproducible or is a one-time crasher.""" # Cleanup any existing application instances and user profile directories. # Cleaning up temp clears user profile directories and should be done before # calling |get_command_line_for_application| call since that creates # dependencies in the profile folder. process_handler.terminate_stale_application_instances() shell.clear_temp_directory() app_directory = environment.get_value('APP_DIR') command = get_command_line_for_application(testcase_path, needs_http=http_flag) crash_count = 0 crash_retries = environment.get_value('CRASH_RETRIES') reproducible_crash_target_count = crash_retries * REPRODUCIBILITY_FACTOR warmup_timeout = environment.get_value('WARMUP_TIMEOUT') logs.log('Testing for crash (command="%s").' % command) round_number = 0 for round_number in xrange(1, crash_retries + 1): # Bail out early if there is no hope of finding a reproducible crash. if (crash_retries - round_number + crash_count + 1 < reproducible_crash_target_count): break run_timeout = warmup_timeout if round_number == 1 else test_timeout return_code, crash_time, output = process_handler.run_process( command, timeout=run_timeout, gestures=gestures, current_working_directory=app_directory) process_handler.terminate_stale_application_instances() crash_result = CrashResult(return_code, crash_time, output) if not crash_result.is_crash(): continue state = crash_result.get_symbolized_data() crash_state = state.crash_state security_flag = crash_result.is_security_issue() # If we don't have an expected crash state, set it to the one from initial # crash. if not expected_state: expected_state = crash_state if security_flag != expected_security_flag: logs.log('Detected a crash without the correct security flag.') continue crash_comparer = CrashComparer(crash_state, expected_state) if not crash_comparer.is_similar(): logs.log('Detected a crash with an unrelated state: ' 'Expected(%s), Found(%s).' % (expected_state, crash_state)) continue crash_count += 1 if crash_count >= reproducible_crash_target_count: logs.log('Crash is reproducible.') return True logs.log('Crash is not reproducible. Crash count: %d/%d.' % (crash_count, round_number)) return False
def test_for_crash_with_retries(testcase, testcase_path, test_timeout, http_flag=False, compare_crash=True): """Test for a crash and return crash parameters like crash type, crash state, crash stacktrace, etc.""" # Cleanup any existing application instances and user profile directories. # Cleaning up temp clears user profile directories and should be done before # calling |get_command_line_for_application| call since that creates # dependencies in the profile folder. process_handler.terminate_stale_application_instances() shell.clear_temp_directory() app_directory = environment.get_value('APP_DIR') command = get_command_line_for_application(testcase_path, needs_http=http_flag) crash_retries = environment.get_value('CRASH_RETRIES') flaky_stacktrace = testcase.flaky_stack warmup_timeout = environment.get_value('WARMUP_TIMEOUT') logs.log('Testing for crash (command="%s").' % command) for round_number in xrange(1, crash_retries + 1): run_timeout = warmup_timeout if round_number == 1 else test_timeout return_code, crash_time, output = process_handler.run_process( command, timeout=run_timeout, gestures=testcase.gestures, current_working_directory=app_directory) process_handler.terminate_stale_application_instances() crash_result = CrashResult(return_code, crash_time, output) if not crash_result.is_crash(): continue state = crash_result.get_symbolized_data() logs.log('Crash occurred in %d seconds (round %d). State:\n%s' % (crash_time, round_number, state.crash_state)) if not compare_crash or not testcase.crash_state: logs.log('Crash stacktrace comparison skipped.') return crash_result if flaky_stacktrace: logs.log('Crash stacktrace is marked flaky, skipping comparison.') return crash_result if crash_result.should_ignore(): logs.log('Crash stacktrace matched ignore signatures, ignored.') continue if crash_result.is_security_issue() != testcase.security_flag: logs.log('Crash security flag does not match, ignored.') continue crash_comparer = CrashComparer(state.crash_state, testcase.crash_state) if crash_comparer.is_similar(): logs.log('Crash stacktrace is similar to original stacktrace.') return crash_result else: logs.log('Crash stacktrace does not match original stacktrace.') logs.log("Didn't crash at all.") crash_result = CrashResult(return_code=0, crash_time=0, output=output) return crash_result