def upload_testcases_if_needed(fuzzer_name, testcase_list, testcase_directory):
  """Upload test cases from the list to a cloud storage bucket."""
  bucket_name = local_config.ProjectConfig().get(
      'coverage.fuzzer-testcases.bucket')
  if not bucket_name:
    return

  # Only consider test cases in the output directory. We might upload too much
  # if we search the data directory as well, or have missing resources.
  # TODO(mbarbella): Support resources in data bundles.
  testcase_list = [
      os.path.relpath(testcase, testcase_directory)
      for testcase in testcase_list
      if testcase.startswith(testcase_directory)
  ]
  if not testcase_list:
    return

  # Bail out if this batch of test cases is too large.
  directory_size = shell.get_directory_size(testcase_directory)
  if directory_size >= MAX_TESTCASE_DIRECTORY_SIZE:
    return

  formatted_date = str(utils.utcnow().date())
  gcs_base_url = 'gs://{bucket_name}/{date}/{fuzzer_name}/'.format(
      bucket_name=bucket_name, date=formatted_date, fuzzer_name=fuzzer_name)

  runner = gsutil.GSUtilRunner()
  batch_directory_blobs = storage.list_blobs(gcs_base_url)
  total_testcases = 0
  for blob in batch_directory_blobs:
    if not blob.endswith(LIST_FILE_BASENAME):
      continue

    list_gcs_url = 'gs://{bucket}/{blob}'.format(bucket=bucket_name, blob=blob)
    data = storage.read_data(list_gcs_url)
    if not data:
      logs.log_error('Read no data from test case list at {gcs_url}'.format(
          gcs_url=list_gcs_url))
      continue

    total_testcases += len(data.splitlines())

    # If we've already uploaded enough test cases for this fuzzer today,
    # return.
    if total_testcases >= TESTCASES_PER_DAY:
      return

  # Upload each batch of tests to its own unique sub-bucket.
  identifier = environment.get_value('BOT_NAME') + str(utils.utcnow())
  gcs_base_url += utils.string_hash(identifier)

  list_gcs_url = gcs_base_url + '/' + LIST_FILE_BASENAME
  if not storage.write_data('\n'.join(testcase_list), list_gcs_url):
    return

  runner.rsync(testcase_directory, gcs_base_url)
  logs.log('Synced {count} test cases to {gcs_url}'.format(
      count=len(testcase_list), gcs_url=gcs_base_url))
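# A minimal usage sketch of the helper above. The fuzzer name and paths are
# hypothetical placeholders; in practice they come from the fuzzing session.
# Only test cases under |testcase_directory| are considered, and each batch
# lands in its own hashed sub-bucket:
# gs://<bucket>/<date>/<fuzzer_name>/<batch_hash>/.
#
#   testcases = ['/bot/output/fuzz-0', '/bot/output/fuzz-1']
#   upload_testcases_if_needed(
#       fuzzer_name='example_fuzzer',
#       testcase_list=testcases,
#       testcase_directory='/bot/output')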
def test(self):
  """Test get_directory_size."""
  self.fs.CreateFile('/test/aa/bb.txt', contents='abc')
  self.fs.CreateFile('/test/aa/cc.txt', contents='def')
  self.fs.CreateFile('/test/aa/aa/aa.txt', contents='ghi')
  self.fs.CreateFile('/test/aa/aa/dd.txt', contents='t')

  self.assertEqual(shell.get_directory_size('/test/aa'), 10)
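# A minimal sketch of the recursive directory-size walk that the test above
# exercises (3 + 3 + 3 + 1 = 10 bytes across nested directories). This is an
# illustrative reimplementation, not the actual shell.get_directory_size; it
# assumes plain files and skips entries that cannot be read.
import os


def get_directory_size_sketch(directory):
  """Return the total size in bytes of all files under |directory|."""
  total_size = 0
  for root, _, files in os.walk(directory):
    for filename in files:
      file_path = os.path.join(root, filename)
      try:
        total_size += os.path.getsize(file_path)
      except OSError:
        # Skip files that disappear or become unreadable mid-walk.
        continue
  return total_size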
def unpack_crash_testcases(crash_testcases_directory):
  """Unpacks the old crash testcases in the provided directory."""
  for testcase in ndb_utils.get_all_from_model(data_types.Testcase):
    testcase_id = testcase.key.id()

    # 1. If we have already stored the testcase, then just skip.
    if testcase_id in STORED_TESTCASES_LIST:
      continue

    # 2. Make sure that it is a unique crash testcase. Ignore duplicates and
    # uploaded repros.
    if testcase.status != 'Processed':
      continue

    # 3. Check if the testcase is fixed. If not, skip.
    if testcase.open:
      continue

    # 4. Check if the testcase has a minimized repro. If not, skip.
    if not testcase.minimized_keys or testcase.minimized_keys == 'NA':
      continue

    # 5. Only use testcases that have bugs associated with them.
    if not testcase.bug_information:
      continue

    # 6. Existing IPC testcases are uninteresting and unused in further
    # mutations. Due to size bloat, ignoring these for now.
    if testcase.absolute_path.endswith(testcase_manager.IPCDUMP_EXTENSION):
      continue

    # 7. Ignore testcases that are archives (e.g. Langfuzz fuzzer tests).
    if archive.get_archive_type(testcase.absolute_path):
      continue

    # 8. Skip in-process fuzzer testcases, since these are only applicable to
    # fuzz targets and don't run with blackbox binaries.
    if testcase.fuzzer_name and testcase.fuzzer_name in ['afl', 'libFuzzer']:
      continue

    # Unpack testcase.
    try:
      _, input_directory, _ = setup.unpack_testcase(testcase)
    except Exception:
      logs.log_error('Failed to unpack testcase %d.' % testcase.key.id())
      continue

    # Move this to our crash testcases directory.
    crash_testcase_directory = os.path.join(crash_testcases_directory,
                                            str(testcase_id))
    shell.move(input_directory, crash_testcase_directory)

    # Re-create input directory for unpacking testcase in next iteration.
    shell.create_directory(input_directory)

    STORED_TESTCASES_LIST.append(testcase_id)

  # Remove testcase directories that exceed the max size limit.
  for directory_name in os.listdir(crash_testcases_directory):
    directory_path = os.path.join(crash_testcases_directory, directory_name)
    if not os.path.isdir(directory_path):
      continue

    if shell.get_directory_size(directory_path) <= MAX_TESTCASE_DIRECTORY_SIZE:
      continue

    shell.remove_directory(directory_path)

  # Rename all fuzzed testcase files as regular files.
  for root, _, files in os.walk(crash_testcases_directory):
    for filename in files:
      if not filename.startswith(testcase_manager.FUZZ_PREFIX):
        continue

      file_path = os.path.join(root, filename)
      stripped_file_name = os.path.basename(
          file_path)[len(testcase_manager.FUZZ_PREFIX):]
      stripped_file_path = os.path.join(
          os.path.dirname(file_path), stripped_file_name)
      try:
        os.rename(file_path, stripped_file_path)
      except OSError as e:
        raise Exception('Failed to rename testcase %s.' % file_path) from e

  # Remove empty files and dirs to avoid the case where a fuzzer randomly
  # chooses an empty dir/file and generates zero testcases.
  shell.remove_empty_files(crash_testcases_directory)
  shell.remove_empty_directories(crash_testcases_directory)
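# The eight numbered checks above can be read as a single eligibility
# predicate. An illustrative restatement, not code from the original module;
# it uses the same Testcase fields and helpers referenced above:
def _is_eligible_crash_testcase(testcase):
  """Return True if a testcase qualifies for unpacking."""
  return (testcase.key.id() not in STORED_TESTCASES_LIST and
          testcase.status == 'Processed' and
          not testcase.open and
          testcase.minimized_keys and testcase.minimized_keys != 'NA' and
          bool(testcase.bug_information) and
          not testcase.absolute_path.endswith(
              testcase_manager.IPCDUMP_EXTENSION) and
          not archive.get_archive_type(testcase.absolute_path) and
          testcase.fuzzer_name not in ('afl', 'libFuzzer'))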
def do_corpus_pruning(context, last_execution_failed, revision):
  """Run corpus pruning."""
  # Set |FUZZ_TARGET| environment variable to help with unarchiving only fuzz
  # target and its related files.
  environment.set_value("FUZZ_TARGET", context.fuzz_target.binary)

  if environment.is_trusted_host():
    from bot.untrusted_runner import tasks_host
    return tasks_host.do_corpus_pruning(context, last_execution_failed,
                                        revision)

  build_manager.setup_build(revision=revision)
  build_directory = environment.get_value("BUILD_DIR")
  if not build_directory:
    raise CorpusPruningException("Failed to setup build.")

  start_time = datetime.datetime.utcnow()
  runner = Runner(build_directory, context)
  pruner = CorpusPruner(runner)
  fuzzer_binary_name = os.path.basename(runner.target_path)

  # If our last execution failed, shrink to a randomized corpus of usable size
  # to prevent the corpus from growing unbounded and failures from recurring
  # when trying to minimize it.
  if last_execution_failed:
    for corpus_url in [
        context.corpus.get_gcs_url(),
        context.quarantine_corpus.get_gcs_url(),
    ]:
      _limit_corpus_size(corpus_url, CORPUS_SIZE_LIMIT_FOR_FAILURES)

  # Get initial corpus to process from GCS.
  context.sync_to_disk()
  initial_corpus_size = shell.get_directory_file_count(
      context.initial_corpus_path)

  # Restore a small batch of quarantined units back to corpus.
  context.restore_quarantined_units()

  # Shrink to a minimized corpus using corpus merge.
  pruner.run(
      context.initial_corpus_path,
      context.minimized_corpus_path,
      context.bad_units_path,
  )

  # Sync minimized corpus back to GCS.
  context.sync_to_gcs()

  # Create corpus backup.
  backup_bucket = environment.get_value("BACKUP_BUCKET")
  corpus_backup_url = corpus_manager.backup_corpus(
      backup_bucket, context.corpus, context.minimized_corpus_path)

  minimized_corpus_size_units = shell.get_directory_file_count(
      context.minimized_corpus_path)
  minimized_corpus_size_bytes = shell.get_directory_size(
      context.minimized_corpus_path)

  logs.log("Corpus pruned from %d to %d units." %
           (initial_corpus_size, minimized_corpus_size_units))

  # Process bad units found during merge.
  # Mapping of crash state -> CorpusCrash
  crashes = {}
  pruner.process_bad_units(context.bad_units_path,
                           context.quarantine_corpus_path, crashes)
  context.quarantine_corpus.rsync_from_disk(context.quarantine_corpus_path)

  # Store corpus stats into CoverageInformation entity.
  project_qualified_name = context.fuzz_target.project_qualified_name()
  today = datetime.datetime.utcnow().date()
  coverage_info = data_types.CoverageInformation(
      fuzzer=project_qualified_name, date=today)

  quarantine_corpus_size = shell.get_directory_file_count(
      context.quarantine_corpus_path)
  quarantine_corpus_dir_size = shell.get_directory_size(
      context.quarantine_corpus_path)

  # Populate coverage stats.
  coverage_info.corpus_size_units = minimized_corpus_size_units
  coverage_info.corpus_size_bytes = minimized_corpus_size_bytes
  coverage_info.quarantine_size_units = quarantine_corpus_size
  coverage_info.quarantine_size_bytes = quarantine_corpus_dir_size
  coverage_info.corpus_backup_location = corpus_backup_url
  coverage_info.corpus_location = context.corpus.get_gcs_url()
  coverage_info.quarantine_location = context.quarantine_corpus.get_gcs_url()

  # Calculate remaining time to use for shared corpus merging.
  time_remaining = _get_time_remaining(start_time)
  if time_remaining <= 0:
    logs.log_warn("Not enough time for shared corpus merging.")
    return None

  cross_pollinator = CrossPollinator(runner)
  cross_pollinator.run(time_remaining)

  context.sync_to_gcs()

  # Update corpus size stats.
  minimized_corpus_size_units = shell.get_directory_file_count(
      context.minimized_corpus_path)
  minimized_corpus_size_bytes = shell.get_directory_size(
      context.minimized_corpus_path)
  coverage_info.corpus_size_units = minimized_corpus_size_units
  coverage_info.corpus_size_bytes = minimized_corpus_size_bytes

  logs.log("Finished.")

  result = CorpusPruningResult(
      coverage_info=coverage_info,
      crashes=list(crashes.values()),
      fuzzer_binary_name=fuzzer_binary_name,
      revision=environment.get_value("APP_REVISION"),
  )
  return result
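# A minimal sketch of the _get_time_remaining helper used above, assuming the
# pruning task runs against a fixed wall-clock budget. CORPUS_PRUNING_TIMEOUT
# is a hypothetical constant here; the real budget comes from the task
# configuration.
import datetime

CORPUS_PRUNING_TIMEOUT = 22 * 60 * 60  # Hypothetical budget in seconds.


def _get_time_remaining_sketch(start_time):
  """Return seconds left in the pruning budget (<= 0 when exhausted)."""
  time_used = (datetime.datetime.utcnow() - start_time).total_seconds()
  return CORPUS_PRUNING_TIMEOUT - time_used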
def execute(input_directory, output_directory, fuzzer_name,
            generation_timeout):
  """Execute ML RNN generator to produce new inputs.

  This method should be called inside the launcher to generate a number of
  new inputs based on the ML RNN model. It will fetch the ML model from the
  GCS bucket specified in the environment variable `CORPUS_BUCKET`. The
  script to run the model resides in the folder `tools/fuzzers/ml/rnn`.

  Args:
    input_directory: Seed corpus path. The directory should not be empty.
    output_directory: The directory to place generated inputs.
    fuzzer_name: Name of the fuzzer, e.g. libpng_read_fuzzer. It indicates
        the subdirectory in the GCS bucket where models are stored.
    generation_timeout: Time in seconds for the generator to run. Normally it
        takes <1s to generate an input, assuming the input length is <4KB.
  """
  if environment.platform() != 'LINUX':
    logs.log('Unsupported platform for ML RNN generation, skipping.')
    return

  # Validate corpus folder.
  file_count = shell.get_directory_file_count(input_directory)
  if not file_count:
    logs.log('Corpus is empty. Skip generation.')
    return

  # Number of existing new inputs. They are possibly generated by other
  # generators.
  old_corpus_units = shell.get_directory_file_count(output_directory)
  old_corpus_bytes = shell.get_directory_size(output_directory)

  # Get model path.
  model_path = prepare_model_directory(fuzzer_name)
  if not model_path:
    return

  result = run(input_directory, output_directory, model_path,
               generation_timeout)

  # The generation process exited abnormally but not due to a timeout, meaning
  # an error occurred during execution.
  if result.return_code and not result.timed_out:
    if result.return_code == constants.ExitCode.CORPUS_TOO_SMALL:
      logs.log_warn(
          'ML RNN generation for fuzzer %s aborted due to small corpus.' %
          fuzzer_name)
    else:
      logs.log_error(
          'ML RNN generation for fuzzer %s failed with ExitCode = %d.' %
          (fuzzer_name, result.return_code),
          output=result.output)
    return

  # A timeout is not an error if new units were generated.
  if result.timed_out:
    logs.log_warn('ML RNN generation for fuzzer %s timed out.' % fuzzer_name)

  new_corpus_units = (
      shell.get_directory_file_count(output_directory) - old_corpus_units)
  new_corpus_bytes = (
      shell.get_directory_size(output_directory) - old_corpus_bytes)
  if new_corpus_units:
    logs.log('Added %d new inputs (%d bytes) using ML RNN generator for %s.' %
             (new_corpus_units, new_corpus_bytes, fuzzer_name))
  else:
    logs.log_error(
        'ML RNN generator did not produce any inputs for %s.' % fuzzer_name,
        output=result.output)
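# A hedged usage sketch showing how a launcher might invoke the generator
# above. The directories, fuzzer name, and timeout are hypothetical
# placeholders, not values from the original module:
#
#   execute(
#       input_directory='/corpus/example_fuzzer',
#       output_directory='/corpus/example_fuzzer_new',
#       fuzzer_name='example_fuzzer',
#       generation_timeout=40 * 60)  # 40 minutes.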
def do_corpus_pruning(context, last_execution_failed, revision):
  """Run corpus pruning."""
  # Set |FUZZ_TARGET| environment variable to help with unarchiving only fuzz
  # target and its related files.
  environment.set_value('FUZZ_TARGET', context.fuzz_target.binary)

  if environment.is_trusted_host():
    from bot.untrusted_runner import tasks_host
    return tasks_host.do_corpus_pruning(context, last_execution_failed,
                                        revision)

  if not build_manager.setup_build(revision=revision):
    raise CorpusPruningException('Failed to setup build.')

  build_directory = environment.get_value('BUILD_DIR')
  start_time = datetime.datetime.utcnow()
  runner = Runner(build_directory, context)
  pruner = CorpusPruner(runner)
  fuzzer_binary_name = os.path.basename(runner.target_path)

  # If our last execution failed, shrink to a randomized corpus of usable size
  # to prevent the corpus from growing unbounded and failures from recurring
  # when trying to minimize it.
  if last_execution_failed:
    for corpus_url in [
        context.corpus.get_gcs_url(),
        context.quarantine_corpus.get_gcs_url()
    ]:
      _limit_corpus_size(corpus_url)

  # Get initial corpus to process from GCS.
  context.sync_to_disk()
  initial_corpus_size = shell.get_directory_file_count(
      context.initial_corpus_path)

  # Restore a small batch of quarantined units back to corpus.
  context.restore_quarantined_units()

  # Shrink to a minimized corpus using corpus merge.
  pruner_stats = pruner.run(context.initial_corpus_path,
                            context.minimized_corpus_path,
                            context.bad_units_path)

  # Sync minimized corpus back to GCS.
  context.sync_to_gcs()

  # Create corpus backup.
  # Temporarily copy the past crash regressions folder into the minimized
  # corpus so that the corpus backup archive includes both.
  regressions_input_dir = os.path.join(context.initial_corpus_path,
                                       'regressions')
  regressions_output_dir = os.path.join(context.minimized_corpus_path,
                                        'regressions')
  if shell.get_directory_file_count(regressions_input_dir):
    shutil.copytree(regressions_input_dir, regressions_output_dir)
  backup_bucket = environment.get_value('BACKUP_BUCKET')
  corpus_backup_url = corpus_manager.backup_corpus(
      backup_bucket, context.corpus, context.minimized_corpus_path)
  shell.remove_directory(regressions_output_dir)

  minimized_corpus_size_units = shell.get_directory_file_count(
      context.minimized_corpus_path)
  minimized_corpus_size_bytes = shell.get_directory_size(
      context.minimized_corpus_path)

  logs.log('Corpus pruned from %d to %d units.' %
           (initial_corpus_size, minimized_corpus_size_units))

  # Process bad units found during merge.
  # Mapping of crash state -> CorpusCrash
  crashes = {}
  pruner.process_bad_units(context.bad_units_path,
                           context.quarantine_corpus_path, crashes)
  context.quarantine_corpus.rsync_from_disk(context.quarantine_corpus_path)

  # Store corpus stats into CoverageInformation entity.
  project_qualified_name = context.fuzz_target.project_qualified_name()
  today = datetime.datetime.utcnow().date()
  coverage_info = data_types.CoverageInformation(
      fuzzer=project_qualified_name, date=today)

  quarantine_corpus_size = shell.get_directory_file_count(
      context.quarantine_corpus_path)
  quarantine_corpus_dir_size = shell.get_directory_size(
      context.quarantine_corpus_path)

  # Save the minimized corpus size before cross-pollination to put in
  # BigQuery.
  pre_pollination_corpus_size = minimized_corpus_size_units

  # Populate coverage stats.
  coverage_info.corpus_size_units = minimized_corpus_size_units
  coverage_info.corpus_size_bytes = minimized_corpus_size_bytes
  coverage_info.quarantine_size_units = quarantine_corpus_size
  coverage_info.quarantine_size_bytes = quarantine_corpus_dir_size
  coverage_info.corpus_backup_location = corpus_backup_url
  coverage_info.corpus_location = context.corpus.get_gcs_url()
  coverage_info.quarantine_location = context.quarantine_corpus.get_gcs_url()

  # Calculate remaining time to use for shared corpus merging.
  time_remaining = _get_time_remaining(start_time)
  if time_remaining <= 0:
    logs.log_warn('Not enough time for shared corpus merging.')
    return None

  cross_pollinator = CrossPollinator(runner)
  pollinator_stats = cross_pollinator.run(time_remaining)

  context.sync_to_gcs()

  # Update corpus size stats.
  minimized_corpus_size_units = shell.get_directory_file_count(
      context.minimized_corpus_path)
  minimized_corpus_size_bytes = shell.get_directory_size(
      context.minimized_corpus_path)
  coverage_info.corpus_size_units = minimized_corpus_size_units
  coverage_info.corpus_size_bytes = minimized_corpus_size_bytes

  logs.log('Finished.')

  sources = ','.join([
      fuzzer.fuzz_target.project_qualified_name()
      for fuzzer in context.cross_pollinate_fuzzers
  ])

  cross_pollination_stats = None
  if pruner_stats and pollinator_stats:
    cross_pollination_stats = CrossPollinationStats(
        project_qualified_name, context.cross_pollination_method, sources,
        context.tag, initial_corpus_size, pre_pollination_corpus_size,
        pruner_stats['edge_coverage'], pollinator_stats['edge_coverage'],
        pruner_stats['feature_coverage'],
        pollinator_stats['feature_coverage'])

  return CorpusPruningResult(
      coverage_info=coverage_info,
      crashes=list(crashes.values()),
      fuzzer_binary_name=fuzzer_binary_name,
      revision=environment.get_value('APP_REVISION'),
      cross_pollination_stats=cross_pollination_stats)
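# A minimal sketch of the CrossPollinationStats container constructed above,
# assuming it is a plain record type with one field per positional argument.
# The field names here are inferred from the call site and may differ from
# the real definition:
import collections

CrossPollinationStats = collections.namedtuple('CrossPollinationStats', [
    'project_qualified_name', 'method', 'sources', 'tag',
    'initial_corpus_size', 'pre_pollination_corpus_size',
    'pruner_edge_coverage', 'pollinator_edge_coverage',
    'pruner_feature_coverage', 'pollinator_feature_coverage'
])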