def run_perf_tests(self):
  self.return_code |= run_performance_tests.main(self.args)
  self.interpret_run_benchmark_results(False)

  if len(self.result_recorder[False].failed_stories) > 0:
    # For failed stories we run_tests again to make sure it's not a false
    # positive.
    print('============ Re_run the failed tests ============')
    all_failed_stories = '(' + '|'.join(
        self.result_recorder[False].failed_stories) + ')'
    # TODO(crbug.com/1055893): Remove the extra chrome categories after
    # investigation of flakes in representative perf tests.
    self.re_run_args.extend([
        '--story-filter', all_failed_stories, '--pageset-repeat=3',
        '--extra-chrome-categories=blink,blink_gc,gpu,v8,viz'
    ])
    self.return_code |= run_performance_tests.main(self.re_run_args)
    self.interpret_run_benchmark_results(True)

    for story_name in self.result_recorder[False].failed_stories.copy():
      if story_name not in self.result_recorder[True].failed_stories:
        self.result_recorder[False].remove_failure(
            story_name, self.benchmark, self.is_control_story(story_name))

  if self.result_recorder[False].is_control_stories_noisy:
    # In this case all failures are reported as expected, and the number of
    # failed stories in output.json will be zero.
    self.result_recorder[False].invalidate_failures(self.benchmark)

  (finalOut, self.return_code) = self.result_recorder[False].get_output(
      self.return_code)

  with open(self.output_path[False], 'r+') as resultsFile:
    json.dump(finalOut, resultsFile, indent=4)

  with open(self.options.isolated_script_test_output, 'w') as outputFile:
    json.dump(finalOut, outputFile, indent=4)

  if self.result_recorder[False].is_control_stories_noisy:
    assert self.return_code == 0
    print('Control story has high noise. These runs are not reliable!')

  return self.return_code
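
# Illustration only (not part of the original module): a minimal sketch of how
# the re-run story filter above is assembled, assuming --story-filter accepts
# a regex of story names. The helper name and sample stories are made up.
def _example_build_story_filter(failed_stories):
  """Joins failed story names into a single alternation regex.

  E.g. {'story_a', 'story_b'} -> '(story_a|story_b)' (order may vary for sets).
  """
  return '(' + '|'.join(failed_stories) + ')'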
def main():
  overall_return_code = 0
  # Linux does not have its own specific representatives
  # and uses the representatives chosen for windows.
  if sys.platform == 'win32' or sys.platform.startswith('linux'):
    platform = 'win'
    story_tag = 'representative_win_desktop'
  elif sys.platform == 'darwin':
    platform = 'mac'
    story_tag = 'representative_mac_desktop'
  else:
    return 1

  options = parse_arguments()
  args = sys.argv
  re_run_args = sys.argv
  args.extend(['--story-tag-filter', story_tag])
  overall_return_code = run_performance_tests.main(args)

  # The values used as the upper limit are the 99th percentile of the
  # avg and ci_095 frame_times recorded by dashboard in the past 200 revisions.
  # If the value measured here is higher than this limit by at least
  # 2ms [AVG_ERROR_MARGIN], it is considered a failure. crbug.com/953895
  with open(
      os.path.join(
          os.path.dirname(__file__), 'representative_perf_test_data',
          'representatives_frame_times_upper_limit.json')) as bound_data:
    upper_limit_data = json.load(bound_data)

  out_dir_path = os.path.dirname(options.isolated_script_test_output)
  output_path = os.path.join(out_dir_path, BENCHMARK, 'test_results.json')

  result_recorder = interpret_run_benchmark_results(
      upper_limit_data[platform], options.isolated_script_test_output)

  with open(output_path, 'r+') as resultsFile:
    if len(result_recorder.failed_stories) > 0:
      # For failed stories we run the tests again to make sure it's not a
      # false positive.
      print('============ Re_run the failed tests ============')
      all_failed_stories = '(' + '|'.join(
          result_recorder.failed_stories) + ')'
      re_run_args.extend(
          ['--story-filter', all_failed_stories, '--pageset-repeat=3'])

      re_run_isolated_script_test_dir = os.path.join(out_dir_path,
                                                     're_run_failures')
      re_run_isolated_script_test_output = os.path.join(
          re_run_isolated_script_test_dir,
          os.path.basename(options.isolated_script_test_output))
      re_run_isolated_script_test_perf_output = os.path.join(
          re_run_isolated_script_test_dir,
          os.path.basename(options.isolated_script_test_perf_output))

      re_run_args = replace_arg_values(re_run_args, [
          ('--isolated-script-test-output',
           re_run_isolated_script_test_output),
          ('--isolated-script-test-perf-output',
           re_run_isolated_script_test_perf_output)
      ])

      overall_return_code |= run_performance_tests.main(re_run_args)
      re_run_result_recorder = interpret_run_benchmark_results(
          upper_limit_data[platform], re_run_isolated_script_test_output)

      for story_name in result_recorder.failed_stories.copy():
        if story_name not in re_run_result_recorder.failed_stories:
          result_recorder.remove_failure(story_name)

    (finalOut,
     overall_return_code) = result_recorder.get_output(overall_return_code)

    json.dump(finalOut, resultsFile, indent=4)

  with open(options.isolated_script_test_output, 'w') as outputFile:
    json.dump(finalOut, outputFile, indent=4)

  return overall_return_code
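
# Illustration only: replace_arg_values() used above is defined elsewhere in
# the script. A minimal sketch of the behavior the call above relies on might
# look like the following, assuming each flag is passed either as
# '--flag value' or '--flag=value'. The helper name here is hypothetical.
def _example_replace_arg_values(args, key_value_pairs):
  """Returns a copy of args with the value of each given flag replaced."""
  args = list(args)
  for flag, new_value in key_value_pairs:
    for index, arg in enumerate(args):
      if arg == flag and index + 1 < len(args):
        # '--flag value' form: replace the following argument.
        args[index + 1] = new_value
      elif arg.startswith(flag + '='):
        # '--flag=value' form: rewrite the argument in place.
        args[index] = flag + '=' + new_value
  return args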
def main():
  overall_return_code = 0
  # Linux does not have its own specific representatives
  # and uses the representatives chosen for windows.
  if sys.platform == 'win32':
    platform = 'win'
    story_tag = 'representative_win_desktop'
  elif sys.platform == 'darwin':
    platform = 'mac'
    story_tag = 'representative_mac_desktop'
  else:
    return 1

  options = parse_arguments()
  args = sys.argv
  args.extend(['--story-tag-filter', story_tag])
  overall_return_code = run_performance_tests.main(args)
  result_recorder = ResultRecorder()

  # The values used as the upper limit are the 99th percentile of the
  # avg and ci_095 frame_times recorded by dashboard in the past 200 revisions.
  # If the value measured here is higher than this limit by at least
  # 2ms [AVG_ERROR_MARGIN], it is considered a failure. crbug.com/953895
  with open(
      os.path.join(
          os.path.dirname(__file__), 'representative_perf_test_data',
          'representatives_frame_times_upper_limit.json')) as bound_data:
    upper_limit_data = json.load(bound_data)

  out_dir_path = os.path.dirname(options.isolated_script_test_output)
  test_count = len(upper_limit_data[platform])
  output_path = os.path.join(out_dir_path, BENCHMARK, 'test_results.json')

  with open(output_path, 'r+') as resultsFile:
    initialOut = json.load(resultsFile)
    result_recorder.setTests(initialOut, test_count)

    results_path = os.path.join(out_dir_path, BENCHMARK, 'perf_results.csv')
    marked_stories = set()
    with open(results_path) as csv_file:
      reader = csv.DictReader(csv_file)
      for row in reader:
        # For now only frame_times is used for testing representatives'
        # performance.
        if row['name'] != 'frame_times':
          continue
        story_name = row['stories']
        if (story_name in marked_stories or
            story_name not in upper_limit_data[platform]):
          continue
        marked_stories.add(story_name)

        if row['avg'] == '' or row['count'] == 0:
          print("No values for " + story_name)
          result_recorder.addFailure(story_name)
        elif (float(row['ci_095']) >
              upper_limit_data[platform][story_name]['ci_095'] *
              CI_ERROR_MARGIN):
          print("Noisy data on frame_times for " + story_name + ".\n")
          result_recorder.addFailure(story_name)
        elif (float(row['avg']) >
              upper_limit_data[platform][story_name]['avg'] +
              AVG_ERROR_MARGIN):
          print(story_name + ": average frame_times is higher than 99th " +
                "percentile of the past 200 recorded frame_times(" +
                row['avg'] + ")" + ".\n")
          result_recorder.addFailure(story_name)

    (finalOut,
     overall_return_code) = result_recorder.getOutput(overall_return_code)

    # Clear the result of run_benchmark and write the gated perf results.
    resultsFile.seek(0)
    resultsFile.truncate(0)
    json.dump(finalOut, resultsFile, indent=4)

  with open(options.isolated_script_test_output, 'w') as outputFile:
    json.dump(finalOut, outputFile, indent=4)

  return overall_return_code
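
# Illustration only: the comparisons above assume that
# representatives_frame_times_upper_limit.json maps platform -> story ->
# per-metric upper limits. The story name and numbers below are made up to
# show the assumed shape, not real thresholds.
_EXAMPLE_UPPER_LIMIT_DATA = {
    'win': {
        'example_representative_story': {
            'avg': 16.6,    # upper limit on average frame_times (ms)
            'ci_095': 4.0,  # upper limit on the 95% confidence interval
        },
    },
}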
def main():
  overall_return_code = 0
  options = parse_arguments()
  print(options)

  if options.benchmarks == 'rendering.desktop':
    # Linux does not have its own specific representatives
    # and uses the representatives chosen for windows.
    if sys.platform == 'win32' or sys.platform.startswith('linux'):
      platform = 'win'
      story_tag = 'representative_win_desktop'
    elif sys.platform == 'darwin':
      platform = 'mac'
      story_tag = 'representative_mac_desktop'
    else:
      return 1
  elif options.benchmarks == 'rendering.mobile':
    platform = 'android'
    story_tag = 'representative_mobile'
  else:
    return 1

  benchmark = options.benchmarks
  args = sys.argv
  re_run_args = sys.argv
  args.extend(['--story-tag-filter', story_tag])
  overall_return_code = run_performance_tests.main(args)

  # The values used as the upper limit are the 99th percentile of the
  # avg and ci_095 frame_times recorded by dashboard in the past 200 revisions.
  # If the value measured here is higher than this limit by at least
  # 10 percent of the upper limit [AVG_ERROR_MARGIN], it is considered a
  # failure. crbug.com/953895
  with open(
      os.path.join(
          os.path.dirname(__file__), 'representative_perf_test_data',
          'representatives_frame_times_upper_limit.json')) as bound_data:
    upper_limit_data = json.load(bound_data)

  out_dir_path = os.path.dirname(options.isolated_script_test_output)
  output_path = os.path.join(out_dir_path, benchmark, 'test_results.json')

  result_recorder = interpret_run_benchmark_results(
      upper_limit_data[platform], options.isolated_script_test_output,
      benchmark)

  with open(output_path, 'r+') as resultsFile:
    if len(result_recorder.failed_stories) > 0:
      # For failed stories we run the tests again to make sure it's not a
      # false positive.
      print('============ Re_run the failed tests ============')
      all_failed_stories = '(' + '|'.join(
          result_recorder.failed_stories) + ')'
      re_run_args.extend(
          ['--story-filter', all_failed_stories, '--pageset-repeat=3'])

      re_run_isolated_script_test_dir = os.path.join(out_dir_path,
                                                     're_run_failures')
      re_run_isolated_script_test_output = os.path.join(
          re_run_isolated_script_test_dir,
          os.path.basename(options.isolated_script_test_output))
      re_run_isolated_script_test_perf_output = os.path.join(
          re_run_isolated_script_test_dir,
          os.path.basename(options.isolated_script_test_perf_output))

      re_run_args = replace_arg_values(re_run_args, [
          ('--isolated-script-test-output',
           re_run_isolated_script_test_output),
          ('--isolated-script-test-perf-output',
           re_run_isolated_script_test_perf_output)
      ])

      overall_return_code |= run_performance_tests.main(re_run_args)
      re_run_result_recorder = interpret_run_benchmark_results(
          upper_limit_data[platform], re_run_isolated_script_test_output,
          benchmark)

      for story_name in result_recorder.failed_stories.copy():
        if story_name not in re_run_result_recorder.failed_stories:
          result_recorder.remove_failure(
              story_name, benchmark,
              is_control_story(upper_limit_data[platform][story_name]))

    if result_recorder.is_control_stories_noisy:
      # In this case all failures are reported as expected, and the number of
      # failed stories in output.json will be zero.
      result_recorder.invalidate_failures(benchmark)

    (finalOut,
     overall_return_code) = result_recorder.get_output(overall_return_code)

    json.dump(finalOut, resultsFile, indent=4)

  with open(options.isolated_script_test_output, 'w') as outputFile:
    json.dump(finalOut, outputFile, indent=4)

  if result_recorder.is_control_stories_noisy:
    assert overall_return_code == 0
    print('Control story has high noise. These runs are not reliable!')

  return overall_return_code
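
# Illustration only: is_control_story() above is defined elsewhere in the
# script. Based on how it is called with a story's entry from the upper-limit
# data, a plausible sketch is a lookup of a per-story flag. The 'control' key
# name is an assumption made for this example and is not confirmed by this
# excerpt.
def _example_is_control_story(story_data):
  """Returns True if the upper-limit entry marks the story as a control."""
  return story_data.get('control', False)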