def main(argv):
    settings = mlperf_loadgen.TestSettings()
    settings.scenario = mlperf_loadgen.TestScenario.SingleStream
    settings.mode = mlperf_loadgen.TestMode.PerformanceOnly

    sut = mlperf_loadgen.ConstructSUT(
        issue_query, flush_queries, process_latencies)
    qsl = mlperf_loadgen.ConstructQSL(
        1024 * 1024, 1024, load_samples_to_ram, unload_samples_from_ram)
    mlperf_loadgen.StartTest(sut, qsl, settings)
    mlperf_loadgen.DestroyQSL(qsl)
    mlperf_loadgen.DestroySUT(sut)
def main(argv):
    settings = mlperf_loadgen.TestSettings()
    settings.scenario = mlperf_loadgen.TestScenario.Offline
    settings.mode = mlperf_loadgen.TestMode.PerformanceOnly
    settings.offline_expected_qps = 1000

    sut = mlperf_loadgen.ConstructSUT(
        issue_query, flush_queries, process_latencies)
    qsl = mlperf_loadgen.ConstructQSL(
        1024, 128, load_samples_to_ram, unload_samples_from_ram)
    mlperf_loadgen.StartTest(sut, qsl, settings)
    mlperf_loadgen.DestroyQSL(qsl)
    mlperf_loadgen.DestroySUT(sut)
def main(argv):
    settings = mlperf_loadgen.TestSettings()
    settings.scenario = mlperf_loadgen.TestScenario.MultiStream
    settings.mode = mlperf_loadgen.TestMode.SubmissionRun
    settings.samples_per_query = 4
    settings.target_qps = 1000
    settings.target_latency_ns = 1000000000

    sut = mlperf_loadgen.ConstructSUT(issue_query)
    qsl = mlperf_loadgen.ConstructQSL(
        1024, 128, load_samples_to_ram, unload_samples_from_ram)
    mlperf_loadgen.StartTest(sut, qsl, settings)
    mlperf_loadgen.DestroyQSL(qsl)
    mlperf_loadgen.DestroySUT(sut)
def main(argv):
    settings = mlperf_loadgen.TestSettings()
    settings.scenario = mlperf_loadgen.TestScenario.SingleStream
    settings.mode = mlperf_loadgen.TestMode.AccuracyOnly
    settings.single_stream_expected_latency_ns = 1000000
    settings.min_query_count = 100
    settings.min_duration_ms = 10000

    sut = mlperf_loadgen.ConstructSUT(
        issue_query, flush_queries, process_latencies)
    qsl = mlperf_loadgen.ConstructQSL(
        1024, 128, load_samples_to_ram, unload_samples_from_ram)
    mlperf_loadgen.StartTest(sut, qsl, settings)
    mlperf_loadgen.DestroyQSL(qsl)
    mlperf_loadgen.DestroySUT(sut)
def main(argv):
    settings = mlperf_loadgen.TestSettings()
    settings.scenario = mlperf_loadgen.TestScenario.Server
    settings.mode = mlperf_loadgen.TestMode.PerformanceOnly
    settings.server_target_qps = 100
    settings.server_target_latency_ns = 100000000
    settings.min_query_count = 100
    settings.min_duration_ms = 10000

    sut = mlperf_loadgen.ConstructSUT(
        issue_query, flush_queries, process_latencies)
    qsl = mlperf_loadgen.ConstructQSL(
        1024, 128, load_samples_to_ram, unload_samples_from_ram)
    mlperf_loadgen.StartTest(sut, qsl, settings)
    mlperf_loadgen.DestroyQSL(qsl)
    mlperf_loadgen.DestroySUT(sut)
def main(argv):
    settings = mlperf_loadgen.TestSettings()
    settings.scenario = mlperf_loadgen.TestScenario.SingleStream
    settings.mode = mlperf_loadgen.TestMode.PerformanceOnly
    settings.single_stream_expected_latency_ns = 1000000
    settings.enable_spec_overrides = True
    settings.override_target_latency_ns = 100000000
    settings.override_min_query_count = 100
    settings.override_min_duration_ms = 10000

    sut = mlperf_loadgen.ConstructSUT(issue_query, process_latencies)
    qsl = mlperf_loadgen.ConstructQSL(
        1024, 128, load_samples_to_ram, unload_samples_from_ram)
    mlperf_loadgen.StartTest(sut, qsl, settings)
    mlperf_loadgen.DestroyQSL(qsl)
    mlperf_loadgen.DestroySUT(sut)
def main(argv):
    del argv
    settings = mlperf_loadgen.TestSettings()
    settings.scenario = mlperf_loadgen.TestScenario.MultiStreamFree
    settings.mode = mlperf_loadgen.TestMode.PerformanceOnly
    settings.multi_stream_target_latency_ns = 100000000
    settings.multi_stream_samples_per_query = 4
    settings.multi_stream_max_async_queries = 2
    settings.min_query_count = 100
    settings.min_duration_ms = 10000

    sut = mlperf_loadgen.ConstructSUT(
        issue_query, flush_queries, process_latencies)
    qsl = mlperf_loadgen.ConstructQSL(
        1024, 128, load_samples_to_ram, unload_samples_from_ram)
    mlperf_loadgen.StartTest(sut, qsl, settings)
    mlperf_loadgen.DestroyQSL(qsl)
    mlperf_loadgen.DestroySUT(sut)
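# The minimal examples above construct the SUT and QSL from callbacks that are
# not shown. Below is a sketch of what those callbacks could look like,
# assuming a no-op system under test; the names match the calls above, and the
# QuerySampleResponse/QuerySamplesComplete completion path is the standard
# loadgen API. The instant-completion "inference" is purely illustrative.
import mlperf_loadgen


def issue_query(query_samples):
    # Complete every sample immediately with an empty (0-byte) response.
    responses = [
        mlperf_loadgen.QuerySampleResponse(qs.id, 0, 0)
        for qs in query_samples
    ]
    mlperf_loadgen.QuerySamplesComplete(responses)


def flush_queries():
    # Nothing is buffered in this sketch.
    pass


def process_latencies(latencies_ns):
    # loadgen reports per-query latencies in nanoseconds.
    print("mean latency (ns):", sum(latencies_ns) / len(latencies_ns))


def load_samples_to_ram(sample_indices):
    pass  # a real QSL would load/preprocess these samples here


def unload_samples_from_ram(sample_indices):
    pass  # ... and free them here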
def main():
    global last_timeing
    args = get_args()
    log.info(args)

    # find backend
    backend = get_backend(args.backend)

    # override image format if given
    image_format = args.data_format if args.data_format else backend.image_format()

    # --count can be used to limit the number of images used for testing
    count_override = False
    count = args.count
    if count:
        count_override = True

    # dataset to use
    wanted_dataset, pre_proc, post_proc, kwargs = SUPPORTED_DATASETS[args.dataset]
    ds = wanted_dataset(data_path=args.dataset_path,
                        image_list=args.dataset_list,
                        name=args.dataset,
                        image_format=image_format,
                        pre_process=pre_proc,
                        use_cache=args.cache,
                        count=count, **kwargs)

    # load model to backend
    model = backend.load(args.model, inputs=args.inputs, outputs=args.outputs)
    final_results = {
        "runtime": model.name(),
        "version": model.version(),
        "time": int(time.time()),
        "cmdline": str(args),
    }

    config = os.path.abspath(args.config)
    if not os.path.exists(config):
        log.error("{} not found".format(config))
        sys.exit(1)

    if args.output:
        output_dir = os.path.abspath(args.output)
        os.makedirs(output_dir, exist_ok=True)
        os.chdir(output_dir)

    #
    # make one pass over the dataset to validate accuracy
    #
    count = ds.get_item_count()

    # warmup
    warmup_queries = range(args.max_batchsize)
    ds.load_query_samples(warmup_queries)
    for _ in range(2):
        img, _ = ds.get_samples(warmup_queries)
        _ = backend.predict({backend.inputs[0]: img})
    ds.unload_query_samples(None)

    scenario = SCENARIO_MAP[args.scenario]
    runner_map = {
        lg.TestScenario.SingleStream: RunnerBase,
        lg.TestScenario.MultiStream: QueueRunner,
        lg.TestScenario.Server: QueueRunner,
        lg.TestScenario.Offline: QueueRunner
    }
    runner = runner_map[scenario](model, ds, args.threads,
                                  post_proc=post_proc,
                                  max_batchsize=args.max_batchsize)

    def issue_queries(query_samples):
        runner.enqueue(query_samples)

    def flush_queries():
        pass

    def process_latencies(latencies_ns):
        # called by loadgen to show us the recorded latencies
        global last_timeing
        last_timeing = [t / NANO_SEC for t in latencies_ns]

    settings = lg.TestSettings()
    settings.FromConfig(config, args.model_name, args.scenario)
    settings.scenario = scenario
    settings.mode = lg.TestMode.PerformanceOnly
    if args.accuracy:
        settings.mode = lg.TestMode.AccuracyOnly
    if args.find_peak_performance:
        settings.mode = lg.TestMode.FindPeakPerformance

    if args.time:
        # override the time we want to run
        settings.min_duration_ms = args.time * MILLI_SEC
        settings.max_duration_ms = args.time * MILLI_SEC

    if args.qps:
        qps = float(args.qps)
        settings.server_target_qps = qps
        settings.offline_expected_qps = qps

    if count_override:
        settings.min_query_count = count
        settings.max_query_count = count

    if args.samples_per_query:
        settings.multi_stream_samples_per_query = args.samples_per_query
    if args.max_latency:
        settings.server_target_latency_ns = int(args.max_latency * NANO_SEC)
        settings.multi_stream_target_latency_ns = int(args.max_latency * NANO_SEC)

    # override target latency when it needs to be less than 1ms
    if args.model_name == "mobilenet":
        settings.single_stream_expected_latency_ns = 200000
    elif args.model_name == "resnet50":
        settings.single_stream_expected_latency_ns = 900000
    elif args.model_name == "ssd-mobilenet":
        settings.single_stream_expected_latency_ns = 1000000

    sut = lg.ConstructSUT(issue_queries, flush_queries, process_latencies)
    # qsl = lg.ConstructQSL(count, min(count, 500), ds.load_query_samples, ds.unload_query_samples)
    qsl = lg.ConstructQSL(count, min(count, 1024),
                          ds.load_query_samples, ds.unload_query_samples)

    log.info("starting {}".format(scenario))
    result_dict = {"good": 0, "total": 0, "scenario": str(scenario)}
    runner.start_run(result_dict, args.accuracy)

    if args.enable_trace:
        lg.StartTest(sut, qsl, settings)
    else:
        logsettings = lg.LogSettings()
        logsettings.enable_trace = False
        lg.StartTestWithLogSettings(sut, qsl, settings, logsettings)

    if not last_timeing:
        last_timeing = runner.result_timing
    if args.accuracy:
        post_proc.finalize(result_dict, ds, output_dir=args.output)
    add_results(final_results, "{}".format(scenario),
                result_dict, last_timeing, time.time() - ds.last_loaded,
                args.accuracy)

    runner.finish()
    lg.DestroyQSL(qsl)
    lg.DestroySUT(sut)

    #
    # write final results
    #
    if args.output:
        with open("results.json", "w") as f:
            json.dump(final_results, f, sort_keys=True, indent=4)
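# The harness above pulls its TestSettings from a file via
# settings.FromConfig(config, model_name, scenario). A minimal illustration of
# that flow, assuming loadgen's "model.Scenario.key = value" line format with
# "*" as a wildcard (the format used by mlperf.conf); the specific keys and
# values below are illustrative only.
import mlperf_loadgen as lg

SAMPLE_CONF = """\
*.SingleStream.target_latency = 10
resnet50.Server.target_qps = 100
resnet50.Offline.min_query_count = 24576
"""

with open("user.conf", "w") as f:
    f.write(SAMPLE_CONF)

settings = lg.TestSettings()
settings.FromConfig("user.conf", "resnet50", "Offline")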
def main():
    global last_timeing
    args = get_args()
    log.info(args)

    # find backend
    backend = get_backend(args.backend)

    # override image format if given
    image_format = args.data_format if args.data_format else backend.image_format()

    # --count applies to accuracy mode only and can be used to limit the number
    # of images for testing; in performance mode we always limit count to 200.
    count = args.count
    if not count:
        if not args.accuracy:
            count = 200

    # dataset to use
    wanted_dataset, pre_proc, post_proc, kwargs = SUPPORTED_DATASETS[args.dataset]
    ds = wanted_dataset(data_path=args.dataset_path,
                        image_list=args.dataset_list,
                        name=args.dataset,
                        image_format=image_format,
                        pre_process=pre_proc,
                        use_cache=args.cache,
                        count=count, **kwargs)

    # load model to backend
    model = backend.load(args.model, inputs=args.inputs, outputs=args.outputs)
    final_results = {
        "runtime": model.name(),
        "version": model.version(),
        "time": int(time.time()),
        "cmdline": str(args),
    }

    #
    # make one pass over the dataset to validate accuracy
    #
    count = ds.get_item_count()

    # warmup
    ds.load_query_samples([0])
    for _ in range(5):
        img, _ = ds.get_samples([0])
        _ = backend.predict({backend.inputs[0]: img})
    ds.unload_query_samples(None)

    for scenario in args.scenario:
        runner_map = {
            lg.TestScenario.SingleStream: RunnerBase,
            lg.TestScenario.MultiStream: QueueRunner,
            lg.TestScenario.Server: QueueRunner,
            lg.TestScenario.Offline: QueueRunner
        }
        runner = runner_map[scenario](model, ds, args.threads,
                                      post_proc=post_proc,
                                      max_batchsize=args.max_batchsize)

        def issue_queries(query_samples):
            runner.enqueue(query_samples)

        def flush_queries():
            pass

        def process_latencies(latencies_ns):
            # called by loadgen to show us the recorded latencies
            global last_timeing
            last_timeing = [t / NANO_SEC for t in latencies_ns]

        settings = lg.TestSettings()
        settings.scenario = scenario
        settings.mode = lg.TestMode.PerformanceOnly
        if args.accuracy:
            settings.mode = lg.TestMode.AccuracyOnly

        if args.time:
            # override the time we want to run
            settings.min_duration_ms = args.time * MILLI_SEC
            settings.max_duration_ms = args.time * MILLI_SEC

        if args.qps:
            qps = float(args.qps)
            settings.server_target_qps = qps
            settings.offline_expected_qps = qps

        if scenario == lg.TestScenario.SingleStream:
            settings.min_query_count = args.queries_single
            settings.max_query_count = args.queries_single
        elif scenario == lg.TestScenario.MultiStream:
            settings.min_query_count = args.queries_multi
            settings.max_query_count = args.queries_multi
            settings.multi_stream_samples_per_query = 4
        elif scenario == lg.TestScenario.Server:
            max_latency = args.max_latency
        elif scenario == lg.TestScenario.Offline:
            settings.min_query_count = args.queries_offline
            settings.max_query_count = args.queries_offline

        sut = lg.ConstructSUT(issue_queries, flush_queries, process_latencies)
        qsl = lg.ConstructQSL(count, min(count, 1000),
                              ds.load_query_samples, ds.unload_query_samples)

        if scenario == lg.TestScenario.Server:
            for target_latency in max_latency:
                log.info("starting {}, latency={}".format(scenario, target_latency))
                settings.server_target_latency_ns = int(target_latency * NANO_SEC)

                result_dict = {"good": 0, "total": 0, "scenario": str(scenario)}
                runner.start_run(result_dict, args.accuracy)
                lg.StartTest(sut, qsl, settings)

                if not last_timeing:
                    last_timeing = runner.result_timing
                if args.accuracy:
                    post_proc.finalize(result_dict, ds,
                                       output_dir=os.path.dirname(args.output))
                add_results(final_results,
                            "{}-{}".format(scenario, target_latency),
                            result_dict, last_timeing,
                            time.time() - ds.last_loaded, args.accuracy)
        else:
            log.info("starting {}".format(scenario))
            result_dict = {"good": 0, "total": 0, "scenario": str(scenario)}
            runner.start_run(result_dict, args.accuracy)
            lg.StartTest(sut, qsl, settings)

            if not last_timeing:
                last_timeing = runner.result_timing
            if args.accuracy:
                post_proc.finalize(result_dict, ds,
                                   output_dir=os.path.dirname(args.output))
            add_results(final_results, "{}".format(scenario),
                        result_dict, last_timeing,
                        time.time() - ds.last_loaded, args.accuracy)

        runner.finish()
        lg.DestroyQSL(qsl)
        lg.DestroySUT(sut)

    #
    # write final results
    #
    if args.output:
        with open(args.output, "w") as f:
            json.dump(final_results, f, sort_keys=True, indent=4)
def main():
    global last_timeing
    args = get_args()
    log.info(args)

    # find backend
    backend = get_backend(args.backend)

    # override image format if given
    image_format = args.data_format if args.data_format else backend.image_format()

    # --count can be used to limit the number of images used for testing
    count_override = False
    count = args.count
    if count:
        count_override = True

    # dataset to use
    wanted_dataset, pre_proc, post_proc, kwargs = SUPPORTED_DATASETS[args.dataset]
    ds = wanted_dataset(data_path=args.dataset_path,
                        image_list=args.dataset_list,
                        name=args.dataset,
                        image_format=image_format,
                        pre_process=pre_proc,
                        use_cache=args.cache,
                        count=count, **kwargs)

    # load model to backend
    model = backend.load(args.model, inputs=args.inputs, outputs=args.outputs)
    final_results = {
        "runtime": model.name(),
        "version": model.version(),
        "time": int(time.time()),
        "cmdline": str(args),
    }

    config = os.path.abspath(args.config)
    if not os.path.exists(config):
        log.error("{} not found".format(config))
        sys.exit(1)

    if args.output:
        output_dir = os.path.abspath(args.output)
        os.makedirs(output_dir, exist_ok=True)
        os.chdir(output_dir)

    #
    # make one pass over the dataset to validate accuracy
    #
    count = ds.get_item_count()

    # warmup
    ds.load_query_samples([0])
    for _ in range(5):
        img, _ = ds.get_samples([0])
        _ = backend.predict({backend.inputs[0]: img})
    ds.unload_query_samples(None)

    scenario = SCENARIO_MAP[args.scenario]
    runner_map = {
        lg.TestScenario.SingleStream: RunnerBase,
        lg.TestScenario.MultiStream: QueueRunner,
        lg.TestScenario.Server: QueueRunner,
        lg.TestScenario.Offline: QueueRunner
    }
    runner = runner_map[scenario](model, ds, args.threads,
                                  post_proc=post_proc,
                                  max_batchsize=args.max_batchsize)

    def issue_queries(query_samples):
        runner.enqueue(query_samples)

    def flush_queries():
        pass

    def process_latencies(latencies_ns):
        # called by loadgen to show us the recorded latencies
        global last_timeing
        last_timeing = [t / NANO_SEC for t in latencies_ns]

    settings = lg.TestSettings()
    settings.FromConfig(config, args.model_name, args.scenario)
    settings.scenario = scenario
    settings.mode = lg.TestMode.PerformanceOnly
    if args.accuracy:
        settings.mode = lg.TestMode.AccuracyOnly
    if args.find_peak_performance:
        settings.mode = lg.TestMode.FindPeakPerformance

    if args.time:
        # override the time we want to run
        settings.min_duration_ms = args.time * MILLI_SEC
        # settings.max_duration_ms = args.time * MILLI_SEC

    if count_override:
        settings.min_query_count = count
        # settings.max_query_count = count
    if args.min_query_count:
        settings.min_query_count = args.min_query_count

    if args.samples_per_query:
        settings.multi_stream_samples_per_query = args.samples_per_query
    if args.max_latency:
        settings.single_stream_expected_latency_ns = int(args.max_latency * NANO_SEC)
        settings.server_target_latency_ns = int(args.max_latency * NANO_SEC)
        settings.multi_stream_target_latency_ns = int(args.max_latency * NANO_SEC)

    def set_qps(current_qps):
        settings.server_target_qps = current_qps
        settings.offline_expected_qps = current_qps
        settings.multi_stream_target_qps = current_qps
        return current_qps

    if args.qps:
        qps = set_qps(args.qps)

    lower_qps = -1
    upper_qps = -1
    qps_passed = {}

    while True:
        print("scheduled qps:", qps)
        sut = lg.ConstructSUT(issue_queries, flush_queries, process_latencies)
        qsl = lg.ConstructQSL(count, min(count, 500),
                              ds.load_query_samples, ds.unload_query_samples)

        log.info("starting {}".format(scenario))
        result_dict = {"good": 0, "total": 0, "scenario": str(scenario)}
        runner.start_run(result_dict, args.accuracy)
        print("max query count:", settings.max_query_count)
        lg.StartTest(sut, qsl, settings)

        if not last_timeing:
            last_timeing = runner.result_timing
        if args.accuracy:
            post_proc.finalize(result_dict, ds, output_dir=args.output)
        took = time.time() - ds.last_loaded
        add_results(final_results, "{}".format(scenario),
                    result_dict, last_timeing, took, args.accuracy)

        lg.DestroyQSL(qsl)
        lg.DestroySUT(sut)

        #
        # write final results
        #
        if args.output:
            with open("results.json", "w") as f:
                json.dump(final_results, f, sort_keys=True, indent=4)

        if args.scenario != 'Server':
            break
        if args.auto_qps is False:
            break

        # first iteration: derive an initial QPS bracket from the measured rate
        if lower_qps == -1 or upper_qps == -1:
            base_qps = len(last_timeing) / took
            upper_qps = base_qps * 1.5
            lower_qps = base_qps * 0.5
            qps = set_qps(lower_qps)
            continue

        latency_percentile_ns = np.percentile(
            last_timeing,
            settings.server_target_latency_percentile * 100) * NANO_SEC

        if latency_percentile_ns < settings.server_target_latency_ns and qps > upper_qps * 0.98:
            print("target qps:", qps)
            break
        if upper_qps - lower_qps < 1 and lower_qps in qps_passed:
            print("target qps:", lower_qps)
            break

        if latency_percentile_ns > settings.server_target_latency_ns:
            # reduce qps
            print("reduce qps, bound:[%d, %d]" % (lower_qps, upper_qps))
            upper_qps = qps
            if qps == lower_qps:
                lower_qps = lower_qps * 0.5
                qps = set_qps(lower_qps)
                continue
            if qps > lower_qps:
                qps = set_qps((lower_qps + upper_qps) / 2)
                continue
        if latency_percentile_ns < settings.server_target_latency_ns:
            # increase qps
            qps_passed[qps] = None
            print("increase qps, bound:[%d, %d]" % (lower_qps, upper_qps))
            lower_qps = qps
            if qps == upper_qps:
                upper_qps = upper_qps * 1.5
                qps = set_qps(upper_qps)
                continue
            if qps < upper_qps:
                qps = set_qps((lower_qps + upper_qps) / 2)
                continue

    runner.finish()
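# The auto-QPS loop above interleaves the search with harness bookkeeping.
# Distilled, it is a bracketing search for the highest Server-scenario QPS
# whose measured tail latency stays under the target. `run_at` (a callable
# returning the measured percentile latency in ns for a given QPS) and the
# helper name itself are hypothetical stand-ins for this sketch.
def find_max_qps(run_at, target_latency_ns, start_qps, tol=1.0):
    lower, upper = None, None
    qps = start_qps
    while True:
        latency_ns = run_at(qps)
        if latency_ns > target_latency_ns:
            upper = qps                      # too aggressive: tighten from above
            qps = qps * 0.5 if lower is None else (lower + upper) / 2
        else:
            lower = qps                      # passed: push from below
            qps = qps * 1.5 if upper is None else (lower + upper) / 2
        if lower is not None and upper is not None and upper - lower < tol:
            return lower                     # highest known-good QPS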
def main():
    global last_timeing
    args = get_args()
    log.info(args)

    # find backend
    backend = get_backend(args.backend)
    if getattr(backend, "max_batchsize", -1) != -1:
        backend.max_batchsize = args.max_batchsize

    # override image format if given
    image_format = args.data_format if args.data_format else backend.image_format()

    # --count can be used to limit the number of images used for testing
    count_override = False
    count = args.count
    if count:
        count_override = True

    # dataset to use
    wanted_dataset, pre_proc, post_proc, kwargs = SUPPORTED_DATASETS[args.dataset]
    ds = wanted_dataset(data_path=args.dataset_path,
                        image_list=args.dataset_list,
                        name=args.dataset,
                        image_format=image_format,
                        pre_process=pre_proc,
                        use_cache=args.cache,
                        count=count, **kwargs)

    # load model to backend
    model = backend.load(args.model, inputs=args.inputs, outputs=args.outputs)
    final_results = {
        "runtime": model.name(),
        "version": model.version(),
        "time": int(time.time()),
        "cmdline": str(args),
    }

    mlperf_conf = os.path.abspath(args.mlperf_conf)
    if not os.path.exists(mlperf_conf):
        log.error("{} not found".format(mlperf_conf))
        sys.exit(1)

    user_conf = os.path.abspath(args.user_conf)
    if not os.path.exists(user_conf):
        log.error("{} not found".format(user_conf))
        sys.exit(1)

    audit_config_cp_loc = None
    if args.output:
        output_dir = os.path.abspath(args.output)
        os.makedirs(output_dir, exist_ok=True)

        # Check if an audit.config file is used; copy it to the output
        # directory before we chdir to that location so loadgen can find it.
        audit_files = glob.glob(
            "ncoresw/mlperf/vision/classification_and_detection/*audit.config")
        if len(audit_files):
            log.info("Found audit.config (" + audit_files[0] + ")")
            audit_config_cp_loc = os.path.join(output_dir, "audit.config")
            # If the user already put audit.config in the `output` directory,
            # use that one. Otherwise, copy the one we found in the current
            # directory (before chdir to the new output directory).
            if os.path.exists(audit_config_cp_loc):
                log.info("WARNING: audit.config already exists, "
                         "so cannot copy over new audit file!")
                log.info(audit_config_cp_loc)
                audit_config_cp_loc = None
            else:
                shutil.copy(audit_files[0], audit_config_cp_loc)

        os.chdir(output_dir)

    #
    # make one pass over the dataset to validate accuracy
    #
    count = ds.get_item_count()

    # warmup
    warmup_queries = range(args.max_batchsize)
    ds.load_query_samples(warmup_queries)
    for _ in range(2):
        img, _ = ds.get_samples(warmup_queries)
        _ = backend.predict({backend.inputs[0]: img})
    ds.unload_query_samples(None)

    scenario = SCENARIO_MAP[args.scenario]
    runner_map = {
        lg.TestScenario.SingleStream: RunnerBase,
        lg.TestScenario.MultiStream: QueueRunner,
        lg.TestScenario.Server: QueueRunner,
        lg.TestScenario.Offline: QueueRunner
    }
    runner = runner_map[scenario](model, ds, args.threads,
                                  post_proc=post_proc,
                                  max_batchsize=args.max_batchsize)

    def issue_queries(query_samples):
        runner.enqueue(query_samples)

    def flush_queries():
        pass

    def process_latencies(latencies_ns):
        # called by loadgen to show us the recorded latencies
        global last_timeing
        last_timeing = [t / NANO_SEC for t in latencies_ns]

    settings = lg.TestSettings()
    settings.FromConfig(mlperf_conf, args.model_name, args.scenario)
    settings.FromConfig(user_conf, args.model_name, args.scenario)
    settings.scenario = scenario
    settings.mode = lg.TestMode.PerformanceOnly
    if args.accuracy:
        settings.mode = lg.TestMode.AccuracyOnly
    if args.find_peak_performance:
        settings.mode = lg.TestMode.FindPeakPerformance

    if args.time:
        # override the time we want to run
        settings.min_duration_ms = args.time * MILLI_SEC
        settings.max_duration_ms = args.time * MILLI_SEC

    if args.qps:
        qps = float(args.qps)
        settings.server_target_qps = qps
        settings.offline_expected_qps = qps

    if count_override:
        settings.min_query_count = count
        settings.max_query_count = count

    if args.samples_per_query:
        settings.multi_stream_samples_per_query = args.samples_per_query
    if args.max_latency:
        settings.server_target_latency_ns = int(args.max_latency * NANO_SEC)
        settings.multi_stream_target_latency_ns = int(args.max_latency * NANO_SEC)

    # override target latency when it needs to be less than 1ms
    if args.model_name == "mobilenet":
        settings.single_stream_expected_latency_ns = 200000
    elif args.model_name == "resnet50":
        settings.single_stream_expected_latency_ns = 900000
    elif args.model_name == "ssd-mobilenet":
        settings.single_stream_expected_latency_ns = 900000

    sut = lg.ConstructSUT(issue_queries, flush_queries, process_latencies)
    qsl = lg.ConstructQSL(count, min(count, 1024),
                          ds.load_query_samples, ds.unload_query_samples)

    log.info("starting {}".format(scenario))
    result_dict = {"good": 0, "total": 0, "scenario": str(scenario)}
    runner.start_run(result_dict, args.accuracy)
    lg.StartTest(sut, qsl, settings)

    if not last_timeing:
        last_timeing = runner.result_timing
    if args.accuracy:
        post_proc.finalize(result_dict, ds, output_dir=args.output)
    add_results(final_results, "{}".format(scenario),
                result_dict, last_timeing, time.time() - ds.last_loaded,
                args.accuracy)

    runner.finish()
    lg.DestroyQSL(qsl)
    lg.DestroySUT(sut)

    # Dump the summary logs to stdout for convenience
    log.info("Output dir: " + os.path.abspath(output_dir))
    with open(os.path.join(output_dir, "mlperf_log_summary.txt"), 'r') as f:
        log.info(f.read())

    # Output accuracy txt file
    if args.accuracy:
        with open(os.path.join(output_dir, "accuracy.txt"), "w") as f_acc:
            # SSD accuracy calculation
            # ----------------------------------------
            # The mAP is already stored in result_dict["mAP"], but we call
            # `accuracy_coco()` just to keep the submission process consistent.
            # Note the elif: without it, ssd-mobilenet would also fall through
            # to the ImageNet branch below.
            if args.model_name == "ssd-mobilenet":
                accuracy_str = accuracy.CocoAcc(
                    mlperf_accuracy_file=os.path.join(
                        output_dir, "mlperf_log_accuracy.json"),
                    coco_dir=args.dataset_path).get_accuracy() + "\n"
                f_acc.write(accuracy_str)
                log.info(accuracy_str)
            elif args.model_name == "ssd-resnet34":
                accuracy_str = accuracy.CocoAcc(
                    mlperf_accuracy_file=os.path.join(
                        output_dir, "mlperf_log_accuracy.json"),
                    coco_dir=args.dataset_path,
                    use_inv_map=True,
                    remove_48_empty_images=False).get_accuracy() + "\n"
                f_acc.write(accuracy_str)
                log.info(accuracy_str)
            # ImageNet accuracy calculation
            # ----------------------------------------
            # The good / total values are already stored in result_dict["good"]
            # and result_dict["total"], but we call `accuracy_imagenet()`
            # just to keep the submission process consistent.
            else:
                accuracy_str = accuracy.ImagenetAcc(
                    mlperf_accuracy_file=os.path.join(
                        output_dir, "mlperf_log_accuracy.json"),
                    imagenet_val_file=os.path.join(
                        args.dataset_path, "val_map.txt")).get_accuracy() + "\n"
                f_acc.write(accuracy_str)
                log.info(accuracy_str)

    #
    # write final results
    #
    if args.output:
        with open("results.json", "w") as f:
            json.dump(final_results, f, sort_keys=True, indent=4)

    if audit_config_cp_loc is not None:
        os.remove(audit_config_cp_loc)

    backend_destroy = getattr(backend, "destroy", None)
    if callable(backend_destroy):
        backend.destroy()
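# Sketch of what a wrapper like accuracy.ImagenetAcc (used above) has to do:
# mlperf_log_accuracy.json is a list of {"qsl_idx": ..., "data": <hex>} records
# where "data" holds the raw response bytes logged by loadgen. The float32
# result layout and the val_map.txt line format are assumptions based on the
# reference imagenet harness; treat this as illustrative, not authoritative.
import json
import struct


def imagenet_accuracy(mlperf_accuracy_file, imagenet_val_file):
    # val_map.txt: "<image name> <label>" per line
    with open(imagenet_val_file) as f:
        labels = [int(line.split()[1]) for line in f]
    with open(mlperf_accuracy_file) as f:
        results = json.load(f)
    good = 0
    for entry in results:
        # assume each response is a single float32 holding the predicted class
        data = bytes.fromhex(entry["data"])
        predicted = int(struct.unpack("f", data[:4])[0])
        if predicted == labels[entry["qsl_idx"]]:
            good += 1
    return "accuracy={:.3f}%, good={}, total={}".format(
        100.0 * good / len(results), good, len(results))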
def main():
    global last_timeing
    args = get_args()
    log.info(args)

    backend = BackendTensorRT()
    ds = Imagenet(data_path=args.dataset_path,
                  use_cache=args.cache,
                  batch_size=args.batch_size,
                  image_size=args.image_size,
                  calib_file='cal_image_list_option_%d.txt' % args.calib_file)

    model = backend.load(args, ds=ds)
    final_results = {
        "runtime": model.name(),
        "version": model.version(),
        "time": int(time.time()),
        "cmdline": str(args),
    }

    config = os.path.abspath(args.config)
    assert os.path.exists(config), "%s does not exist!" % config
    user_config = os.path.abspath(args.user_config)
    assert os.path.exists(user_config), "%s does not exist!" % user_config

    base_path = os.path.dirname(os.path.realpath(__file__))
    if args.output:
        output_dir = os.path.abspath(args.output)
        os.makedirs(output_dir, exist_ok=True)
        os.chdir(output_dir)

    post_proc = PostProcessCommon(offset=0)
    runner = QueueRunner(model, ds, args.threads,
                         post_proc=post_proc, batch_size=args.batch_size)

    def issue_queries(ids, indices):
        runner.enqueue(ids, indices)

    def flush_queries():
        pass

    def process_latencies(latencies_ns):
        global last_timeing
        last_timeing = [t / NANO_SEC for t in latencies_ns]

    settings = lg.TestSettings()
    model_name = 'OFAnet-AutoSinian'
    settings.FromConfig(config, model_name, args.scenario)
    settings.FromConfig(user_config, model_name, args.scenario)
    if args.audit_test:
        audit_config_path = base_path + '/audit%s.config' % args.audit_test
        settings.FromConfig(audit_config_path, model_name, args.scenario)

    scenario = SCENARIO_MAP[args.scenario]
    settings.scenario = scenario
    settings.mode = lg.TestMode.PerformanceOnly
    if args.accuracy:
        settings.mode = lg.TestMode.AccuracyOnly

    sut = lg.ConstructFastSUT(issue_queries, flush_queries, process_latencies)
    qsl = lg.ConstructQSL(ds.get_item_count(), args.batch_size,
                          ds.load_query_samples, ds.unload_query_samples)

    log.info("starting {}".format(scenario))
    result_dict = {"good": 0, "total": 0, "scenario": str(scenario)}
    runner.start_run(result_dict, args.accuracy)

    start = time.time()
    lg.StartTest(sut, qsl, settings)

    post_proc.finalize(result_dict)
    add_results(final_results, "{}".format(scenario), result_dict,
                last_timeing, runner.finishTime - ds.last_loaded, args)

    runner.finish()
    lg.DestroyQSL(qsl)
    lg.DestroyFastSUT(sut)

    if args.output:
        with open("results.json", "w") as f:
            json.dump(final_results, f, sort_keys=True, indent=2)
                     # (continuation of a DummyRunner method whose opening is not shown)
                     .format(self.count, qitem.sample_id[0]))
        self.count += 1
        return self.count


if __name__ == "__main__":
    runner = DummyRunner()
    runner.start_worker()

    settings = mlperf_loadgen.TestSettings()
    settings.scenario = mlperf_loadgen.TestScenario.SingleStream
    settings.mode = mlperf_loadgen.TestMode.PerformanceOnly
    # Specify exactly how many queries need to be made
    settings.min_query_count = 3003
    settings.max_query_count = 3003

    total_queries = 256  # Maximum sample ID + 1
    perf_queries = 8     # TBD: Doesn't seem to have an effect

    sut = mlperf_loadgen.ConstructSUT(runner.enqueue, flush_queries,
                                      process_latencies)
    qsl = mlperf_loadgen.ConstructQSL(total_queries, perf_queries,
                                      runner.load_samples_to_ram,
                                      runner.unload_samples_from_ram)
    mlperf_loadgen.StartTest(sut, qsl, settings)
    mlperf_loadgen.DestroyQSL(qsl)
    mlperf_loadgen.DestroySUT(sut)
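# The DummyRunner above is only partially shown. A minimal sketch of such a
# runner, assuming a thread-plus-queue structure; the array/buffer_info
# pattern for handing response bytes to QuerySampleResponse follows the
# reference harnesses, while the class layout itself is a hypothetical
# reconstruction.
import array
import queue
import threading

import mlperf_loadgen


class DummyRunner:
    def __init__(self):
        self.q = queue.Queue()
        self.count = 0

    def start_worker(self):
        threading.Thread(target=self.worker, daemon=True).start()

    def enqueue(self, query_samples):
        for qs in query_samples:
            self.q.put(qs)

    def worker(self):
        while True:
            qs = self.q.get()
            self.count += 1
            # Fake "result" bytes for this sample.
            result = array.array("B", [self.count % 256])
            ptr, n = result.buffer_info()
            mlperf_loadgen.QuerySamplesComplete(
                [mlperf_loadgen.QuerySampleResponse(qs.id, ptr, n)])
            self.q.task_done()

    def load_samples_to_ram(self, sample_indices):
        pass

    def unload_samples_from_ram(self, sample_indices):
        pass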
def main():
    args = get_args()
    log.info(args)

    # find backend
    backend = get_backend(args.backend)

    # override image format if given
    image_format = args.data_format if args.data_format else backend.image_format()

    # dataset to use
    wanted_dataset, pre_proc, post_proc, kwargs = SUPPORTED_DATASETS[args.dataset]
    ds = wanted_dataset(data_path=args.dataset_path,
                        image_list=args.dataset_list,
                        name=args.dataset,
                        image_format=image_format,
                        pre_process=pre_proc,
                        use_cache=args.cache,
                        count=args.count, **kwargs)

    # load model to backend
    model = backend.load(args.model, inputs=args.inputs, outputs=args.outputs)
    final_results = {
        "runtime": model.name(),
        "version": model.version(),
        "time": int(time.time()),
        "cmdline": str(args),
    }

    #
    # make one pass over the dataset to validate accuracy
    #
    count = args.count if args.count else ds.get_item_count()

    runner = Runner(model, ds, args.threads, post_proc=post_proc)
    runner.start_pool()

    # warmup
    log.info("warmup ...")
    ds.load_query_samples([0])
    for _ in range(50):
        img, _ = ds.get_samples([0])
        _ = backend.predict({backend.inputs[0]: img})
    ds.unload_query_samples(None)

    def issue_query(query_samples):
        idx = [q.index for q in query_samples]
        query_id = [q.id for q in query_samples]
        data, label = ds.get_samples(idx)
        runner.enqueue(query_id, idx, data, label)

    def process_latencies(latencies_ns):
        global last_timeing
        # convert nanoseconds to seconds
        last_timeing = [t / 1e9 for t in latencies_ns]

    sut = lg.ConstructSUT(issue_query, process_latencies)
    qsl = lg.ConstructQSL(count, count,
                          ds.load_query_samples, ds.unload_query_samples)

    scenarios = [
        lg.TestScenario.SingleStream,
        lg.TestScenario.MultiStream,
        lg.TestScenario.Server,
        # lg.TestScenario.Offline,
    ]
    for scenario in scenarios:
        for target_latency in args.max_latency:
            log.info("starting {}, latency={}".format(scenario, target_latency))
            settings = lg.TestSettings()
            settings.scenario = scenario

            if args.qps:
                settings.enable_spec_overrides = True
                qps = float(args.qps)
                settings.server_target_qps = qps
                settings.offline_expected_qps = qps

            if args.time:
                settings.enable_spec_overrides = True
                settings.override_min_duration_ms = args.time * MILLI_SEC
                settings.override_max_duration_ms = args.time * MILLI_SEC
                qps = args.qps or 100
                settings.override_min_query_count = int(qps * args.time)
                settings.override_max_query_count = int(qps * args.time)

            if args.time or args.qps:
                settings.mode = lg.TestMode.PerformanceOnly
            # FIXME: add SubmissionRun once available

            settings.enable_spec_overrides = True  # FIXME: needed because of override_target_latency_ns
            settings.single_stream_expected_latency_ns = int(target_latency * NANO_SEC)
            settings.override_target_latency_ns = int(target_latency * NANO_SEC)

            # reset result capture
            result_dict = {"good": 0, "total": 0}
            runner.start_run(result_dict, True)
            start = time.time()
            lg.StartTest(sut, qsl, settings)

            # aggregate results
            post_proc.finalize(result_dict, ds)
            add_results(final_results, "{}-{}".format(scenario, target_latency),
                        result_dict, last_timeing, time.time() - start)

    #
    # write final results
    #
    if args.output:
        with open(args.output, "w") as f:
            json.dump(final_results, f, sort_keys=True, indent=4)

    runner.finish()
    lg.DestroyQSL(qsl)
    lg.DestroySUT(sut)
def main():
    global last_timeing
    args = get_args()
    log.info(args)

    # find backend
    backend = get_backend(args.backend)

    # override image format if given
    image_format = args.data_format if args.data_format else backend.image_format()

    # dataset to use
    wanted_dataset, pre_proc, post_proc, kwargs = SUPPORTED_DATASETS[args.dataset]
    ds = wanted_dataset(data_path=args.dataset_path,
                        image_list=args.dataset_list,
                        name=args.dataset,
                        image_format=image_format,
                        pre_process=pre_proc,
                        use_cache=args.cache,
                        count=args.count, **kwargs)

    # load model to backend
    model = backend.load(args.model, inputs=args.inputs, outputs=args.outputs)
    final_results = {
        "runtime": model.name(),
        "version": model.version(),
        "time": int(time.time()),
        "cmdline": str(args),
    }

    #
    # make one pass over the dataset to validate accuracy
    #
    count = args.count if args.count else ds.get_item_count()

    runner = Runner(model, ds, args.threads, post_proc=post_proc)

    #
    # warmup
    #
    log.info("warmup ...")
    ds.load_query_samples([0])
    for _ in range(5):
        img, _ = ds.get_samples([0])
        _ = backend.predict({backend.inputs[0]: img})
    ds.unload_query_samples(None)

    if args.accuracy:
        #
        # accuracy pass
        #
        log.info("starting accuracy pass on {} items".format(count))
        runner.start_pool(nolg=True)
        result_dict = {"good": 0, "total": 0, "scenario": "Accuracy", "timing": []}
        runner.start_run(result_dict, True)
        start = time.time()
        for idx in range(0, count):
            ds.load_query_samples([idx])
            data, label = ds.get_samples([idx])
            runner.enqueue([idx], [idx], data, label)
        runner.finish()
        # aggregate results
        post_proc.finalize(result_dict, ds, output_dir=os.path.dirname(args.output))
        last_timeing = result_dict["timing"]
        del result_dict["timing"]
        add_results(final_results, "Accuracy", result_dict,
                    last_timeing, time.time() - start)

    #
    # run the benchmark with timing
    #
    runner.start_pool()

    def issue_query(query_samples):
        idx = [q.index for q in query_samples]
        query_id = [q.id for q in query_samples]
        data, label = ds.get_samples(idx)
        runner.enqueue(query_id, idx, data, label)

    def process_latencies(latencies_ns):
        global last_timeing
        last_timeing = [t / 1e9 for t in latencies_ns]

    sut = lg.ConstructSUT(issue_query, process_latencies)
    qsl = lg.ConstructQSL(count, min(count, 1000),
                          ds.load_query_samples, ds.unload_query_samples)

    for scenario in args.scenario:
        for target_latency in args.max_latency:
            log.info("starting {}, latency={}".format(scenario, target_latency))
            settings = lg.TestSettings()
            log.info(scenario)
            if str(scenario) == 'TestMode.AccuracyOnly':
                settings.mode = scenario
            else:
                settings.scenario = scenario

            if args.qps:
                settings.enable_spec_overrides = True
                qps = float(args.qps)
                settings.server_target_qps = qps
                settings.offline_expected_qps = qps

            if args.time:
                settings.enable_spec_overrides = True
                settings.override_min_duration_ms = args.time * MILLI_SEC
                settings.override_max_duration_ms = args.time * MILLI_SEC
                qps = args.qps or 100
                settings.override_min_query_count = qps * args.time
                settings.override_max_query_count = qps * args.time

            # note the parentheses: performance mode applies when either --time
            # or --qps is given, as long as this is not an accuracy-only run
            if (args.time or args.qps) and str(scenario) != 'TestMode.AccuracyOnly':
                settings.mode = lg.TestMode.PerformanceOnly
            # FIXME: add SubmissionRun once available

            settings.enable_spec_overrides = True
            settings.single_stream_expected_latency_ns = int(target_latency * NANO_SEC)
            settings.override_target_latency_ns = int(target_latency * NANO_SEC)

            result_dict = {"good": 0, "total": 0, "scenario": str(scenario)}
            runner.start_run(result_dict, False)
            lg.StartTest(sut, qsl, settings)
            add_results(final_results, "{}-{}".format(scenario, target_latency),
                        result_dict, last_timeing, time.time() - ds.last_loaded)

    #
    # write final results
    #
    if args.output:
        with open(args.output, "w") as f:
            json.dump(final_results, f, sort_keys=True, indent=4)

    runner.finish()
    lg.DestroyQSL(qsl)
    lg.DestroySUT(sut)
def main():
    global qcount
    global num_sockets
    global cpus_per_socket
    global cpus_per_process
    global cpus_per_instance
    global total_instances
    global start_time
    global item_total
    global last_timeing

    args = get_args()
    log.info(args)

    config = os.path.abspath(args.config)
    user_config = os.path.abspath(args.user_config)
    if not os.path.exists(config):
        log.error("{} not found".format(config))
        sys.exit(1)
    if not os.path.exists(user_config):
        log.error("{} not found".format(user_config))
        sys.exit(1)

    if args.output:
        output_dir = os.path.abspath(args.output)
        os.makedirs(output_dir, exist_ok=True)
        if os.path.exists("./audit.config"):
            copyfile("./audit.config", output_dir + "/audit.config")
        os.chdir(output_dir)

    settings = lg.TestSettings()
    settings.FromConfig(config, args.model, args.scenario)
    settings.FromConfig(user_config, args.model, args.scenario)
    settings.mode = lg.TestMode.PerformanceOnly

    cpus_for_loadgen = 1
    left_cores = cpus_per_socket * num_sockets - total_procs * cpus_per_process
    first_instance_start_core = cpus_for_loadgen
    if left_cores > cpus_for_loadgen:
        first_instance_start_core = 0
        cpus_for_loadgen = left_cores

    total_instances = 0
    instances_per_proc = (cpus_per_process // cpus_per_instance)
    for i in range(total_procs):
        if i == 0 and first_instance_start_core > 0:
            total_instances = total_instances + (
                (cpus_per_process - first_instance_start_core) // cpus_per_instance)
            if (cpus_per_instance - first_instance_start_core) >= (cpus_per_instance // 2):
                total_instances = total_instances + 1
        else:
            total_instances = total_instances + instances_per_proc
    # print("Setup {} Instances !!".format(total_instances))

    lock = multiprocessing.Lock()
    barrier = multiprocessing.Barrier(total_instances)
    init_counter = multiprocessing.Value("i", 0)
    total_samples = multiprocessing.Value("i", 0)
    finished_samples = multiprocessing.Value("i", 0)
    dsQueue = multiprocessing.Queue()
    numOutQ = num_sockets
    outQueues = [multiprocessing.Queue() for i in range(numOutQ)]
    # inQueue = multiprocessing.JoinableQueue()
    inQueue = multiprocessing.Queue()
    consumers = [Consumer(inQueue, outQueues[i % numOutQ], dsQueue, lock,
                          init_counter, finished_samples, barrier, i, args,
                          settings.min_query_count, first_instance_start_core)
                 for i in range(total_procs)]
    for c in consumers:
        c.start()

    # Wait until subprocesses are ready
    while init_counter.value < total_procs:
        time.sleep(2)

    import torch
    import criteo
    torch.set_num_threads(cpus_per_socket * num_sockets)

    dlrm_dataset = get_dataset(args)
    total_samples.value = dlrm_dataset.get_item_count()

    scenario = SCENARIO_MAP[args.scenario]
    runner_map = {
        lg.TestScenario.Server: QueueRunner,
        lg.TestScenario.Offline: QueueRunner
    }
    settings.scenario = scenario
    runner = runner_map[scenario](inQueue, dlrm_dataset, total_samples.value,
                                  args.max_sample_size, args.max_batchsize)

    # Start response threads
    response_workers = [threading.Thread(
        target=response_loadgen, args=(outQueues[i], args.accuracy))
        for i in range(numOutQ)]
    for response_worker in response_workers:
        response_worker.daemon = True
        response_worker.start()

    def issue_queries(response_ids, query_sample_indexes):
        runner.enqueue(response_ids, query_sample_indexes)

    def flush_queries():
        runner.unload_query_samples()

    def process_latencies(latencies_ns):
        # called by loadgen to show us the recorded latencies
        global last_timeing
        last_timeing = [t / NANO_SEC for t in latencies_ns]

    if args.accuracy:
        settings.mode = lg.TestMode.AccuracyOnly
        settings.performance_sample_count_override = total_samples.value
    if args.find_peak_performance:
        settings.mode = lg.TestMode.FindPeakPerformance

    if args.duration:
        settings.min_duration_ms = args.duration
        settings.max_duration_ms = args.duration

    if args.target_qps:
        settings.server_target_qps = float(args.target_qps)
        settings.offline_expected_qps = float(args.target_qps)

    if args.count_queries:
        settings.min_query_count = args.count_queries
        settings.max_query_count = args.count_queries

    if args.samples_per_query_multistream:
        settings.multi_stream_samples_per_query = args.samples_per_query_multistream

    if args.max_latency:
        settings.server_target_latency_ns = int(args.max_latency * NANO_SEC)
        settings.multi_stream_target_latency_ns = int(args.max_latency * NANO_SEC)

    if args.accuracy:
        qcount = total_samples.value
    else:
        qcount = settings.min_query_count

    def load_query_samples(sample_list):
        # Wait until subprocesses are ready
        global start_time
        global total_instances
        for _ in range(total_instances):
            dsQueue.put(sample_list)
        while init_counter.value < total_procs + total_instances:
            time.sleep(2)
        start_time = time.time()

    def unload_query_samples(sample_list):
        pass

    sut = lg.ConstructFastSUT(issue_queries, flush_queries, process_latencies)
    qsl = lg.ConstructQSL(total_samples.value,
                          min(total_samples.value, args.samples_per_query_offline),
                          load_query_samples, unload_query_samples)

    log.info("starting {}".format(scenario))
    result_dict = {"good": 0, "total": 0, "roc_auc": 0, "scenario": str(scenario)}
    torch.set_num_threads(cpus_for_loadgen)
    lg.StartTest(sut, qsl, settings)

    if not last_timeing:
        last_timeing = item_timing

    if args.accuracy:
        result_dict["good"] = item_good
        result_dict["total"] = item_total
        result_dict["roc_auc"] = criteo.auc_score(item_results)

    final_results = {
        "runtime": "pytorch-native-dlrm",
        "version": torch.__version__,
        "time": int(time.time()),
        "cmdline": str(args),
    }
    add_results(final_results, "{}".format(scenario),
                result_dict, last_timeing, time.time() - start_time,
                args.accuracy)

    # inQueue.join()
    for _ in range(total_instances):
        inQueue.put(None)
    for c in consumers:
        c.join()
    for i in range(numOutQ):
        outQueues[i].put(None)

    lg.DestroyQSL(qsl)
    lg.DestroyFastSUT(sut)

    # write final results
    if args.output:
        with open("results.json", "w") as f:
            json.dump(final_results, f, sort_keys=True, indent=4)
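# The response_loadgen threads above drain the per-socket output queues and
# hand results back to loadgen. A minimal sketch of such a worker, assuming
# each queue item carries (query_ids, results) with numpy-like result arrays,
# and that None signals shutdown (matching the outQueues[i].put(None) teardown
# above); the array/buffer_info pattern follows the reference harnesses.
import array

import mlperf_loadgen as lg


def response_loadgen(out_queue, accuracy):
    while True:
        item = out_queue.get()
        if item is None:          # shutdown sentinel
            break
        query_ids, results = item
        responses = []
        buffers = []              # keep buffers alive until completion
        for qid, res in zip(query_ids, results):
            buf = array.array("B", res.tobytes())
            buffers.append(buf)
            ptr, n = buf.buffer_info()
            responses.append(lg.QuerySampleResponse(qid, ptr, n * buf.itemsize))
        lg.QuerySamplesComplete(responses)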
def main():
    global last_timeing
    args = get_args()
    log.info(args)

    # find backend
    backend = get_backend(args.backend)

    # override image format if given
    image_format = args.data_format if args.data_format else backend.image_format()

    # dataset to use
    wanted_dataset, pre_proc, post_proc, kwargs = SUPPORTED_DATASETS[args.dataset]
    ds = wanted_dataset(data_path=args.dataset_path,
                        image_list=args.dataset_list,
                        name=args.dataset,
                        image_format=image_format,
                        pre_process=pre_proc,
                        use_cache=args.cache,
                        count=args.count, **kwargs)

    # load model to backend
    model = backend.load(args.model, inputs=args.inputs, outputs=args.outputs)
    final_results = {
        "runtime": model.name(),
        "version": model.version(),
        "time": int(time.time()),
        "cmdline": str(args),
    }

    #
    # make one pass over the dataset to validate accuracy
    #
    count = args.count if args.count else ds.get_item_count()

    if args.accuracy:
        #
        # accuracy pass
        #
        log.info("starting accuracy pass on {} items".format(count))
        last_timeing = []
        runner = RunnerBase(model, ds, args.threads, post_proc=post_proc)
        result_dict = {"good": 0, "total": 0, "scenario": "Accuracy"}
        runner.start_run(result_dict, True)
        start = time.time()
        for idx in range(0, count):
            ds.load_query_samples([idx])
            data, label = ds.get_samples([idx])
            start_one = time.time()
            runner.enqueue([idx], [idx], data, label)
            last_timeing.append(time.time() - start_one)
        runner.finish()
        # aggregate results
        post_proc.finalize(result_dict, ds, output_dir=os.path.dirname(args.output))
        add_results(final_results, "Accuracy", result_dict,
                    last_timeing, time.time() - start)

    # warmup
    ds.load_query_samples([0])
    for _ in range(5):
        img, _ = ds.get_samples([0])
        _ = backend.predict({backend.inputs[0]: img})
    ds.unload_query_samples(None)

    for scenario in args.scenario:
        runner_map = {
            lg.TestScenario.SingleStream: RunnerBase,
            lg.TestScenario.MultiStream: QueueRunner,
            lg.TestScenario.Server: QueueRunner,
            lg.TestScenario.Offline: QueueRunner
        }
        runner = runner_map[scenario](model, ds, args.threads, post_proc=post_proc)

        def issue_query(query_samples):
            # called by loadgen to issue queries
            idx = [q.index for q in query_samples]
            query_id = [q.id for q in query_samples]
            data, label = ds.get_samples(idx)
            runner.enqueue(query_id, idx, data, label)

        def process_latencies(latencies_ns):
            # called by loadgen to show us the recorded latencies
            global last_timeing
            last_timeing = [t / 1e9 for t in latencies_ns]

        settings = lg.TestSettings()
        settings.enable_spec_overrides = True
        settings.scenario = scenario
        settings.mode = lg.TestMode.PerformanceOnly
        settings.multi_stream_samples_per_query = 8

        if args.time:
            # override the time we want to run
            settings.enable_spec_overrides = True
            settings.override_min_duration_ms = args.time * MILLI_SEC
            settings.override_max_duration_ms = args.time * MILLI_SEC

        if args.qps:
            qps = float(args.qps)
            settings.server_target_qps = qps
            settings.offline_expected_qps = qps

        # mlperf rules - min queries
        if scenario == lg.TestScenario.SingleStream:
            settings.override_min_query_count = args.queries_single
            settings.override_max_query_count = args.queries_single
        else:
            settings.override_min_query_count = args.queries_multi
            settings.override_max_query_count = args.queries_multi

        sut = lg.ConstructSUT(issue_query, process_latencies)
        qsl = lg.ConstructQSL(count, min(count, 1000),
                              ds.load_query_samples, ds.unload_query_samples)

        for target_latency in args.max_latency:
            log.info("starting {}, latency={}".format(scenario, target_latency))
            settings.single_stream_expected_latency_ns = int(target_latency * NANO_SEC)
            settings.override_target_latency_ns = int(target_latency * NANO_SEC)

            result_dict = {"good": 0, "total": 0, "scenario": str(scenario)}
            runner.start_run(result_dict, False)
            lg.StartTest(sut, qsl, settings)
            add_results(final_results, "{}-{}".format(scenario, target_latency),
                        result_dict, last_timeing, time.time() - ds.last_loaded)

        runner.finish()
        lg.DestroyQSL(qsl)
        lg.DestroySUT(sut)

    #
    # write final results
    #
    if args.output:
        with open(args.output, "w") as f:
            json.dump(final_results, f, sort_keys=True, indent=4)
def main():
    args = get_args()
    print(args)

    # find backend
    backend = get_backend(args.backend)

    # override image format if given
    image_format = args.data_format if args.data_format else backend.image_format()

    # dataset to use
    wanted_dataset, preprocessor, postprocessor, kwargs = SUPPORTED_DATASETS[args.dataset]
    ds = wanted_dataset(data_path=args.dataset_path,
                        image_list=args.dataset_list,
                        name=args.dataset,
                        image_format=image_format,
                        pre_process=preprocessor,
                        use_cache=args.cache,
                        count=args.count, **kwargs)

    # load model to backend
    model = backend.load(args.model, inputs=args.inputs, outputs=args.outputs)
    final_results = {
        "runtime": model.name(),
        "version": model.version(),
        "time": int(time.time()),
        "cmdline": str(args),
    }

    #
    # make one pass over the dataset to validate accuracy
    #
    count = args.count if args.count else ds.get_item_count()

    runner = Runner(model, ds, args.threads, post_process=postprocessor)
    runner.start_pool()

    # warmup
    log.info("warmup ...")
    ds.load_query_samples([0])
    for _ in range(100):
        img, _ = ds.get_samples([0])
        _ = backend.predict({backend.inputs[0]: img})

    def issue_query(query_samples):
        idx = [q.index for q in query_samples]
        query_id = [q.id for q in query_samples]
        data, label = ds.get_samples(idx)
        runner.enqueue(query_id, data, label)

    sut = lg.ConstructSUT(issue_query)
    qsl = lg.ConstructQSL(count, args.time,
                          ds.load_query_samples, ds.unload_query_samples)

    scenarios = [
        # lg.TestScenario.SingleStream,
        lg.TestScenario.MultiStream,
        # lg.TestScenario.Cloud,
        # lg.TestScenario.Offline,
    ]
    for scenario in scenarios:
        for target_latency in args.max_latency:
            log.info("starting {}, latency={}".format(scenario, target_latency))
            settings = lg.TestSettings()
            settings.scenario = scenario
            settings.mode = lg.TestMode.SubmissionRun
            settings.samples_per_query = 4  # FIXME: we don't want to know about this
            settings.target_qps = 1000      # FIXME: we don't want to know about this
            settings.target_latency_ns = int(target_latency * 1000000000)

            result_list = []
            result_dict = {"good": 0, "total": 0}
            runner.start_run(result_list, result_dict)

            start = time.time()
            lg.StartTest(sut, qsl, settings)
            add_results(final_results, "{}-{}".format(scenario, target_latency),
                        result_dict, result_list, time.time() - start)

    runner.finish()
    lg.DestroyQSL(qsl)
    lg.DestroySUT(sut)

    #
    # write final results
    #
    if args.output:
        with open(args.output, "w") as f:
            json.dump(final_results, f, sort_keys=True, indent=4)
def main():
    global last_timeing
    args = get_args()
    log.info(args)

    # find backend
    backend = get_backend(args.backend, args.dataset, args.max_ind_range,
                          args.data_sub_sample_rate, args.use_gpu)

    # dataset to use
    wanted_dataset, pre_proc, post_proc, kwargs = SUPPORTED_DATASETS[args.dataset]

    # --count-samples can be used to limit the number of samples used for testing
    ds = wanted_dataset(
        data_path=args.dataset_path,
        name=args.dataset,
        pre_process=pre_proc,  # currently an identity function
        use_cache=args.cache,  # currently not used
        count=args.count_samples,
        samples_to_aggregate_fix=args.samples_to_aggregate_fix,
        samples_to_aggregate_min=args.samples_to_aggregate_min,
        samples_to_aggregate_max=args.samples_to_aggregate_max,
        samples_to_aggregate_quantile_file=args.samples_to_aggregate_quantile_file,
        samples_to_aggregate_trace_file=args.samples_to_aggregate_trace_file,
        test_num_workers=args.test_num_workers,
        max_ind_range=args.max_ind_range,
        sub_sample_rate=args.data_sub_sample_rate,
        mlperf_bin_loader=args.mlperf_bin_loader,
        **kwargs)

    # load model to backend
    model = backend.load(args.model_path, inputs=args.inputs, outputs=args.outputs)
    final_results = {
        "runtime": model.name(),
        "version": model.version(),
        "time": int(time.time()),
        "cmdline": str(args),
    }

    mlperf_conf = os.path.abspath(args.mlperf_conf)
    if not os.path.exists(mlperf_conf):
        log.error("{} not found".format(mlperf_conf))
        sys.exit(1)

    user_conf = os.path.abspath(args.user_conf)
    if not os.path.exists(user_conf):
        log.error("{} not found".format(user_conf))
        sys.exit(1)

    if args.output:
        output_dir = os.path.abspath(args.output)
        os.makedirs(output_dir, exist_ok=True)
        os.chdir(output_dir)

    #
    # make one pass over the dataset to validate accuracy
    #
    count = ds.get_item_count()

    # warmup
    ds.load_query_samples([0])
    for _ in range(5):
        batch_dense_X, batch_lS_o, batch_lS_i, _, _ = ds.get_samples([0])
        _ = backend.predict(batch_dense_X, batch_lS_o, batch_lS_i)
    ds.unload_query_samples(None)

    scenario = SCENARIO_MAP[args.scenario]
    runner_map = {
        lg.TestScenario.SingleStream: RunnerBase,
        lg.TestScenario.MultiStream: QueueRunner,
        lg.TestScenario.Server: QueueRunner,
        lg.TestScenario.Offline: QueueRunner
    }
    runner = runner_map[scenario](model, ds, args.threads,
                                  post_proc=post_proc,
                                  max_batchsize=args.max_batchsize)

    def issue_queries(query_samples):
        runner.enqueue(query_samples)

    def flush_queries():
        pass

    def process_latencies(latencies_ns):
        # called by loadgen to show us the recorded latencies
        global last_timeing
        last_timeing = [t / NANO_SEC for t in latencies_ns]

    settings = lg.TestSettings()
    settings.FromConfig(mlperf_conf, args.model_path, args.scenario)
    settings.FromConfig(user_conf, args.model_path, args.scenario)
    settings.scenario = scenario
    settings.mode = lg.TestMode.PerformanceOnly
    if args.accuracy:
        settings.mode = lg.TestMode.AccuracyOnly
    if args.find_peak_performance:
        settings.mode = lg.TestMode.FindPeakPerformance

    if args.duration:
        settings.min_duration_ms = args.duration
        settings.max_duration_ms = args.duration

    if args.target_qps:
        settings.server_target_qps = float(args.target_qps)
        settings.offline_expected_qps = float(args.target_qps)

    if args.count_queries:
        settings.min_query_count = args.count_queries
        settings.max_query_count = args.count_queries

    if args.samples_per_query_multistream:
        settings.multi_stream_samples_per_query = args.samples_per_query_multistream

    if args.max_latency:
        settings.server_target_latency_ns = int(args.max_latency * NANO_SEC)
        settings.multi_stream_target_latency_ns = int(args.max_latency * NANO_SEC)

    sut = lg.ConstructSUT(issue_queries, flush_queries, process_latencies)
    qsl = lg.ConstructQSL(count, min(count, args.samples_per_query_offline),
                          ds.load_query_samples, ds.unload_query_samples)

    log.info("starting {}".format(scenario))
    result_dict = {"good": 0, "total": 0, "roc_auc": 0, "scenario": str(scenario)}
    runner.start_run(result_dict, args.accuracy)
    lg.StartTest(sut, qsl, settings)

    if not last_timeing:
        last_timeing = runner.result_timing
    if args.accuracy:
        post_proc.finalize(result_dict, ds, output_dir=args.output)
    add_results(final_results, "{}".format(scenario),
                result_dict, last_timeing, time.time() - ds.last_loaded,
                args.accuracy)

    runner.finish()
    lg.DestroyQSL(qsl)
    lg.DestroySUT(sut)

    #
    # write final results
    #
    if args.output:
        with open("results.json", "w") as f:
            json.dump(final_results, f, sort_keys=True, indent=4)
def main():
    global num_sockets
    global start_time
    global item_total
    global last_timeing

    args = get_args()
    log.info(args)

    config = os.path.abspath(args.config)
    user_config = os.path.abspath(args.user_config)
    if not os.path.exists(config):
        log.error("{} not found".format(config))
        sys.exit(1)
    if not os.path.exists(user_config):
        log.error("{} not found".format(user_config))
        sys.exit(1)

    if args.output:
        output_dir = os.path.abspath(args.output)
        os.makedirs(output_dir, exist_ok=True)
        os.chdir(output_dir)

    lock = multiprocessing.Lock()
    init_counter = multiprocessing.Value("i", 0)
    total_samples = multiprocessing.Value("i", 0)
    dsQueue = multiprocessing.Queue()
    outQueue = multiprocessing.Queue()
    inQueue = multiprocessing.JoinableQueue(num_sockets * 4)
    consumers = [
        Consumer(inQueue, outQueue, dsQueue, lock, init_counter,
                 total_samples, i, args)
        for i in range(num_sockets)
    ]
    for c in consumers:
        c.start()

    # Wait until subprocesses are ready
    while init_counter.value < num_sockets:
        time.sleep(2)

    # Start response thread
    response_worker = threading.Thread(target=response_loadgen,
                                       args=(outQueue, args.accuracy))
    response_worker.daemon = True
    response_worker.start()

    scenario = SCENARIO_MAP[args.scenario]
    runner_map = {
        lg.TestScenario.Server: QueueRunner,
        lg.TestScenario.Offline: QueueRunner
    }
    runner = runner_map[scenario](inQueue, max_batchsize=args.max_batchsize)

    def issue_queries(response_ids, query_sample_indexes):
        runner.enqueue(response_ids, query_sample_indexes)

    def flush_queries():
        pass

    def process_latencies(latencies_ns):
        # called by loadgen to show us the recorded latencies
        global last_timeing
        last_timeing = [t / NANO_SEC for t in latencies_ns]

    settings = lg.TestSettings()
    settings.FromConfig(config, args.model, args.scenario)
    settings.FromConfig(user_config, args.model, args.scenario)
    settings.scenario = scenario
    settings.mode = lg.TestMode.PerformanceOnly
    if args.accuracy:
        settings.mode = lg.TestMode.AccuracyOnly
        settings.performance_sample_count_override = total_samples.value
    if args.find_peak_performance:
        settings.mode = lg.TestMode.FindPeakPerformance

    if args.duration:
        settings.min_duration_ms = args.duration
        settings.max_duration_ms = args.duration

    if args.target_qps:
        settings.server_target_qps = float(args.target_qps)
        settings.offline_expected_qps = float(args.target_qps)

    if args.count_queries:
        settings.min_query_count = args.count_queries
        settings.max_query_count = args.count_queries

    if args.samples_per_query_multistream:
        settings.multi_stream_samples_per_query = args.samples_per_query_multistream

    if args.max_latency:
        settings.server_target_latency_ns = int(args.max_latency * NANO_SEC)
        settings.multi_stream_target_latency_ns = int(args.max_latency * NANO_SEC)

    def load_query_samples(sample_list):
        # Wait until subprocesses are ready
        global start_time
        for _ in range(num_sockets):
            dsQueue.put(sample_list)
        while init_counter.value < 2 * num_sockets:
            time.sleep(2)
        start_time = time.time()

    def unload_query_samples(sample_list):
        pass

    import torch
    import criteo

    sut = lg.ConstructFastSUT(issue_queries, flush_queries, process_latencies)
    qsl = lg.ConstructQSL(
        total_samples.value,
        min(total_samples.value, args.samples_per_query_offline),
        load_query_samples, unload_query_samples)

    log.info("starting {}".format(scenario))
    result_dict = {"good": 0, "total": 0, "roc_auc": 0, "scenario": str(scenario)}
    lg.StartTest(sut, qsl, settings)

    if not last_timeing:
        last_timeing = item_timing

    if args.accuracy:
        result_dict["good"] = item_good
        result_dict["total"] = item_total
        result_dict["roc_auc"] = criteo.auc_score(item_results)

    final_results = {
        "runtime": "pytorch-native-dlrm",
        "version": torch.__version__,
        "time": int(time.time()),
        "cmdline": str(args),
    }
    add_results(final_results, "{}".format(scenario),
                result_dict, last_timeing, time.time() - start_time,
                args.accuracy)

    inQueue.join()
    for _ in consumers:
        inQueue.put(None)
    for c in consumers:
        c.join()
    outQueue.put(None)

    lg.DestroyQSL(qsl)
    lg.DestroyFastSUT(sut)

    # write final results
    if args.output:
        with open("results.json", "w") as f:
            json.dump(final_results, f, sort_keys=True, indent=4)
def main():
    global num_ins
    global num_phy_cpus
    global in_queue_cnt
    global out_queue_cnt

    args = get_args()
    log.info(args)

    num_ins = args.num_instance
    num_phy_cpus = args.num_phy_cpus
    log.info('Run with {} instances on {} cpus'.format(num_ins, num_phy_cpus))

    mlperf_conf = os.path.abspath(args.mlperf_conf)
    if not os.path.exists(mlperf_conf):
        log.error("{} not found".format(mlperf_conf))
        sys.exit(1)

    user_conf = os.path.abspath(args.user_conf)
    if not os.path.exists(user_conf):
        log.error("{} not found".format(user_conf))
        sys.exit(1)

    image_format = 'NCHW'
    dataset = "imagenet"
    wanted_dataset, pre_proc, post_proc, kwargs = SUPPORTED_DATASETS[dataset]
    ds = wanted_dataset(data_path=args.dataset_path,
                        image_list=args.dataset_list,
                        name=dataset,
                        image_format=image_format,
                        pre_process=pre_proc,
                        use_cache=args.cache,
                        cache_dir=args.cache_dir,
                        count=args.count,
                        use_int8=args.use_int8_dataset,
                        num_workers=num_phy_cpus,
                        **kwargs)

    # Establish communication queues
    log.info('Start consumer queue and response thread')
    lock = multiprocessing.Lock()
    init_counter = multiprocessing.Value("i", 0)
    in_queue = multiprocessing.JoinableQueue()
    out_queue = multiprocessing.Queue()
    ds_queue = multiprocessing.Queue()

    # Start consumers
    consumers = [Consumer(in_queue, out_queue, ds_queue, lock, init_counter, i, args)
                 for i in range(num_ins)]
    for c in consumers:
        c.start()

    # Wait until all sub-processes are ready
    block_until(init_counter, num_ins, 2)

    # Start response thread
    response_worker = threading.Thread(target=response_loadgen, args=(out_queue,))
    response_worker.daemon = True
    response_worker.start()

    scenario = SCENARIO_MAP[args.scenario]
    runner = QueueRunner(in_queue, args.batch_size)

    def issue_queries(response_ids, query_sample_indexes):
        runner.put(response_ids, query_sample_indexes)

    def flush_queries():
        pass

    def process_latencies(latencies_ns):
        log.info("Average latency: {}".format(np.mean(latencies_ns)))
        log.info("Median latency: {}".format(np.percentile(latencies_ns, 50)))
        log.info("90 percentile latency: {}".format(np.percentile(latencies_ns, 90)))

    def load_query_samples(sample_list):
        for _ in range(num_ins):
            ds_queue.put(sample_list)
        block_until(init_counter, 2 * num_ins, 2)

    def unload_query_samples(sample_list):
        pass

    settings = lg.TestSettings()
    settings.FromConfig(mlperf_conf, "resnet50", args.scenario)
    settings.FromConfig(user_conf, "resnet50", args.scenario)
    settings.scenario = scenario
    settings.mode = lg.TestMode.PerformanceOnly
    if args.accuracy:
        settings.mode = lg.TestMode.AccuracyOnly
    if args.find_peak_performance:
        settings.mode = lg.TestMode.FindPeakPerformance

    if args.qps:
        qps = float(args.qps)
        settings.server_target_qps = qps
        settings.offline_expected_qps = qps

    count = ds.get_item_count()
    perf_count = 1024
    if args.accuracy:
        perf_count = count

    sut = lg.ConstructFastSUT(issue_queries, flush_queries, process_latencies)
    qsl = lg.ConstructQSL(count, perf_count, load_query_samples, unload_query_samples)

    log.info("starting {}".format(scenario))
    lg.StartTest(sut, qsl, settings)

    # Wait until out_queue is drained
    while out_queue_cnt < in_queue_cnt:
        time.sleep(0.2)

    in_queue.join()
    for i in range(num_ins):
        in_queue.put('DONE')
    for c in consumers:
        c.join()
    out_queue.put('DONE')

    if args.accuracy:
        output_file = 'accuracy.txt'
        if args.output_file:
            output_file = args.output_file
        cmd = "python tools/accuracy-imagenet.py " \
              "--mlperf-accuracy-file=mlperf_log_accuracy.json " \
              "--imagenet-val-file=val_map.txt --output-file={}".format(output_file)
        cmd = cmd.split(' ')
        subprocess.check_call(cmd)

    lg.DestroyQSL(qsl)
    lg.DestroyFastSUT(sut)
    log.info('Test done.')
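# block_until (used above) is a small synchronization helper whose definition
# is not shown. A sketch that matches its call sites: poll a shared
# multiprocessing counter until it reaches the expected value, sleeping
# `interval` seconds between checks.
import time


def block_until(counter, expected, interval):
    while counter.value < expected:
        time.sleep(interval)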