def __init__(self, mxnet_vocab=None, perf_count=None, logger=None):
    """Preprocess the SQuAD 1.1 dev split and register it with LoadGen.

    Args:
        mxnet_vocab: Path of the JSON vocabulary file used to build the
            BERT vocabulary. It defaults to ``None`` but is handed straight
            to ``open``, so a real path must be supplied.
        perf_count: Performance sample count given to ``lg.ConstructQSL``.
            When ``None`` the number of preprocessed features is used.
        logger: Optional logger; progress messages are emitted only when
            it is truthy.
    """
    self.logger = logger
    if self.logger:
        self.logger.info("Constructing QSL...")

    if self.logger:
        self.logger.info("Creating tokenizer...")
    with open(mxnet_vocab, 'r') as f:
        vocab = nlp.vocab.BERTVocab.from_json(f.read())
    tokenizer = nlp.data.BERTTokenizer(vocab=vocab, lower=True)

    if self.logger:
        self.logger.info("Reading examples...")
    dev_path = os.path.join(os.getcwd(), 'build/data')
    dev_data = SQuAD('dev', version='1.1', root=dev_path)
    # max_seq_length, doc_stride and max_query_length are read from the
    # enclosing module scope.
    dev_data_transform = preprocess_dataset(
        tokenizer,
        dev_data,
        max_seq_length=max_seq_length,
        doc_stride=doc_stride,
        max_query_length=max_query_length,
        input_features=True)

    self.eval_features = dev_data_transform
    self.count = len(self.eval_features)
    self.perf_count = perf_count if perf_count is not None else self.count
    self.qsl = lg.ConstructQSL(self.count, self.perf_count,
                               self.load_query_samples,
                               self.unload_query_samples)

    if self.logger:
        self.logger.info("Finished constructing QSL.")
def benchmark_using_loadgen(scenario_str, mode_str, samples_in_mem, config_filepath):
    """Perform the benchmark using the Python API of the LoadGen library.

    Args:
        scenario_str: 'SingleStream', 'MultiStream', 'Server' or 'Offline'.
        mode_str: 'AccuracyOnly', 'PerformanceOnly' or 'SubmissionRun'.
        samples_in_mem: Passed to ``lg.ConstructQSL`` as its second
            argument, after the module-level ``dataset_size``.
        config_filepath: LoadGen config file; skipped when falsy.

    Raises:
        KeyError: If ``scenario_str`` or ``mode_str`` is not a known name.
    """
    scenario_by_name = {
        'SingleStream': lg.TestScenario.SingleStream,
        'MultiStream': lg.TestScenario.MultiStream,
        'Server': lg.TestScenario.Server,
        'Offline': lg.TestScenario.Offline,
    }
    chosen_scenario = scenario_by_name[scenario_str]

    mode_by_name = {
        'AccuracyOnly': lg.TestMode.AccuracyOnly,
        'PerformanceOnly': lg.TestMode.PerformanceOnly,
        'SubmissionRun': lg.TestMode.SubmissionRun,
    }
    chosen_mode = mode_by_name[mode_str]

    test_settings = lg.TestSettings()
    if config_filepath:
        test_settings.FromConfig(config_filepath, 'random_model_name', scenario_str)
    # Explicit scenario/mode are applied after the config file is read.
    test_settings.scenario = chosen_scenario
    test_settings.mode = chosen_mode

    system_under_test = lg.ConstructSUT(issue_queries, flush_queries, process_latencies)
    sample_library = lg.ConstructQSL(dataset_size, samples_in_mem,
                                     load_query_samples, unload_query_samples)

    logging_options = lg.LogSettings()
    logging_options.enable_trace = False
    lg.StartTestWithLogSettings(system_under_test, sample_library,
                                test_settings, logging_options)

    lg.DestroyQSL(sample_library)
    lg.DestroySUT(system_under_test)
def __init__(self, perf_count=None):
    """Convert the SQuAD dev examples to features and construct the QSL.

    Args:
        perf_count: Performance sample count given to ``lg.ConstructQSL``;
            when ``None`` the number of converted features is used.
    """
    print("Creating tokenizer...")
    vocab_tokenizer = BertTokenizer("build/data/bert_tf_v1_1_large_fp32_384_v2/vocab.txt")

    print("Reading examples...")
    squad_examples = read_squad_examples(
        input_file="build/data/dev-v1.1.json",
        is_training=False,
        version_2_with_negative=False)

    print("Converting examples to features...")
    collected = []
    # The converter reports each feature through output_fn; collect them
    # directly with the list's bound append method.
    convert_examples_to_features(
        examples=squad_examples,
        tokenizer=vocab_tokenizer,
        max_seq_length=max_seq_length,
        doc_stride=doc_stride,
        max_query_length=max_query_length,
        is_training=False,
        output_fn=collected.append,
        verbose_logging=False)

    print("Constructing QSL...")
    self.eval_features = collected
    self.count = len(collected)
    if perf_count is None:
        perf_count = self.count
    self.perf_count = perf_count
    self.qsl = lg.ConstructQSL(
        self.count,
        self.perf_count,
        self.load_query_samples,
        self.unload_query_samples)
    print("Finished constructing QSL.")
def __init__(self, preprocessed_data_dir, perf_count):
    """Index the preprocessed files and construct the LoadGen QSL.

    Parameters
    ----------
    preprocessed_data_dir: str or PosixPath
        directory that holds ``preprocessed_files.pkl``
    perf_count: int
        performance sample count passed to LoadGen; ``None`` means
        "use the number of preprocessed files"
    """
    print("Constructing QSL...")
    self.preprocessed_data_dir = preprocessed_data_dir

    index_path = Path(self.preprocessed_data_dir) / "preprocessed_files.pkl"
    with open(index_path, "rb") as index_file:
        index = pickle.load(index_file)
    # Only the list of file names is kept from the pickled index.
    self.preprocess_files = index['file_list']

    self.count = len(self.preprocess_files)
    if perf_count is None:
        self.perf_count = self.count
    else:
        self.perf_count = perf_count
    print("Found {:d} preprocessed files".format(self.count))
    print("Using performance count = {:d}".format(self.perf_count))

    self.loaded_files = {}
    self.qsl = lg.ConstructQSL(
        self.count,
        self.perf_count,
        self.load_query_samples,
        self.unload_query_samples)
    print("Finished constructing QSL.")
def benchmark_using_loadgen():
    """Perform the benchmark using the Python API of the LoadGen library.

    The scenario, mode, config paths, model name and QSL sizes all come
    from module-level ``LOADGEN_*`` / ``*_CONF_PATH`` / ``MODEL_NAME`` names.

    Raises:
        KeyError: If ``LOADGEN_SCENARIO`` or ``LOADGEN_MODE`` is unknown.
    """
    scenario_by_name = {
        'SingleStream': lg.TestScenario.SingleStream,
        'MultiStream': lg.TestScenario.MultiStream,
        'Server': lg.TestScenario.Server,
        'Offline': lg.TestScenario.Offline,
    }
    chosen_scenario = scenario_by_name[LOADGEN_SCENARIO]

    mode_by_name = {
        'AccuracyOnly': lg.TestMode.AccuracyOnly,
        'PerformanceOnly': lg.TestMode.PerformanceOnly,
        'SubmissionRun': lg.TestMode.SubmissionRun,
    }
    chosen_mode = mode_by_name[LOADGEN_MODE]

    test_settings = lg.TestSettings()
    # The user config is read after the MLPerf config.
    for conf_path in (MLPERF_CONF_PATH, USER_CONF_PATH):
        test_settings.FromConfig(conf_path, MODEL_NAME, LOADGEN_SCENARIO)
    test_settings.scenario = chosen_scenario
    test_settings.mode = chosen_mode

    system_under_test = lg.ConstructSUT(issue_queries, flush_queries, process_latencies)
    sample_library = lg.ConstructQSL(LOADGEN_DATASET_SIZE, LOADGEN_BUFFER_SIZE,
                                     load_query_samples, unload_query_samples)

    logging_options = lg.LogSettings()
    logging_options.enable_trace = False
    lg.StartTestWithLogSettings(system_under_test, sample_library,
                                test_settings, logging_options)

    lg.DestroyQSL(sample_library)
    lg.DestroySUT(system_under_test)
def benchmark_using_loadgen():
    """Perform the benchmark using the Python API of the LoadGen library.

    Loads ``MODEL_NAME`` through ``torch.hub`` into the module-level
    ``model``, then configures and runs a LoadGen test from the
    module-level ``LOADGEN_*`` settings.
    """
    global model

    # Load the [cached] Torch model. The hub reference stays untagged
    # (master) unless torchvision can be imported, in which case its
    # version is appended as a ':v<version>' tag.
    hub_repo = 'pytorch/vision'
    try:
        import torchvision
        hub_repo += ':v' + torchvision.__version__
    except Exception:
        pass

    model = torch.hub.load(hub_repo, MODEL_NAME, pretrained=True)
    model.eval()

    # move the model to GPU for speed if available
    if USE_CUDA:
        model.to('cuda')

    scenario_by_name = {
        'SingleStream': lg.TestScenario.SingleStream,
        'MultiStream': lg.TestScenario.MultiStream,
        'Server': lg.TestScenario.Server,
        'Offline': lg.TestScenario.Offline,
    }
    chosen_scenario = scenario_by_name[LOADGEN_SCENARIO]

    mode_by_name = {
        'AccuracyOnly': lg.TestMode.AccuracyOnly,
        'PerformanceOnly': lg.TestMode.PerformanceOnly,
        'SubmissionRun': lg.TestMode.SubmissionRun,
    }
    chosen_mode = mode_by_name[LOADGEN_MODE]

    test_settings = lg.TestSettings()
    for conf_path in (MLPERF_CONF_PATH, USER_CONF_PATH):
        test_settings.FromConfig(conf_path, MODEL_NAME, LOADGEN_SCENARIO)
    test_settings.scenario = chosen_scenario
    test_settings.mode = chosen_mode

    if LOADGEN_MULTISTREAMNESS:
        test_settings.multi_stream_samples_per_query = int(LOADGEN_MULTISTREAMNESS)

    if LOADGEN_COUNT_OVERRIDE:
        # Pin the query count by setting min and max to the same value.
        test_settings.min_query_count = int(LOADGEN_COUNT_OVERRIDE)
        test_settings.max_query_count = int(LOADGEN_COUNT_OVERRIDE)

    system_under_test = lg.ConstructSUT(issue_queries, flush_queries, process_latencies)
    sample_library = lg.ConstructQSL(LOADGEN_DATASET_SIZE, LOADGEN_BUFFER_SIZE,
                                     load_query_samples, unload_query_samples)

    logging_options = lg.LogSettings()
    logging_options.enable_trace = False
    lg.StartTestWithLogSettings(system_under_test, sample_library,
                                test_settings, logging_options)

    lg.DestroyQSL(sample_library)
    lg.DestroySUT(system_under_test)
def main(argv):
    """Run a SingleStream, PerformanceOnly LoadGen test.

    Args:
        argv: Command-line arguments; not read by this function.
    """
    loadgen = mlperf_loadgen

    test_settings = loadgen.TestSettings()
    test_settings.scenario = loadgen.TestScenario.SingleStream
    test_settings.mode = loadgen.TestMode.PerformanceOnly

    system_under_test = loadgen.ConstructSUT(issue_query, flush_queries, process_latencies)
    # QSL sizes: 1024 * 1024 and 1024, as the first two ConstructQSL arguments.
    sample_library = loadgen.ConstructQSL(
        1024 * 1024, 1024, load_samples_to_ram, unload_samples_from_ram)

    loadgen.StartTest(system_under_test, sample_library, test_settings)

    loadgen.DestroyQSL(sample_library)
    loadgen.DestroySUT(system_under_test)
def main(argv):
    """Run an Offline, PerformanceOnly LoadGen test at an expected 1000 QPS.

    Args:
        argv: Command-line arguments; not read by this function.
    """
    loadgen = mlperf_loadgen

    test_settings = loadgen.TestSettings()
    test_settings.scenario = loadgen.TestScenario.Offline
    test_settings.mode = loadgen.TestMode.PerformanceOnly
    test_settings.offline_expected_qps = 1000

    system_under_test = loadgen.ConstructSUT(issue_query, flush_queries, process_latencies)
    sample_library = loadgen.ConstructQSL(
        1024, 128, load_samples_to_ram, unload_samples_from_ram)

    loadgen.StartTest(system_under_test, sample_library, test_settings)

    loadgen.DestroyQSL(sample_library)
    loadgen.DestroySUT(system_under_test)
def main(argv):
    """Run a MultiStream LoadGen test in SubmissionRun mode.

    Args:
        argv: Command-line arguments; not read by this function.
    """
    loadgen = mlperf_loadgen

    test_settings = loadgen.TestSettings()
    test_settings.scenario = loadgen.TestScenario.MultiStream
    test_settings.mode = loadgen.TestMode.SubmissionRun
    test_settings.samples_per_query = 4
    test_settings.target_qps = 1000
    test_settings.target_latency_ns = 1000000000

    # This variant constructs the SUT from the issue callback alone.
    system_under_test = loadgen.ConstructSUT(issue_query)
    sample_library = loadgen.ConstructQSL(
        1024, 128, load_samples_to_ram, unload_samples_from_ram)

    loadgen.StartTest(system_under_test, sample_library, test_settings)

    loadgen.DestroyQSL(sample_library)
    loadgen.DestroySUT(system_under_test)
def __init__(self, preprocessed_data_dir, perf_count):
    """Load the preprocessed-file index and construct the LoadGen QSL.

    Args:
        preprocessed_data_dir: Directory containing
            ``preprocessed_files.pkl``.
        perf_count: Performance sample count passed to LoadGen; ``None``
            means "use the number of preprocessed files".
    """
    print("Constructing QSL...")
    self.preprocessed_data_dir = preprocessed_data_dir

    index_path = os.path.join(self.preprocessed_data_dir, "preprocessed_files.pkl")
    with open(index_path, "rb") as index_file:
        # The unpickled object itself is used as the file list.
        self.preprocess_files = pickle.load(index_file)

    self.count = len(self.preprocess_files)
    if perf_count is None:
        self.perf_count = self.count
    else:
        self.perf_count = perf_count
    print("Found {:d} preprocessed files".format(self.count))
    print("Using performance count = {:d}".format(self.perf_count))

    self.loaded_files = {}
    self.qsl = lg.ConstructQSL(
        self.count,
        self.perf_count,
        self.load_query_samples,
        self.unload_query_samples)
    print("Finished constructing QSL.")
def main(argv):
    """Run a SingleStream LoadGen test in AccuracyOnly mode.

    Args:
        argv: Command-line arguments; not read by this function.
    """
    loadgen = mlperf_loadgen

    test_settings = loadgen.TestSettings()
    test_settings.scenario = loadgen.TestScenario.SingleStream
    test_settings.mode = loadgen.TestMode.AccuracyOnly
    test_settings.single_stream_expected_latency_ns = 1000000
    test_settings.min_query_count = 100
    test_settings.min_duration_ms = 10000

    system_under_test = loadgen.ConstructSUT(issue_query, flush_queries, process_latencies)
    sample_library = loadgen.ConstructQSL(
        1024, 128, load_samples_to_ram, unload_samples_from_ram)

    loadgen.StartTest(system_under_test, sample_library, test_settings)

    loadgen.DestroyQSL(sample_library)
    loadgen.DestroySUT(system_under_test)
def main(argv):
    """Run a SingleStream, PerformanceOnly LoadGen test with spec overrides.

    Args:
        argv: Command-line arguments; not read by this function.
    """
    loadgen = mlperf_loadgen

    test_settings = loadgen.TestSettings()
    test_settings.scenario = loadgen.TestScenario.SingleStream
    test_settings.mode = loadgen.TestMode.PerformanceOnly
    test_settings.single_stream_expected_latency_ns = 1000000
    test_settings.enable_spec_overrides = True
    test_settings.override_target_latency_ns = 100000000
    test_settings.override_min_query_count = 100
    test_settings.override_min_duration_ms = 10000

    # This variant passes only the issue and latency callbacks.
    system_under_test = loadgen.ConstructSUT(issue_query, process_latencies)
    sample_library = loadgen.ConstructQSL(
        1024, 128, load_samples_to_ram, unload_samples_from_ram)

    loadgen.StartTest(system_under_test, sample_library, test_settings)

    loadgen.DestroyQSL(sample_library)
    loadgen.DestroySUT(system_under_test)
def main(argv):
    """Run a Server, PerformanceOnly LoadGen test.

    Args:
        argv: Command-line arguments; not read by this function.
    """
    loadgen = mlperf_loadgen

    test_settings = loadgen.TestSettings()
    test_settings.scenario = loadgen.TestScenario.Server
    test_settings.mode = loadgen.TestMode.PerformanceOnly
    test_settings.server_target_qps = 100
    test_settings.server_target_latency_ns = 100000000
    test_settings.min_query_count = 100
    test_settings.min_duration_ms = 10000

    system_under_test = loadgen.ConstructSUT(issue_query, flush_queries, process_latencies)
    sample_library = loadgen.ConstructQSL(
        1024, 128, load_samples_to_ram, unload_samples_from_ram)

    loadgen.StartTest(system_under_test, sample_library, test_settings)

    loadgen.DestroyQSL(sample_library)
    loadgen.DestroySUT(system_under_test)
def __init__(self, perf_count=None, cache_path='eval_features.pickle'):
    """Obtain the evaluation features (cached or freshly built) and
    construct the LoadGen QSL.

    Args:
        perf_count: Performance sample count passed to LoadGen; ``None``
            means "use the number of features".
        cache_path: Pickle file holding the features. It is read when it
            exists; otherwise the features are converted from the SQuAD
            examples and written to this path.
    """
    print("Constructing QSL...")
    features = []

    if not os.path.exists(cache_path):
        # No cache yet: build the features from the examples, then save them.
        print("No cached features at '%s'... converting from examples..." % cache_path)

        print("Creating tokenizer...")
        vocab_tokenizer = BertTokenizer(
            "build/data/bert_tf_v1_1_large_fp32_384_v2/vocab.txt")

        print("Reading examples...")
        squad_examples = read_squad_examples(
            input_file="build/data/dev-v1.1.json",
            is_training=False,
            version_2_with_negative=False)

        print("Converting examples to features...")
        convert_examples_to_features(
            examples=squad_examples,
            tokenizer=vocab_tokenizer,
            max_seq_length=max_seq_length,
            doc_stride=doc_stride,
            max_query_length=max_query_length,
            is_training=False,
            output_fn=features.append,
            verbose_logging=False)

        print("Caching features at '%s'..." % cache_path)
        with open(cache_path, 'wb') as cache_file:
            pickle.dump(features, cache_file)
    else:
        print("Loading cached features from '%s'..." % cache_path)
        with open(cache_path, 'rb') as cache_file:
            features = pickle.load(cache_file)

    self.eval_features = features
    self.count = len(self.eval_features)
    if perf_count is None:
        perf_count = self.count
    self.perf_count = perf_count
    self.qsl = lg.ConstructQSL(
        self.count,
        self.perf_count,
        self.load_query_samples,
        self.unload_query_samples)
    print("Finished constructing QSL.")
def benchmark_using_loadgen():
    """Perform the benchmark using the Python API of the LoadGen library.

    Initializes the predictor, configures LoadGen from the module-level
    ``LOADGEN_*`` settings and runs the test. The QSL, the SUT and the
    CUDA context returned by ``initialize_predictor`` are released in
    ``finally`` blocks, so they are cleaned up even when configuration or
    the run itself raises.

    Raises:
        KeyError: If ``LOADGEN_SCENARIO`` or ``LOADGEN_MODE`` is unknown.
    """
    global num_classes
    global model_output_volume

    # Only the context and the output volume are used here; the other
    # returned values are ignored.
    (pycuda_context, _max_batch_size, _input_volume,
     model_output_volume, _num_layers) = initialize_predictor()

    try:
        num_classes = len(class_labels)

        scenario = {
            'SingleStream': lg.TestScenario.SingleStream,
            'MultiStream': lg.TestScenario.MultiStream,
            'Server': lg.TestScenario.Server,
            'Offline': lg.TestScenario.Offline,
        }[LOADGEN_SCENARIO]

        mode = {
            'AccuracyOnly': lg.TestMode.AccuracyOnly,
            'PerformanceOnly': lg.TestMode.PerformanceOnly,
            'SubmissionRun': lg.TestMode.SubmissionRun,
        }[LOADGEN_MODE]

        ts = lg.TestSettings()
        ts.FromConfig(MLPERF_CONF_PATH, MODEL_NAME, LOADGEN_SCENARIO)
        ts.FromConfig(USER_CONF_PATH, MODEL_NAME, LOADGEN_SCENARIO)
        ts.scenario = scenario
        ts.mode = mode

        if LOADGEN_MULTISTREAMNESS:
            ts.multi_stream_samples_per_query = int(LOADGEN_MULTISTREAMNESS)

        if LOADGEN_COUNT_OVERRIDE:
            # Pin the query count by setting min and max to the same value.
            ts.min_query_count = int(LOADGEN_COUNT_OVERRIDE)
            ts.max_query_count = int(LOADGEN_COUNT_OVERRIDE)

        sut = lg.ConstructSUT(issue_queries, flush_queries, process_latencies)
        try:
            qsl = lg.ConstructQSL(LOADGEN_DATASET_SIZE, LOADGEN_BUFFER_SIZE,
                                  load_query_samples, unload_query_samples)
            try:
                log_settings = lg.LogSettings()
                log_settings.enable_trace = False
                lg.StartTestWithLogSettings(sut, qsl, ts, log_settings)
            finally:
                lg.DestroyQSL(qsl)
        finally:
            lg.DestroySUT(sut)
    finally:
        # Same release order as before: QSL, SUT, then the CUDA context.
        pycuda_context.pop()
def __init__(self, dataset_dir, manifest_filepath, labels, sample_rate=16000, perf_count=None):
    """Load the manifest and construct the LoadGen QSL over it.

    Args:
        dataset_dir: Dataset directory handed to ``Manifest``.
        manifest_filepath: Single manifest file; wrapped in a list for
            ``Manifest``.
        labels: Label collection; both it and its length are passed to
            ``Manifest``.
        sample_rate: Stored on the instance as ``self.sample_rate``.
        perf_count: Performance sample count passed to LoadGen; ``None``
            means "use the number of manifest entries".
    """
    self.manifest = Manifest(
        dataset_dir,
        [manifest_filepath],
        labels,
        len(labels),
        normalize=True,
        max_duration=15.0)
    self.sample_rate = sample_rate
    self.count = len(self.manifest)

    if perf_count is None:
        perf_count = self.count

    self.sample_id_to_sample = {}
    self.qsl = lg.ConstructQSL(
        self.count,
        perf_count,
        self.load_query_samples,
        self.unload_query_samples)

    # Durations are divided by 3600 before being reported as hours.
    loaded_hours = self.manifest.duration / 3600
    filtered_hours = self.manifest.filtered_duration / 3600
    summary = "Dataset loaded with {0:.2f} hours. Filtered {1:.2f} hours. Number of samples: {2}".format(
        loaded_hours, filtered_hours, self.count)
    print(summary)
def main(argv):
    """Run a MultiStreamFree, PerformanceOnly LoadGen test.

    Args:
        argv: Command-line arguments; discarded immediately.
    """
    del argv
    loadgen = mlperf_loadgen

    test_settings = loadgen.TestSettings()
    test_settings.scenario = loadgen.TestScenario.MultiStreamFree
    test_settings.mode = loadgen.TestMode.PerformanceOnly
    test_settings.multi_stream_target_latency_ns = 100000000
    test_settings.multi_stream_samples_per_query = 4
    test_settings.multi_stream_max_async_queries = 2
    test_settings.min_query_count = 100
    test_settings.min_duration_ms = 10000

    system_under_test = loadgen.ConstructSUT(issue_query, flush_queries, process_latencies)
    sample_library = loadgen.ConstructQSL(
        1024, 128, load_samples_to_ram, unload_samples_from_ram)

    loadgen.StartTest(system_under_test, sample_library, test_settings)

    loadgen.DestroyQSL(sample_library)
    loadgen.DestroySUT(system_under_test)
def __init__(self, session, ds, optimization_config, onnx_output_names):
    """Set up a Server-scenario LoadGen run and start the worker threads.

    Args:
        session: Stored as ``self.session``; not otherwise used here.
        ds: Dataset object providing ``load_query_samples`` and
            ``unload_query_samples`` for the QSL.
        optimization_config: Supplies ``threads_num``,
            ``dynamic_batching_size``, ``result_path``, ``max_latency_ms``,
            ``max_latency_percentile`` and ``min_duration_sec``.
        onnx_output_names: Stored as ``self.onnx_output_names``.
    """
    self.session = session
    self.threads = optimization_config.threads_num
    self.max_batchsize = optimization_config.dynamic_batching_size
    self.ds = ds
    self.onnx_output_names = onnx_output_names
    self.guess = None
    # Condition variable handed to every worker thread below.
    self.cv = threading.Condition()
    self.done = False
    self.q_idx = []
    self.q_query_id = []
    self.workers = []

    self.settings = lg.TestSettings()
    self.settings.scenario = lg.TestScenario.Server
    self.settings.mode = lg.TestMode.FindPeakPerformance

    # LoadGen logs go to the configured result directory, without echoing
    # the summary to stdout.
    log_output_settings = lg.LogOutputSettings()
    log_output_settings.outdir = optimization_config.result_path
    log_output_settings.copy_summary_to_stdout = False
    self.log_settings = lg.LogSettings()
    self.log_settings.enable_trace = False
    self.log_settings.log_output = log_output_settings

    self.sut = lg.ConstructSUT(self.issue_queries, self.flush_queries, self.process_latencies)
    self.qsl = lg.ConstructQSL(QUERY_COUNT, QUERY_COUNT, ds.load_query_samples, ds.unload_query_samples)

    self.settings.server_coalesce_queries = True
    # presumably NANO_SEC / MILLI_SEC is the ms -> ns factor; confirm
    # against the constants' definitions.
    self.settings.server_target_latency_ns = int(optimization_config.max_latency_ms * NANO_SEC / MILLI_SEC)
    self.settings.server_target_latency_percentile = optimization_config.max_latency_percentile
    self.settings.min_duration_ms = optimization_config.min_duration_sec * MILLI_SEC

    # start all threads
    for _ in range(self.threads):
        worker = threading.Thread(target=self.handle_tasks, args=(self.cv,))
        worker.daemon = True
        self.workers.append(worker)
        worker.start()
        # NOTE(review): the one-second pause is placed inside the loop
        # (once per worker); confirm it was not meant to run once after
        # all workers have started.
        time.sleep(1)
def start(self):
    """Starts the load test.

    LoadGen logs are written to a freshly created temporary directory and
    both the detail and summary logs are also copied to stdout.
    """
    test_settings = self.get_test_settings()

    log_cfg = lg.LogSettings()
    log_cfg.log_output.outdir = tempfile.mkdtemp()
    log_cfg.log_output.copy_detail_to_stdout = True
    log_cfg.log_output.copy_summary_to_stdout = True
    log_cfg.enable_trace = False

    logging.info("Constructing SUT.")
    system_under_test = lg.ConstructSUT(
        self.issue_queries,
        self.flush_queries,
        self.process_metrics)

    logging.info("Constructing QSL.")
    sample_library = lg.ConstructQSL(
        self.total_sample_count,
        self.performance_sample_count,
        self.load_samples,
        self.unload_samples)

    logging.info("Starting test.")
    lg.StartTestWithLogSettings(system_under_test, sample_library,
                                test_settings, log_cfg)

    lg.DestroyQSL(sample_library)
    lg.DestroySUT(system_under_test)
def benchmark_using_loadgen():
    """Perform the benchmark using the Python API of the LoadGen library.

    Initializes the predictor, configures LoadGen from the module-level
    ``LOADGEN_*`` settings and runs the test. The QSL, the SUT and the
    module-level ``pycuda_context`` are released in ``finally`` blocks, so
    they are cleaned up even when configuration or the run itself raises.

    Raises:
        KeyError: If ``LOADGEN_SCENARIO`` or ``LOADGEN_MODE`` is unknown.
    """
    global pycuda_context

    initialize_predictor()

    try:
        scenario = {
            'SingleStream': lg.TestScenario.SingleStream,
            'MultiStream': lg.TestScenario.MultiStream,
            'Server': lg.TestScenario.Server,
            'Offline': lg.TestScenario.Offline,
        }[LOADGEN_SCENARIO]

        mode = {
            'AccuracyOnly': lg.TestMode.AccuracyOnly,
            'PerformanceOnly': lg.TestMode.PerformanceOnly,
            'SubmissionRun': lg.TestMode.SubmissionRun,
        }[LOADGEN_MODE]

        ts = lg.TestSettings()
        if LOADGEN_CONF_FILE:
            ts.FromConfig(LOADGEN_CONF_FILE, 'random_model_name', LOADGEN_SCENARIO)
        ts.scenario = scenario
        ts.mode = mode

        if LOADGEN_MULTISTREAMNESS:
            ts.multi_stream_samples_per_query = int(LOADGEN_MULTISTREAMNESS)

        sut = lg.ConstructSUT(issue_queries, flush_queries, process_latencies)
        try:
            qsl = lg.ConstructQSL(LOADGEN_DATASET_SIZE, LOADGEN_BUFFER_SIZE,
                                  load_query_samples, unload_query_samples)
            try:
                log_settings = lg.LogSettings()
                log_settings.enable_trace = False
                lg.StartTestWithLogSettings(sut, qsl, ts, log_settings)
            finally:
                lg.DestroyQSL(qsl)
        finally:
            lg.DestroySUT(sut)
    finally:
        # Same release order as before: QSL, SUT, then the CUDA context.
        pycuda_context.pop()
def main():
    """Run every requested LoadGen scenario against the selected backend.

    Loads the dataset and model named on the command line, warms the
    backend up, then runs one LoadGen test per scenario in
    ``args.scenario`` (for the Server scenario, one test per entry of
    ``args.max_latency``). Results are accumulated with ``add_results``
    and, when ``args.output`` is set, written to that path as JSON.
    """
    global last_timeing
    args = get_args()

    log.info(args)

    # find backend
    backend = get_backend(args.backend)

    # override image format if given
    image_format = args.data_format if args.data_format else backend.image_format(
    )

    # --count applies to accuracy mode only and can be used to limit the number of images
    # for testing. For perf model we always limit count to 200.
    count = args.count
    if not count:
        if not args.accuracy:
            count = 200

    # dataset to use
    wanted_dataset, pre_proc, post_proc, kwargs = SUPPORTED_DATASETS[
        args.dataset]
    ds = wanted_dataset(data_path=args.dataset_path,
                        image_list=args.dataset_list,
                        name=args.dataset,
                        image_format=image_format,
                        pre_process=pre_proc,
                        use_cache=args.cache,
                        count=count,
                        **kwargs)

    # load model to backend
    model = backend.load(args.model, inputs=args.inputs, outputs=args.outputs)

    final_results = {
        "runtime": model.name(),
        "version": model.version(),
        "time": int(time.time()),
        "cmdline": str(args),
    }

    #
    # make one pass over the dataset to validate accuracy
    #
    # From here on, count is the number of items the dataset reports.
    count = ds.get_item_count()

    # warmup: run sample 0 through the backend five times
    ds.load_query_samples([0])
    for _ in range(5):
        img, _ = ds.get_samples([0])
        _ = backend.predict({backend.inputs[0]: img})
    ds.unload_query_samples(None)

    for scenario in args.scenario:
        # SingleStream uses RunnerBase; every other scenario uses QueueRunner.
        runner_map = {
            lg.TestScenario.SingleStream: RunnerBase,
            lg.TestScenario.MultiStream: QueueRunner,
            lg.TestScenario.Server: QueueRunner,
            lg.TestScenario.Offline: QueueRunner
        }
        runner = runner_map[scenario](model,
                                      ds,
                                      args.threads,
                                      post_proc=post_proc,
                                      max_batchsize=args.max_batchsize)

        def issue_queries(query_samples):
            # LoadGen callback: hand the samples to the runner.
            runner.enqueue(query_samples)

        def flush_queries():
            pass

        def process_latencies(latencies_ns):
            # called by loadgen to show us the recorded latencies
            global last_timeing
            last_timeing = [t / NANO_SEC for t in latencies_ns]

        settings = lg.TestSettings()
        settings.scenario = scenario
        settings.mode = lg.TestMode.PerformanceOnly
        if args.accuracy:
            settings.mode = lg.TestMode.AccuracyOnly

        if args.time:
            # override the time we want to run
            settings.min_duration_ms = args.time * MILLI_SEC
            settings.max_duration_ms = args.time * MILLI_SEC

        if args.qps:
            qps = float(args.qps)
            settings.server_target_qps = qps
            settings.offline_expected_qps = qps

        # Per-scenario query counts; min and max are set to the same value.
        if scenario == lg.TestScenario.SingleStream:
            settings.min_query_count = args.queries_single
            settings.max_query_count = args.queries_single
        elif scenario == lg.TestScenario.MultiStream:
            settings.min_query_count = args.queries_multi
            settings.max_query_count = args.queries_multi
            settings.multi_stream_samples_per_query = 4
        elif scenario == lg.TestScenario.Server:
            # max_latency is only bound (and only read) for the Server scenario.
            max_latency = args.max_latency
        elif scenario == lg.TestScenario.Offline:
            settings.min_query_count = args.queries_offline
            settings.max_query_count = args.queries_offline

        sut = lg.ConstructSUT(issue_queries, flush_queries, process_latencies)
        qsl = lg.ConstructQSL(count, min(count, 1000), ds.load_query_samples,
                              ds.unload_query_samples)

        if scenario == lg.TestScenario.Server:
            # One LoadGen run per target latency, reusing the same SUT/QSL.
            for target_latency in max_latency:
                log.info("starting {}, latency={}".format(
                    scenario, target_latency))
                settings.server_target_latency_ns = int(target_latency *
                                                        NANO_SEC)

                result_dict = {
                    "good": 0,
                    "total": 0,
                    "scenario": str(scenario)
                }
                runner.start_run(result_dict, args.accuracy)
                lg.StartTest(sut, qsl, settings)

                # Fall back to the runner's own timings when the global
                # latency list is empty.
                if not last_timeing:
                    last_timeing = runner.result_timing
                if args.accuracy:
                    post_proc.finalize(result_dict,
                                       ds,
                                       output_dir=os.path.dirname(args.output))
                add_results(final_results,
                            "{}-{}".format(scenario, target_latency),
                            result_dict, last_timeing,
                            time.time() - ds.last_loaded, args.accuracy)
        else:
            log.info("starting {}".format(scenario))

            result_dict = {"good": 0, "total": 0, "scenario": str(scenario)}
            runner.start_run(result_dict, args.accuracy)
            lg.StartTest(sut, qsl, settings)

            # Fall back to the runner's own timings when the global
            # latency list is empty.
            if not last_timeing:
                last_timeing = runner.result_timing
            if args.accuracy:
                post_proc.finalize(result_dict,
                                   ds,
                                   output_dir=os.path.dirname(args.output))
            add_results(final_results, "{}".format(scenario), result_dict,
                        last_timeing,
                        time.time() - ds.last_loaded,
                        args.accuracy)

        runner.finish()
        lg.DestroyQSL(qsl)
        lg.DestroySUT(sut)

    #
    # write final results
    #
    if args.output:
        with open(args.output, "w") as f:
            json.dump(final_results, f, sort_keys=True, indent=4)
def main():
    """Run one LoadGen scenario, optionally searching for a Server QPS.

    Loads the dataset and model named on the command line, warms the
    backend up and runs the scenario selected by ``args.scenario``. After
    every run the accumulated results are written to ``results.json`` when
    ``args.output`` is set (the working directory has been changed to that
    output directory by then).

    For the 'Server' scenario, unless ``args.auto_qps`` is ``False``, the
    test is repeated with different target QPS values: the first run
    derives a [0.5x, 1.5x] bracket from the measured throughput, and later
    runs bisect or widen that bracket depending on whether the measured
    latency percentile is below or above ``server_target_latency_ns``.

    The process exits with status 1 when the config file does not exist.
    """
    global last_timeing
    args = get_args()

    log.info(args)

    # find backend
    backend = get_backend(args.backend)

    # override image format if given
    image_format = args.data_format if args.data_format else backend.image_format()

    # --count limits the number of dataset items; when given it is also
    # used as settings.min_query_count further down.
    count_override = False
    count = args.count
    if count:
        count_override = True

    # dataset to use
    wanted_dataset, pre_proc, post_proc, kwargs = SUPPORTED_DATASETS[
        args.dataset]
    ds = wanted_dataset(data_path=args.dataset_path,
                        image_list=args.dataset_list,
                        name=args.dataset,
                        image_format=image_format,
                        pre_process=pre_proc,
                        use_cache=args.cache,
                        count=count,
                        **kwargs)

    # load model to backend
    model = backend.load(args.model, inputs=args.inputs, outputs=args.outputs)

    final_results = {
        "runtime": model.name(),
        "version": model.version(),
        "time": int(time.time()),
        "cmdline": str(args),
    }

    # Resolve the config path before any chdir below.
    config = os.path.abspath(args.config)
    if not os.path.exists(config):
        log.error("{} not found".format(config))
        sys.exit(1)

    if args.output:
        output_dir = os.path.abspath(args.output)
        os.makedirs(output_dir, exist_ok=True)
        os.chdir(output_dir)

    #
    # make one pass over the dataset to validate accuracy
    #
    # From here on, count is the number of items the dataset reports.
    count = ds.get_item_count()

    # warmup: run sample 0 through the backend five times
    ds.load_query_samples([0])
    for _ in range(5):
        img, _ = ds.get_samples([0])
        _ = backend.predict({backend.inputs[0]: img})
    ds.unload_query_samples(None)

    scenario = SCENARIO_MAP[args.scenario]
    # SingleStream uses RunnerBase; every other scenario uses QueueRunner.
    runner_map = {
        lg.TestScenario.SingleStream: RunnerBase,
        lg.TestScenario.MultiStream: QueueRunner,
        lg.TestScenario.Server: QueueRunner,
        lg.TestScenario.Offline: QueueRunner
    }
    runner = runner_map[scenario](model,
                                  ds,
                                  args.threads,
                                  post_proc=post_proc,
                                  max_batchsize=args.max_batchsize)

    def issue_queries(query_samples):
        # LoadGen callback: hand the samples to the runner.
        runner.enqueue(query_samples)

    def flush_queries():
        pass

    def process_latencies(latencies_ns):
        # called by loadgen to show us the recorded latencies
        global last_timeing
        last_timeing = [t / NANO_SEC for t in latencies_ns]

    settings = lg.TestSettings()
    settings.FromConfig(config, args.model_name, args.scenario)
    settings.scenario = scenario
    settings.mode = lg.TestMode.PerformanceOnly
    if args.accuracy:
        settings.mode = lg.TestMode.AccuracyOnly
    if args.find_peak_performance:
        settings.mode = lg.TestMode.FindPeakPerformance

    if args.time:
        # override the time we want to run
        settings.min_duration_ms = args.time * MILLI_SEC
        #settings.max_duration_ms = args.time * MILLI_SEC

    if count_override:
        settings.min_query_count = count
        # settings.max_query_count = count

    if args.min_query_count:
        settings.min_query_count = args.min_query_count

    if args.samples_per_query:
        settings.multi_stream_samples_per_query = args.samples_per_query
    if args.max_latency:
        settings.single_stream_expected_latency_ns = int(args.max_latency * NANO_SEC)
        settings.server_target_latency_ns = int(args.max_latency * NANO_SEC)
        settings.multi_stream_target_latency_ns = int(args.max_latency * NANO_SEC)

    def set_qps(current_qps):
        # Apply one QPS value to every scenario's QPS setting and return it.
        settings.server_target_qps = current_qps
        settings.offline_expected_qps = current_qps
        settings.multi_stream_target_qps = current_qps
        return current_qps

    # qps must be bound even when --qps is absent: it is printed at the
    # top of every iteration below. None means "use the config's QPS".
    qps = None
    if args.qps:
        qps = set_qps(args.qps)

    # -1 marks "bracket not established yet".
    lower_qps = -1
    upper_qps = -1
    # QPS values at which the latency target was met (used as a set).
    qps_passed = {}

    while True:
        print("schedual qps:", qps)

        sut = lg.ConstructSUT(issue_queries, flush_queries, process_latencies)
        qsl = lg.ConstructQSL(count, min(count, 500), ds.load_query_samples,
                              ds.unload_query_samples)

        log.info("starting {}".format(scenario))
        result_dict = {"good": 0, "total": 0, "scenario": str(scenario)}
        runner.start_run(result_dict, args.accuracy)
        print("max query count:", settings.max_query_count)
        lg.StartTest(sut, qsl, settings)

        # Fall back to the runner's own timings when the global latency
        # list is empty.
        if not last_timeing:
            last_timeing = runner.result_timing
        if args.accuracy:
            post_proc.finalize(result_dict, ds, output_dir=args.output)
        took = time.time() - ds.last_loaded
        add_results(final_results, "{}".format(scenario), result_dict,
                    last_timeing, took, args.accuracy)

        lg.DestroyQSL(qsl)
        lg.DestroySUT(sut)

        #
        # write final results
        #
        if args.output:
            with open("results.json", "w") as f:
                json.dump(final_results, f, sort_keys=True, indent=4)

        # The QPS search only applies to the Server scenario with auto_qps.
        if args.scenario != 'Server':
            break
        if args.auto_qps is False:
            break

        if lower_qps == -1 or upper_qps == -1:
            # First pass: bracket the measured throughput and retry at the
            # lower bound.
            base_qps = len(last_timeing) / took
            upper_qps = base_qps * 1.5
            lower_qps = base_qps * 0.5
            qps = set_qps(lower_qps)
            continue

        # last_timeing is in seconds; convert the percentile back to ns.
        latency_percentile_ns = np.percentile(
            last_timeing,
            settings.server_target_latency_percentile * 100) * NANO_SEC

        if latency_percentile_ns < settings.server_target_latency_ns and qps > upper_qps * 0.98:
            # Target met within 2% of the upper bound: accept this QPS.
            print("target qps:", qps)
            break
        if upper_qps - lower_qps < 1 and lower_qps in qps_passed:
            # Bracket narrower than 1 QPS and the lower bound is known good.
            print("target qps:", lower_qps)
            break

        if latency_percentile_ns > settings.server_target_latency_ns:
            #reduce qps
            print("reduce qps, bound:[%d, %d]" % (lower_qps, upper_qps))
            upper_qps = qps
            if qps == lower_qps:
                # Even the lower bound failed: halve it and retry there.
                lower_qps = lower_qps * 0.5
                qps = set_qps(lower_qps)
                continue
            if qps > lower_qps:
                qps = set_qps((lower_qps + upper_qps) / 2)
                continue

        if latency_percentile_ns < settings.server_target_latency_ns:
            #increase qps
            qps_passed[qps] = None
            print("increase qps, bound:[%d, %d]" % (lower_qps, upper_qps))
            lower_qps = qps
            if qps == upper_qps:
                # The upper bound passed: widen it and retry there.
                upper_qps = upper_qps * 1.5
                qps = set_qps(upper_qps)
                continue
            if qps < upper_qps:
                qps = set_qps((lower_qps + upper_qps) / 2)
                continue

    runner.finish()
def main():
    """Run a Server or Offline LoadGen test across consumer subprocesses.

    Starts ``num_sockets`` ``Consumer`` processes and a response thread,
    configures LoadGen from the two config files and the command-line
    overrides, runs the test, records the results with ``add_results``,
    shuts the consumers down and, when ``args.output`` is set, writes
    ``results.json`` into that output directory.

    The process exits with status 1 when either config file is missing.
    """
    global num_sockets
    global start_time
    global item_total
    global last_timeing

    args = get_args()
    log.info(args)

    # Resolve both config paths before the chdir below.
    config = os.path.abspath(args.config)
    user_config = os.path.abspath(args.user_config)

    if not os.path.exists(config):
        log.error("{} not found".format(config))
        sys.exit(1)

    if not os.path.exists(user_config):
        log.error("{} not found".format(user_config))
        sys.exit(1)

    if args.output:
        output_dir = os.path.abspath(args.output)
        os.makedirs(output_dir, exist_ok=True)
        os.chdir(output_dir)

    # Shared state and queues handed to every consumer process.
    lock = multiprocessing.Lock()
    init_counter = multiprocessing.Value("i", 0)
    total_samples = multiprocessing.Value("i", 0)
    dsQueue = multiprocessing.Queue()
    outQueue = multiprocessing.Queue()
    inQueue = multiprocessing.JoinableQueue(num_sockets * 4)
    consumers = [
        Consumer(inQueue, outQueue, dsQueue, lock, init_counter,
                 total_samples, i, args) for i in range(num_sockets)
    ]
    for c in consumers:
        c.start()

    # Wait until subprocess ready
    while init_counter.value < num_sockets:
        time.sleep(2)

    # Start response thread
    response_worker = threading.Thread(target=response_loadgen,
                                       args=(outQueue, args.accuracy))
    response_worker.daemon = True
    response_worker.start()

    scenario = SCENARIO_MAP[args.scenario]
    # Only the Server and Offline scenarios have a runner here.
    runner_map = {
        lg.TestScenario.Server: QueueRunner,
        lg.TestScenario.Offline: QueueRunner
    }

    runner = runner_map[scenario](inQueue, max_batchsize=args.max_batchsize)

    def issue_queries(response_ids, query_sample_indexes):
        # LoadGen callback: forward the query to the runner.
        runner.enqueue(response_ids, query_sample_indexes)

    def flush_queries():
        pass

    def process_latencies(latencies_ns):
        # called by loadgen to show us the recorded latencies
        global last_timeing
        last_timeing = [t / NANO_SEC for t in latencies_ns]

    settings = lg.TestSettings()
    settings.FromConfig(config, args.model, args.scenario)
    settings.FromConfig(user_config, args.model, args.scenario)
    settings.scenario = scenario
    settings.mode = lg.TestMode.PerformanceOnly

    if args.accuracy:
        settings.mode = lg.TestMode.AccuracyOnly
        # NOTE(review): the override is placed under the accuracy branch;
        # confirm it was not meant to apply unconditionally.
        settings.performance_sample_count_override = total_samples.value

    if args.find_peak_performance:
        settings.mode = lg.TestMode.FindPeakPerformance

    # Command-line overrides applied on top of the config files.
    if args.duration:
        settings.min_duration_ms = args.duration
        settings.max_duration_ms = args.duration

    if args.target_qps:
        settings.server_target_qps = float(args.target_qps)
        settings.offline_expected_qps = float(args.target_qps)

    if args.count_queries:
        settings.min_query_count = args.count_queries
        settings.max_query_count = args.count_queries

    if args.samples_per_query_multistream:
        settings.multi_stream_samples_per_query = args.samples_per_query_multistream

    if args.max_latency:
        settings.server_target_latency_ns = int(args.max_latency * NANO_SEC)
        settings.multi_stream_target_latency_ns = int(args.max_latency *
                                                      NANO_SEC)

    def load_query_samples(sample_list):
        # Wait until subprocess ready
        global start_time
        # Put the sample list on the dataset queue once per consumer, then
        # wait for the init counter to reach twice the consumer count.
        for _ in range(num_sockets):
            dsQueue.put(sample_list)
        while init_counter.value < 2 * num_sockets:
            time.sleep(2)
        start_time = time.time()

    def unload_query_samples(sample_list):
        pass

    import torch
    import criteo

    sut = lg.ConstructFastSUT(issue_queries, flush_queries, process_latencies)
    qsl = lg.ConstructQSL(
        total_samples.value,
        min(total_samples.value, args.samples_per_query_offline),
        load_query_samples, unload_query_samples)

    log.info("starting {}".format(scenario))
    result_dict = {
        "good": 0,
        "total": 0,
        "roc_auc": 0,
        "scenario": str(scenario)
    }

    lg.StartTest(sut, qsl, settings)

    # Fall back to the module-level item timings when the global latency
    # list is empty.
    if not last_timeing:
        last_timeing = item_timing

    if args.accuracy:
        result_dict["good"] = item_good
        result_dict["total"] = item_total
        result_dict["roc_auc"] = criteo.auc_score(item_results)

    final_results = {
        "runtime": "pytorch-native-dlrm",
        "version": torch.__version__,
        "time": int(time.time()),
        "cmdline": str(args),
    }

    add_results(final_results, "{}".format(scenario), result_dict,
                last_timeing,
                time.time() - start_time, args.accuracy)

    # Shutdown: wait for the input queue to be joined, then queue one None
    # per consumer, join the consumers and finally queue a None for the
    # response thread.
    inQueue.join()
    for _ in consumers:
        inQueue.put(None)
    for c in consumers:
        c.join()
    outQueue.put(None)

    lg.DestroyQSL(qsl)
    lg.DestroyFastSUT(sut)

    # write final results
    if args.output:
        with open("results.json", "w") as f:
            json.dump(final_results, f, sort_keys=True, indent=4)
def main():
    """Run the BERT benchmark through LoadGen with a pool of Consumer processes.

    Reads the command line via ``get_args()``, configures LoadGen from the
    MLPerf and user config files, starts the consumer processes, runs the
    test, shuts the consumers down and, in accuracy mode, invokes
    ``accuracy-squad.py`` on the generated accuracy log.
    """
    global num_ins
    global num_cpus
    global in_queue_cnt
    global out_queue_cnt
    global batching
    global queries_so_far
    global Latencies

    queries_so_far = 0

    args = get_args()
    log.info(args)

    scenario = args.scenario
    accuracy_mode = args.accuracy
    batch_size = args.batch_size
    num_ins = args.num_instance
    num_cpus = args.num_phy_cpus
    batching = args.batching

    # Read Loadgen and workload config parameters
    settings = lg.TestSettings()
    settings.scenario = scenario_map[scenario]
    settings.FromConfig(args.mlperf_conf, "bert", scenario)
    settings.FromConfig(args.user_conf, "bert", scenario)
    settings.mode = lg.TestMode.AccuracyOnly if accuracy_mode else lg.TestMode.PerformanceOnly

    # Establish communication queues
    lock = multiprocessing.Lock()
    init_counter = multiprocessing.Value("i", 0)
    calibrate_counter = multiprocessing.Value("i", 0)
    out_queue = multiprocessing.Queue()

    # Create consumers
    consumers = []
    if scenario == "Server":
        from parse_server_config import configParser

        # One input queue per bucket; each bucket gets its own set of
        # consumers pinned to a contiguous range of core indices.
        buckets = configParser("machine_conf.json")
        batch_sizes = {}
        in_queue = {j: multiprocessing.JoinableQueue() for j in buckets}
        proc_idx = 0
        num_cpus = 0
        total_ins = 0
        for cutoff in list(buckets.keys()):
            batch_sizes[cutoff] = buckets[cutoff]["batch_size"]
            num_ins = buckets[cutoff]["instances"]
            cpus_per_instance = buckets[cutoff]["cpus_per_instance"]
            num_cpus = num_ins * cpus_per_instance
            total_ins += num_ins

            for j in range(num_ins):
                consumer = Consumer(in_queue[cutoff], out_queue, lock,
                                    init_counter, calibrate_counter, proc_idx,
                                    num_ins, args, cutoff)
                consumer.start_core_idx = proc_idx
                consumer.end_core_idx = proc_idx + cpus_per_instance - 1
                consumers.append(consumer)
                proc_idx = consumer.end_core_idx + 1

        num_ins = total_ins
    else:
        total_ins = num_ins
        in_queue = MultiprocessShapeBasedQueue()
        consumers = [
            Consumer(in_queue, out_queue, lock, init_counter,
                     calibrate_counter, i, num_ins, args)
            for i in range(num_ins)
        ]

    for c in consumers:
        c.start()

    # Dataset object used by constructQSL
    data_set = BERTDataSet(args.vocab, args.perf_count)
    if scenario == "Server":
        issue_queue = InQueueServer(in_queue, batch_sizes, data_set,
                                    settings.min_query_count)
    else:
        issue_queue = InQueue(in_queue, batch_size, data_set)

    # Wait until all sub-processors are ready
    block_until(init_counter, total_ins, 2)

    # Start response thread
    response_worker = threading.Thread(target=response_loadgen,
                                       args=(out_queue, ))
    response_worker.daemon = True
    response_worker.start()

    def issue_queries(query_samples):
        # It's called by loadgen to send query to SUT
        issue_queue.put(query_samples)

    sut = lg.ConstructSUT(issue_queries, flush_queries, process_latencies)
    qsl = lg.ConstructQSL(data_set.count, data_set.perf_count,
                          load_query_samples, unload_query_samples)

    log_path = "build/logs"
    # exist_ok avoids the check-then-create race of exists() + makedirs().
    os.makedirs(log_path, exist_ok=True)
    log_output_settings = lg.LogOutputSettings()
    log_output_settings.outdir = log_path
    log_output_settings.copy_summary_to_stdout = True
    log_settings = lg.LogSettings()
    log_settings.log_output = log_output_settings

    lg.StartTestWithLogSettings(sut, qsl, settings, log_settings)

    # Wait until outQueue done
    while out_queue_cnt < in_queue_cnt:
        time.sleep(0.2)

    if scenario == "Server":
        for i in in_queue:
            in_queue[i].join()
            # One sentinel per consumer process of this bucket, mirroring the
            # non-Server branch below. The count used to be
            # "cpus_per_instance", which does not match the number of
            # consumers created for the bucket.
            # NOTE(review): assumes each Consumer exits after receiving a
            # single None - confirm against Consumer.run.
            for _ in range(buckets[i]["instances"]):
                in_queue[i].put(None)
    else:
        for i in range(num_ins):
            in_queue.put(None)

    for c in consumers:
        c.join()
    out_queue.put(None)

    if accuracy_mode:
        cmd = "python accuracy-squad.py --log_file={}/mlperf_log_accuracy.json".format(
            log_path)
        subprocess.check_call(cmd, shell=True)

    lg.DestroyQSL(qsl)
    lg.DestroySUT(sut)
def main():
    """Run a LoadGen benchmark against the Go-backed system under test.

    Initializes the Go side, builds the LoadGen settings from the MLPerf and
    user config files plus command-line overrides, runs the test and, in
    accuracy mode, launches the dataset-specific accuracy script on the
    produced ``mlperf_log_accuracy.json``.

    Raises:
        RuntimeError: if Go initialization, sample (un)loading or
            finalization reports an error, or if accuracy checking is
            requested for a dataset without an accuracy script.
    """
    global so
    global last_timeing
    global last_loaded
    global result_timeing

    args = get_args()
    log.info(args)

    # find backend
    backend = get_backend(args.backend)

    # --count applies to accuracy mode only and can be used to limit the number of images
    # for testing. For perf model we always limit count to 200.
    count_override = False
    count = args.count
    if count:
        count_override = True

    # Python signature:
    #   go_initialize(backend, model_path, dataset_path, count, use_gpu,
    #                 gpu_id, trace_level, max_batchsize)
    count, err = go_initialize(backend, args.model_path, args.dataset_path,
                               count, args.use_gpu, args.gpu_id,
                               args.trace_level, args.max_batchsize)
    if err != 'nil':
        print(err)
        raise RuntimeError('initialization in go failed')

    mlperf_conf = os.path.abspath(args.mlperf_conf)
    if not os.path.exists(mlperf_conf):
        log.error("{} not found".format(mlperf_conf))
        sys.exit(1)

    user_conf = os.path.abspath(args.user_conf)
    if not os.path.exists(user_conf):
        log.error("{} not found".format(user_conf))
        sys.exit(1)

    # Fall back to the current directory when --log_dir is not given; the
    # log directory is needed below for both the LoadGen output settings and
    # the accuracy file path, so it must never be None.
    if args.log_dir:
        log_dir = os.path.abspath(args.log_dir)
        os.makedirs(log_dir, exist_ok=True)
    else:
        log_dir = os.getcwd()

    scenario = SCENARIO_MAP[args.scenario]

    def issue_queries(query_samples):
        """Run the samples through the Go backend and report the responses."""
        global so
        global last_timeing
        global result_timeing

        idx = np.array([q.index for q in query_samples]).astype(np.int32)
        query_id = [q.id for q in query_samples]
        # Keeps the byte buffers referenced until QuerySamplesComplete has
        # been called with their addresses.
        response_array_refs = []
        response = []
        if args.dataset == 'brats2019':
            # Samples are issued one at a time for this dataset.
            start = time.time()
            for i, qid in enumerate(query_id):
                processed_results = so.IssueQuery(1, idx[i][np.newaxis])
                processed_results = json.loads(
                    processed_results.decode('utf-8'))
                response_array = array.array(
                    "B",
                    np.array(processed_results[0], np.float16).tobytes())
                response_array_refs.append(response_array)
                bi = response_array.buffer_info()
                response.append(lg.QuerySampleResponse(qid, bi[0], bi[1]))
            result_timeing.append(time.time() - start)
            lg.QuerySamplesComplete(response)
        else:
            start = time.time()
            processed_results = so.IssueQuery(len(idx), idx)
            result_timeing.append(time.time() - start)
            processed_results = json.loads(processed_results.decode('utf-8'))
            # "pos" rather than "idx": do not shadow the sample-index array.
            for pos, qid in enumerate(query_id):
                response_array = array.array(
                    "B",
                    np.array(processed_results[pos], np.float32).tobytes())
                response_array_refs.append(response_array)
                bi = response_array.buffer_info()
                response.append(lg.QuerySampleResponse(qid, bi[0], bi[1]))
            lg.QuerySamplesComplete(response)

    def flush_queries():
        pass

    def process_latencies(latencies_ns):
        # called by loadgen to show us the recorded latencies
        global last_timeing
        last_timeing = [t / NANO_SEC for t in latencies_ns]

    def load_query_samples(sample_list):
        global so
        global last_loaded
        err = go_load_query_samples(sample_list, so)
        last_loaded = time.time()
        if err != '':
            print(err)
            raise RuntimeError('load query samples failed')

    def unload_query_samples(sample_list):
        global so
        err = go_unload_query_samples(sample_list, so)
        if err != '':
            print(err)
            raise RuntimeError('unload query samples failed')

    settings = lg.TestSettings()
    if args.model_name != "":
        settings.FromConfig(mlperf_conf, args.model_name, args.scenario)
        settings.FromConfig(user_conf, args.model_name, args.scenario)
    settings.scenario = scenario
    settings.mode = lg.TestMode.PerformanceOnly
    if args.accuracy:
        settings.mode = lg.TestMode.AccuracyOnly
    if args.find_peak_performance:
        settings.mode = lg.TestMode.FindPeakPerformance

    if args.time:
        # override the time we want to run
        settings.min_duration_ms = args.time * MILLI_SEC
        settings.max_duration_ms = args.time * MILLI_SEC

    if args.qps:
        qps = float(args.qps)
        settings.server_target_qps = qps
        settings.offline_expected_qps = qps

    if count_override:
        settings.min_query_count = count
        settings.max_query_count = count

    if args.samples_per_query:
        settings.multi_stream_samples_per_query = args.samples_per_query
    if args.max_latency:
        settings.server_target_latency_ns = int(args.max_latency * NANO_SEC)
        settings.multi_stream_target_latency_ns = int(args.max_latency *
                                                      NANO_SEC)

    sut = lg.ConstructSUT(issue_queries, flush_queries, process_latencies)
    qsl = lg.ConstructQSL(count, min(count, 500), load_query_samples,
                          unload_query_samples)

    log.info("starting {}".format(scenario))

    log_path = os.path.realpath(log_dir)
    log_output_settings = lg.LogOutputSettings()
    log_output_settings.outdir = log_path
    log_output_settings.copy_summary_to_stdout = True
    log_settings = lg.LogSettings()
    log_settings.log_output = log_output_settings

    lg.StartTestWithLogSettings(sut, qsl, settings, log_settings)

    if not last_timeing:
        last_timeing = result_timeing

    if args.accuracy:
        accuracy_script_paths = {
            'coco':
            os.path.realpath(
                '../inference/vision/classification_and_detection/tools/accuracy-coco.py'
            ),
            'imagenet':
            os.path.realpath(
                '../inference/vision/classification_and_detection/tools/accuracy-imagenet.py'
            ),
            'squad':
            os.path.realpath('../inference/language/bert/accuracy-squad.py'),
            'brats2019':
            os.path.realpath(
                '../inference/vision/medical_imaging/3d-unet/accuracy-brats.py'
            ),
        }
        # Check before indexing so an unknown dataset produces the intended
        # RuntimeError rather than a KeyError from the lookup.
        if args.dataset not in accuracy_script_paths:
            raise RuntimeError('Dataset not Implemented.')
        accuracy_script_path = accuracy_script_paths[args.dataset]
        accuracy_file_path = os.path.join(log_dir, 'mlperf_log_accuracy.json')
        data_dir = os.environ['DATA_DIR']

        # Commands are passed as argument lists (no shell) so that paths
        # containing spaces or shell metacharacters are handled correctly.
        if args.dataset == 'coco':
            cmd = [
                'python3', accuracy_script_path,
                '--mlperf-accuracy-file', accuracy_file_path,
                '--coco-dir', data_dir,
            ]
            if args.use_inv_map:
                cmd.append('--use-inv-map')
        elif args.dataset == 'imagenet':
            cmd = [
                'python3', accuracy_script_path,
                '--mlperf-accuracy-file', accuracy_file_path,
                '--imagenet-val-file', os.path.join(data_dir, 'val_map.txt'),
            ]
        elif args.dataset == 'squad':
            cmd = [
                'python3', accuracy_script_path,
                '--vocab_file', os.path.join(data_dir, 'vocab.txt'),
                '--val_data', os.path.join(data_dir, 'dev-v1.1.json'),
                '--log_file', accuracy_file_path,
                '--out_file', os.path.join(log_dir, 'predictions.json'),
                '--features_cache_file',
                os.path.join(data_dir, 'eval_features.pickle'),
                '--max_examples', str(count),
            ]
        else:  # brats2019
            base_dir = os.path.realpath(
                '../inference/vision/medical_imaging/3d-unet/build')
            post_dir = os.path.join(base_dir, 'postprocessed_data')
            label_dir = os.path.join(
                base_dir,
                'raw_data/nnUNet_raw_data/Task043_BraTS2019/labelsTr')
            os.makedirs(post_dir, exist_ok=True)
            cmd = [
                'python3', accuracy_script_path,
                '--log_file', accuracy_file_path,
                '--preprocessed_data_dir', data_dir,
                '--postprocessed_data_dir', post_dir,
                '--label_data_dir', label_dir,
            ]
        subprocess.check_call(cmd)

    lg.DestroyQSL(qsl)
    lg.DestroySUT(sut)

    # Python signature:
    #   go_finalize(so)
    err = go_finalize(so)
    if err != '':
        print(err)
        raise RuntimeError('finialize in go failed')
def main():
    """Benchmark a vision model under LoadGen and write ``results.json``.

    Builds the dataset and backend selected on the command line, warms the
    backend up, runs the LoadGen test for the requested scenario and collects
    the results through ``add_results``. When ``--output`` is given the
    process changes into that directory and the results file is written there.
    """
    global last_timeing

    args = get_args()
    log.info(args)

    # find backend
    backend = get_backend(args.backend)

    # An explicit --data-format wins over the backend's preferred format.
    img_fmt = args.data_format or backend.image_format()

    # --count limits the number of images used; remember whether it was
    # given, because "count" is later replaced by the dataset's item count.
    count = args.count
    count_override = bool(count)

    # dataset to use
    dataset_cls, pre_proc, post_proc, kwargs = SUPPORTED_DATASETS[args.dataset]
    ds = dataset_cls(
        data_path=args.dataset_path,
        image_list=args.dataset_list,
        name=args.dataset,
        image_format=img_fmt,
        pre_process=pre_proc,
        use_cache=args.cache,
        count=count,
        **kwargs)

    # load model to backend
    model = backend.load(args.model, inputs=args.inputs, outputs=args.outputs)
    final_results = {
        "runtime": model.name(),
        "version": model.version(),
        "time": int(time.time()),
        "cmdline": str(args),
    }

    config = os.path.abspath(args.config)
    if not os.path.exists(config):
        log.error("{} not found".format(config))
        sys.exit(1)

    if args.output:
        output_dir = os.path.abspath(args.output)
        os.makedirs(output_dir, exist_ok=True)
        os.chdir(output_dir)

    # From here on "count" is the number of items in the dataset.
    count = ds.get_item_count()

    # warmup: two predictions over the first max_batchsize samples
    warmup_ids = range(args.max_batchsize)
    ds.load_query_samples(warmup_ids)
    for _ in range(2):
        images, _labels = ds.get_samples(warmup_ids)
        backend.predict({backend.inputs[0]: images})
    ds.unload_query_samples(None)

    scenario = SCENARIO_MAP[args.scenario]
    # Only SingleStream uses the plain runner; the rest are queue based.
    runners_by_scenario = dict.fromkeys(
        (lg.TestScenario.MultiStream,
         lg.TestScenario.Server,
         lg.TestScenario.Offline),
        QueueRunner)
    runners_by_scenario[lg.TestScenario.SingleStream] = RunnerBase
    runner_cls = runners_by_scenario[scenario]
    runner = runner_cls(model, ds, args.threads,
                        post_proc=post_proc,
                        max_batchsize=args.max_batchsize)

    def issue_queries(query_samples):
        runner.enqueue(query_samples)

    def flush_queries():
        pass

    def process_latencies(latencies_ns):
        # called by loadgen to show us the recorded latencies
        global last_timeing
        last_timeing = [ns / NANO_SEC for ns in latencies_ns]

    settings = lg.TestSettings()
    settings.FromConfig(config, args.model_name, args.scenario)
    settings.scenario = scenario

    # Later flags take precedence: find-peak over accuracy over performance.
    if args.find_peak_performance:
        mode = lg.TestMode.FindPeakPerformance
    elif args.accuracy:
        mode = lg.TestMode.AccuracyOnly
    else:
        mode = lg.TestMode.PerformanceOnly
    settings.mode = mode

    if args.time:
        # override the time we want to run
        duration_ms = args.time * MILLI_SEC
        settings.min_duration_ms = duration_ms
        settings.max_duration_ms = duration_ms

    if args.qps:
        target_qps = float(args.qps)
        settings.server_target_qps = target_qps
        settings.offline_expected_qps = target_qps

    if count_override:
        settings.min_query_count = count
        settings.max_query_count = count

    if args.samples_per_query:
        settings.multi_stream_samples_per_query = args.samples_per_query

    if args.max_latency:
        latency_ns = int(args.max_latency * NANO_SEC)
        settings.server_target_latency_ns = latency_ns
        settings.multi_stream_target_latency_ns = latency_ns

    # override target latency when it needs to be less than 1ms
    expected_latency_ns = {
        "mobilenet": 200000,
        "resnet50": 900000,
        "ssd-mobilenet": 1000000,
    }
    if args.model_name in expected_latency_ns:
        settings.single_stream_expected_latency_ns = \
            expected_latency_ns[args.model_name]

    sut = lg.ConstructSUT(issue_queries, flush_queries, process_latencies)
    qsl = lg.ConstructQSL(count, min(count, 1024),
                          ds.load_query_samples, ds.unload_query_samples)

    log.info("starting {}".format(scenario))
    result_dict = {"good": 0, "total": 0, "scenario": str(scenario)}
    runner.start_run(result_dict, args.accuracy)

    if not args.enable_trace:
        trace_off_settings = lg.LogSettings()
        trace_off_settings.enable_trace = False
        lg.StartTestWithLogSettings(sut, qsl, settings, trace_off_settings)
    else:
        lg.StartTest(sut, qsl, settings)

    # Fall back to the runner's own timings if loadgen reported none.
    if not last_timeing:
        last_timeing = runner.result_timing
    if args.accuracy:
        post_proc.finalize(result_dict, ds, output_dir=args.output)

    elapsed = time.time() - ds.last_loaded
    add_results(final_results, "{}".format(scenario), result_dict,
                last_timeing, elapsed, args.accuracy)

    runner.finish()
    lg.DestroyQSL(qsl)
    lg.DestroySUT(sut)

    # write final results
    if args.output:
        with open("results.json", "w") as f:
            json.dump(final_results, f, sort_keys=True, indent=4)
def main():
    """Benchmark a recommendation model under LoadGen and write ``results.json``.

    Builds the backend and dataset selected on the command line, warms the
    backend up on sample 0, runs the LoadGen test for the requested scenario
    and collects the results through ``add_results``. When ``--output`` is
    given the process changes into that directory and the results file is
    written there.
    """
    global last_timeing

    args = get_args()
    log.info(args)

    # find backend
    backend = get_backend(args.backend, args.dataset, args.max_ind_range,
                          args.data_sub_sample_rate, args.use_gpu)

    # dataset to use
    dataset_cls, pre_proc, post_proc, kwargs = SUPPORTED_DATASETS[args.dataset]

    # --count-samples can be used to limit the number of samples used for testing
    ds = dataset_cls(
        data_path=args.dataset_path,
        name=args.dataset,
        pre_process=pre_proc,  # currently an identity function
        use_cache=args.cache,  # currently not used
        count=args.count_samples,
        samples_to_aggregate_fix=args.samples_to_aggregate_fix,
        samples_to_aggregate_min=args.samples_to_aggregate_min,
        samples_to_aggregate_max=args.samples_to_aggregate_max,
        samples_to_aggregate_quantile_file=args.samples_to_aggregate_quantile_file,
        samples_to_aggregate_trace_file=args.samples_to_aggregate_trace_file,
        test_num_workers=args.test_num_workers,
        max_ind_range=args.max_ind_range,
        sub_sample_rate=args.data_sub_sample_rate,
        mlperf_bin_loader=args.mlperf_bin_loader,
        **kwargs)

    # load model to backend
    model = backend.load(args.model_path, inputs=args.inputs,
                         outputs=args.outputs)
    final_results = {
        "runtime": model.name(),
        "version": model.version(),
        "time": int(time.time()),
        "cmdline": str(args),
    }

    # Both config files must exist before loadgen is configured.
    mlperf_conf = os.path.abspath(args.mlperf_conf)
    if not os.path.exists(mlperf_conf):
        log.error("{} not found".format(mlperf_conf))
        sys.exit(1)

    user_conf = os.path.abspath(args.user_conf)
    if not os.path.exists(user_conf):
        log.error("{} not found".format(user_conf))
        sys.exit(1)

    if args.output:
        output_dir = os.path.abspath(args.output)
        os.makedirs(output_dir, exist_ok=True)
        os.chdir(output_dir)

    count = ds.get_item_count()

    # warmup: five predictions on sample 0
    warmup_ids = [0]
    ds.load_query_samples(warmup_ids)
    for _ in range(5):
        dense_x, ls_o, ls_i, _unused_a, _unused_b = ds.get_samples(warmup_ids)
        backend.predict(dense_x, ls_o, ls_i)
    ds.unload_query_samples(None)

    scenario = SCENARIO_MAP[args.scenario]
    # Only SingleStream uses the plain runner; the rest are queue based.
    runners_by_scenario = dict.fromkeys(
        (lg.TestScenario.MultiStream,
         lg.TestScenario.Server,
         lg.TestScenario.Offline),
        QueueRunner)
    runners_by_scenario[lg.TestScenario.SingleStream] = RunnerBase
    runner_cls = runners_by_scenario[scenario]
    runner = runner_cls(model, ds, args.threads,
                        post_proc=post_proc,
                        max_batchsize=args.max_batchsize)

    def issue_queries(query_samples):
        runner.enqueue(query_samples)

    def flush_queries():
        pass

    def process_latencies(latencies_ns):
        # called by loadgen to show us the recorded latencies
        global last_timeing
        last_timeing = [ns / NANO_SEC for ns in latencies_ns]

    settings = lg.TestSettings()
    for conf_path in (mlperf_conf, user_conf):
        settings.FromConfig(conf_path, args.model_path, args.scenario)
    settings.scenario = scenario

    # Later flags take precedence: find-peak over accuracy over performance.
    if args.find_peak_performance:
        mode = lg.TestMode.FindPeakPerformance
    elif args.accuracy:
        mode = lg.TestMode.AccuracyOnly
    else:
        mode = lg.TestMode.PerformanceOnly
    settings.mode = mode

    if args.duration:
        settings.min_duration_ms = args.duration
        settings.max_duration_ms = args.duration

    if args.target_qps:
        target_qps = float(args.target_qps)
        settings.server_target_qps = target_qps
        settings.offline_expected_qps = target_qps

    if args.count_queries:
        settings.min_query_count = args.count_queries
        settings.max_query_count = args.count_queries

    if args.samples_per_query_multistream:
        settings.multi_stream_samples_per_query = \
            args.samples_per_query_multistream

    if args.max_latency:
        latency_ns = int(args.max_latency * NANO_SEC)
        settings.server_target_latency_ns = latency_ns
        settings.multi_stream_target_latency_ns = latency_ns

    sut = lg.ConstructSUT(issue_queries, flush_queries, process_latencies)
    qsl = lg.ConstructQSL(count,
                          min(count, args.samples_per_query_offline),
                          ds.load_query_samples,
                          ds.unload_query_samples)

    log.info("starting {}".format(scenario))
    result_dict = {
        "good": 0,
        "total": 0,
        "roc_auc": 0,
        "scenario": str(scenario),
    }
    runner.start_run(result_dict, args.accuracy)

    lg.StartTest(sut, qsl, settings)

    # Fall back to the runner's own timings if loadgen reported none.
    if not last_timeing:
        last_timeing = runner.result_timing
    if args.accuracy:
        post_proc.finalize(result_dict, ds, output_dir=args.output)

    elapsed = time.time() - ds.last_loaded
    add_results(final_results, "{}".format(scenario), result_dict,
                last_timeing, elapsed, args.accuracy)

    runner.finish()
    lg.DestroyQSL(qsl)
    lg.DestroySUT(sut)

    # write final results
    if args.output:
        with open("results.json", "w") as f:
            json.dump(final_results, f, sort_keys=True, indent=4)
def benchmark_using_loadgen():
    """Perform the benchmark using python API for the LoadGen library.

    The test is configured from the module-level ``LOADGEN_*`` settings.
    A funnel thread running ``send_responses`` is started before the
    optional warm-up and the test itself, and is always stopped afterwards,
    even if the warm-up or the test raises. When ``SIDELOAD_JSON`` is set,
    ``openme_data`` is dumped to that file after a successful run.
    """
    global funnel_should_be_running, warmup_mode, openme_data

    scenario = {
        'SingleStream': lg.TestScenario.SingleStream,
        'MultiStream': lg.TestScenario.MultiStream,
        'Server': lg.TestScenario.Server,
        'Offline': lg.TestScenario.Offline,
    }[LOADGEN_SCENARIO]

    mode = {
        'AccuracyOnly': lg.TestMode.AccuracyOnly,
        'PerformanceOnly': lg.TestMode.PerformanceOnly,
        'SubmissionRun': lg.TestMode.SubmissionRun,
    }[LOADGEN_MODE]

    ts = lg.TestSettings()
    if LOADGEN_CONFIG_FILE:
        ts.FromConfig(LOADGEN_CONFIG_FILE, 'random_model_name',
                      LOADGEN_SCENARIO)
    ts.scenario = scenario
    ts.mode = mode

    # Explicit overrides are applied after the config file so they win.
    if LOADGEN_MULTISTREAMNESS:
        ts.multi_stream_samples_per_query = int(LOADGEN_MULTISTREAMNESS)

    if LOADGEN_MAX_DURATION_S:
        ts.max_duration_ms = int(LOADGEN_MAX_DURATION_S)*1000

    if LOADGEN_COUNT_OVERRIDE:
        ts.min_query_count = int(LOADGEN_COUNT_OVERRIDE)
        ts.max_query_count = int(LOADGEN_COUNT_OVERRIDE)

    if LOADGEN_TARGET_QPS:
        target_qps = float(LOADGEN_TARGET_QPS)
        ts.multi_stream_target_qps = target_qps
        ts.server_target_qps = target_qps
        ts.offline_expected_qps = target_qps

    sut = lg.ConstructSUT(issue_queries, flush_queries, process_latencies)
    qsl = lg.ConstructQSL(LOADGEN_DATASET_SIZE, LOADGEN_BUFFER_SIZE,
                          load_query_samples, unload_query_samples)

    log_settings = lg.LogSettings()
    log_settings.enable_trace = False

    funnel_thread = threading.Thread(target=send_responses, args=())
    funnel_should_be_running = True
    funnel_thread.start()

    # try/finally: without it an exception during warm-up or the test would
    # leave funnel_should_be_running set and the funnel thread, the worker
    # channels and the loadgen objects unreleased.
    try:
        if LOADGEN_WARMUP_SAMPLES:
            warmup_id_range = list(range(LOADGEN_WARMUP_SAMPLES))
            load_query_samples(warmup_id_range)

            warmup_mode = True
            print("Sending out the warm-up samples, waiting for responses...")
            # The same number serves as both arguments of each warm-up
            # QuerySample.
            issue_queries([lg.QuerySample(sample_id, sample_id)
                           for sample_id in warmup_id_range])

            while len(in_progress) > 0:  # waiting for the in_progress queue to clear up
                time.sleep(1)
            print(" Done!")

            warmup_mode = False

        lg.StartTestWithLogSettings(sut, qsl, ts, log_settings)
    finally:
        funnel_should_be_running = False  # politely ask the funnel_thread to end
        funnel_thread.join()              # wait for it to actually end

        from_workers.close()
        to_workers.close()

        lg.DestroyQSL(qsl)
        lg.DestroySUT(sut)

    if SIDELOAD_JSON:
        with open(SIDELOAD_JSON, 'w') as sideload_fd:
            json.dump(openme_data, sideload_fd, indent=4, sort_keys=True)
def run():
    """Runs the offline mode.

    Sets up the runner, builds/exports and loads the model on one or several
    TPUs, warms them up, then runs one LoadGen test per
    (scenario, target latency) combination taken from ``FLAGS.scenario`` and
    ``FLAGS.max_latency`` (both comma-separated). The collected results are
    written to ``results.txt`` under ``FLAGS.outdir`` when it is set, or to
    stdout otherwise.
    """
    global last_timing

    # Initialization
    final_results, count, runner = setup()

    #
    # run the benchmark with timing
    #
    runner.start_pool()

    def issue_query_offline(query_samples):
        """Adds query to the queue."""
        idx = np.array([q.index for q in query_samples])
        query_id = np.array([q.id for q in query_samples])
        batch_size = FLAGS.batch_size[0]
        # Hand the samples to the runner in chunks of at most batch_size.
        for start in range(0, len(query_samples), batch_size):
            runner.enqueue(query_id[start:start + batch_size],
                           idx[start:start + batch_size])

    def flush_queries():
        pass

    def process_latencies(latencies_ns):
        global last_timing
        last_timing = [t / 1e9 for t in latencies_ns]

    sut = lg.ConstructSUT(issue_query_offline, flush_queries,
                          process_latencies)

    masters = []

    outdir = FLAGS.outdir if FLAGS.outdir else tempfile.mkdtemp()
    export_outdir = FLAGS.export_outdir if FLAGS.export_outdir else outdir
    export_outdir = os.path.join(export_outdir, "export_model")

    def load_query_samples(sample_list):
        """Load query samples and push the in-memory list to every model."""
        runner.ds.load_query_samples(sample_list)
        if FLAGS.num_tpus == 1:
            runner.model.update_qsl(runner.ds.get_image_list_inmemory())
        else:
            for i in range(FLAGS.num_tpus):
                runner.models[i].update_qsl(
                    runner.ds.get_image_list_inmemory())

    def warmup():
        """Warmup the TPUs."""
        load_query_samples([0])
        if FLAGS.num_tpus == 1:
            log.info("warmup ...")
            runner.warmup(0)
            log.info("warmup done")
        else:
            for cloud_tpu_id in range(FLAGS.num_tpus):
                log.info("warmup %d...", cloud_tpu_id)
                runner.warmup(0, cloud_tpu_id)
                log.info("warmup %d done", cloud_tpu_id)

        # After warmup, give the system a moment to quiesce before putting
        # it under load.
        time.sleep(1)

    if FLAGS.num_tpus == 1:
        tpu_cluster_resolver = tf.contrib.cluster_resolver.TPUClusterResolver(
            FLAGS.tpu_name, zone=FLAGS.tpu_zone, project=FLAGS.gcp_project)
        master = tpu_cluster_resolver.get_master()

        runner.model.build_and_export(
            FLAGS.model,
            export_model_path=export_outdir,
            batch_size=FLAGS.batch_size,
            master=master,
            scenario=FLAGS.scenario)
        runner.model.load(export_model_path=export_outdir, master=master)
    else:
        # Use the first TPU instance to build and export the graph.
        tpu_names = FLAGS.tpu_name
        tpu_names = tpu_names.split(",")
        for tpu_name in tpu_names:
            tpu_cluster_resolver = tf.contrib.cluster_resolver.TPUClusterResolver(
                tpu_name, zone=FLAGS.tpu_zone, project=FLAGS.gcp_project)
            masters.append(tpu_cluster_resolver.get_master())

        runner.models[0].build_and_export(
            FLAGS.model,
            export_model_path=export_outdir,
            batch_size=FLAGS.batch_size,
            master=masters[0],
            scenario=FLAGS.scenario)

        def init_fn(cloud_tpu_id):
            """Load the exported model on one cloud tpu."""
            runner.models[cloud_tpu_id].load(
                export_model_path=export_outdir,
                master=masters[cloud_tpu_id])

        # Load the exported model on all TPUs in parallel.
        threads = []
        for i in range(FLAGS.num_tpus):
            thread = threading.Thread(target=init_fn, args=(i,))
            threads.append(thread)
            thread.start()

        for thread in threads:
            thread.join()

    warmup()

    qsl = lg.ConstructQSL(count, min(count, 1024), load_query_samples,
                          runner.ds.unload_query_samples)

    test_scenarios = FLAGS.scenario
    if test_scenarios is None:
        test_scenarios_list = []
    else:
        test_scenarios_list = test_scenarios.split(",")

    max_latency = FLAGS.max_latency
    max_latency_list = max_latency.split(",")
    for scenario in test_scenarios_list:
        for target_latency in max_latency_list:
            log.info("starting %s, latency=%s", scenario, target_latency)
            settings = lg.TestSettings()
            log.info(scenario)
            if FLAGS.accuracy:
                settings.mode = lg.TestMode.AccuracyOnly
            settings.scenario = utils.SCENARIO_MAP[scenario]

            if FLAGS.qps:
                qps = float(FLAGS.qps)
                settings.server_target_qps = qps
                settings.offline_expected_qps = qps

            if FLAGS.time:
                settings.min_duration_ms = 60 * MILLI_SEC
                settings.max_duration_ms = 0
                qps = FLAGS.qps or 100
                # int() keeps the lower bound an integer like the upper
                # bound below, even when qps or time is not an int.
                settings.min_query_count = int(qps * FLAGS.time)
                settings.max_query_count = int(1.1 * qps * FLAGS.time)
            else:
                settings.min_query_count = (1 << 21)

            # Parentheses spell out the precedence Python already applies
            # ("and" binds tighter than "or").
            if FLAGS.time or (FLAGS.qps and FLAGS.accuracy):
                settings.mode = lg.TestMode.PerformanceOnly
            # FIXME: add SubmissionRun once available

            target_latency_ns = int(
                float(target_latency) * (NANO_SEC / MILLI_SEC))
            settings.single_stream_expected_latency_ns = target_latency_ns
            settings.multi_stream_target_latency_ns = target_latency_ns
            settings.server_target_latency_ns = target_latency_ns

            log_settings = lg.LogSettings()
            # TODO(brianderson): figure out how to use internal file path.
            log_settings.log_output.outdir = tempfile.mkdtemp()
            log_settings.log_output.copy_detail_to_stdout = True
            log_settings.log_output.copy_summary_to_stdout = True
            log_settings.enable_trace = False

            result_dict = {"good": 0, "total": 0, "scenario": str(scenario)}
            runner.start_run(result_dict, FLAGS.accuracy)

            lg.StartTestWithLogSettings(sut, qsl, settings, log_settings)

            if FLAGS.accuracy:
                runner.get_post_process().finalize(result_dict, runner.ds)

            utils.add_results(
                final_results, "{}-{}".format(scenario, target_latency),
                result_dict, last_timing,
                time.time() - runner.ds.last_loaded)

    #
    # write final results
    #
    if FLAGS.outdir:
        outfile = os.path.join(FLAGS.outdir, "results.txt")
        with tf.gfile.Open(outfile, "w") as f:
            json.dump(final_results, f, sort_keys=True, indent=4)
    else:
        json.dump(final_results, sys.stdout, sort_keys=True, indent=4)

    runner.finish()
    lg.DestroyQSL(qsl)
    lg.DestroySUT(sut)
def main():
    """Run one LoadGen benchmark for the backend/dataset chosen on the command line.

    Steps, in order:
      1. Parse arguments, pick the backend, build the dataset and load the model.
      2. Resolve the mlperf/user config files (exit with status 1 if missing).
      3. If ``--output`` is given, create that directory, stage an
         ``audit.config`` there when one is found, and chdir into it.
      4. Warm up the backend, build the runner, SUT and QSL, and run the test.
      5. Log the LoadGen summary, write ``accuracy.txt`` in accuracy mode,
         write ``results.json`` when ``--output`` is given, and clean up.

    Side effects: may change the process working directory, writes files into
    the output directory, and updates the module-level ``last_timeing``.
    """
    global last_timeing
    args = get_args()

    log.info(args)

    # find backend
    backend = get_backend(args.backend)
    if getattr(backend, "max_batchsize", -1) != -1:
        backend.max_batchsize = args.max_batchsize

    # override image format if given
    image_format = args.data_format if args.data_format else backend.image_format()

    # --count applies to accuracy mode only and can be used to limit the number of images
    # for testing. For perf model we always limit count to 200.
    count_override = False
    count = args.count
    if count:
        count_override = True

    # dataset to use
    wanted_dataset, pre_proc, post_proc, kwargs = SUPPORTED_DATASETS[args.dataset]
    ds = wanted_dataset(data_path=args.dataset_path,
                        image_list=args.dataset_list,
                        name=args.dataset,
                        image_format=image_format,
                        pre_process=pre_proc,
                        use_cache=args.cache,
                        count=count,
                        **kwargs)

    # load model to backend
    model = backend.load(args.model, inputs=args.inputs, outputs=args.outputs)
    final_results = {
        "runtime": model.name(),
        "version": model.version(),
        "time": int(time.time()),
        "cmdline": str(args),
    }

    mlperf_conf = os.path.abspath(args.mlperf_conf)
    if not os.path.exists(mlperf_conf):
        log.error("{} not found".format(mlperf_conf))
        sys.exit(1)

    user_conf = os.path.abspath(args.user_conf)
    if not os.path.exists(user_conf):
        log.error("{} not found".format(user_conf))
        sys.exit(1)

    # Default to the current directory so the summary/accuracy steps below
    # still have a valid location when --output is not given (previously this
    # name was undefined in that case and the run ended with a NameError).
    # NOTE(review): StartTest is called without LogSettings, so LoadGen is
    # expected to write its logs into the working directory -- confirm.
    output_dir = os.getcwd()
    audit_config_cp_loc = None
    if args.output:
        output_dir = os.path.abspath(args.output)
        os.makedirs(output_dir, exist_ok=True)

        # Check if audit.config file is used, copy to output directory before
        # we chdir to that location so loadgen can find it
        audit_files = glob.glob(
            "ncoresw/mlperf/vision/classification_and_detection/*audit.config")
        if audit_files:
            log.info("Found audit.config (" + audit_files[0] + ")")
            audit_config_cp_loc = os.path.join(output_dir, "audit.config")
            # If user already put audit.config at `output` directory, then use
            # that one. Otherwise, copy the one we found in the current
            # directory (before chdir to new output directory).
            if os.path.exists(audit_config_cp_loc):
                log.info(
                    "WARNING: audit.config already exists, so cannot copy over new audit file!"
                )
                log.info(audit_config_cp_loc)
                # Not our copy: make sure the cleanup at the end leaves it alone.
                audit_config_cp_loc = None
            else:
                shutil.copy(audit_files[0], audit_config_cp_loc)

        os.chdir(output_dir)

    #
    # make one pass over the dataset to validate accuracy
    #
    count = ds.get_item_count()

    # warmup
    warmup_queries = range(args.max_batchsize)
    ds.load_query_samples(warmup_queries)
    for _ in range(2):
        img, _ = ds.get_samples(warmup_queries)
        _ = backend.predict({backend.inputs[0]: img})
    ds.unload_query_samples(None)

    scenario = SCENARIO_MAP[args.scenario]
    runner_map = {
        lg.TestScenario.SingleStream: RunnerBase,
        lg.TestScenario.MultiStream: QueueRunner,
        lg.TestScenario.Server: QueueRunner,
        lg.TestScenario.Offline: QueueRunner
    }
    runner = runner_map[scenario](model,
                                  ds,
                                  args.threads,
                                  post_proc=post_proc,
                                  max_batchsize=args.max_batchsize)

    def issue_queries(query_samples):
        runner.enqueue(query_samples)

    def flush_queries():
        pass

    def process_latencies(latencies_ns):
        # called by loadgen to show us the recorded latencies
        global last_timeing
        last_timeing = [t / NANO_SEC for t in latencies_ns]

    settings = lg.TestSettings()
    settings.FromConfig(mlperf_conf, args.model_name, args.scenario)
    settings.FromConfig(user_conf, args.model_name, args.scenario)
    settings.scenario = scenario
    settings.mode = lg.TestMode.PerformanceOnly
    if args.accuracy:
        settings.mode = lg.TestMode.AccuracyOnly
    if args.find_peak_performance:
        settings.mode = lg.TestMode.FindPeakPerformance

    if args.time:
        # override the time we want to run
        settings.min_duration_ms = args.time * MILLI_SEC
        settings.max_duration_ms = args.time * MILLI_SEC

    if args.qps:
        qps = float(args.qps)
        settings.server_target_qps = qps
        settings.offline_expected_qps = qps

    if count_override:
        settings.min_query_count = count
        settings.max_query_count = count

    if args.samples_per_query:
        settings.multi_stream_samples_per_query = args.samples_per_query
    if args.max_latency:
        settings.server_target_latency_ns = int(args.max_latency * NANO_SEC)
        settings.multi_stream_target_latency_ns = int(args.max_latency * NANO_SEC)

    # override target latency when it needs to be less than 1ms
    if args.model_name == "mobilenet":
        settings.single_stream_expected_latency_ns = 200000
    elif args.model_name == "resnet50":
        settings.single_stream_expected_latency_ns = 900000
    elif args.model_name == "ssd-mobilenet":
        settings.single_stream_expected_latency_ns = 900000

    sut = lg.ConstructSUT(issue_queries, flush_queries, process_latencies)
    qsl = lg.ConstructQSL(count, min(count, 1024), ds.load_query_samples,
                          ds.unload_query_samples)

    log.info("starting {}".format(scenario))
    result_dict = {"good": 0, "total": 0, "scenario": str(scenario)}
    runner.start_run(result_dict, args.accuracy)
    lg.StartTest(sut, qsl, settings)

    # Fall back to the runner's own timings if loadgen reported none.
    if not last_timeing:
        last_timeing = runner.result_timing
    if args.accuracy:
        post_proc.finalize(result_dict, ds, output_dir=args.output)
    add_results(final_results, "{}".format(scenario), result_dict,
                last_timeing,
                time.time() - ds.last_loaded, args.accuracy)

    runner.finish()
    lg.DestroyQSL(qsl)
    lg.DestroySUT(sut)

    # Dump the summary logs to stdout for convenience
    log.info("Output dir: " + os.path.abspath(output_dir))
    with open(os.path.join(output_dir, "mlperf_log_summary.txt"), 'r') as f:
        log.info(f.read())

    # Output accuracy txt file
    if args.accuracy:
        accuracy_log = os.path.join(output_dir, "mlperf_log_accuracy.json")
        with open(os.path.join(output_dir, "accuracy.txt"), "w") as f_acc:
            # Exactly one branch must run per model: with two independent
            # `if`s, "ssd-mobilenet" also fell through to the ImageNet branch.
            if args.model_name == "ssd-mobilenet":
                # SSD accuracy calculation
                # ----------------------------------------
                # The mAP is already stored in result_dict["mAP"], but we'll call
                # `accuracy_coco()` just to keep the submission process consistent.
                accuracy_str = accuracy.CocoAcc(
                    mlperf_accuracy_file=accuracy_log,
                    coco_dir=args.dataset_path).get_accuracy() + "\n"
            elif args.model_name == "ssd-resnet34":
                accuracy_str = accuracy.CocoAcc(
                    mlperf_accuracy_file=accuracy_log,
                    coco_dir=args.dataset_path,
                    use_inv_map=True,
                    remove_48_empty_images=False).get_accuracy() + "\n"
            else:
                # ImageNet accuracy calculation
                # ----------------------------------------
                # The good / total values are already stored in result_dict["good"]
                # and result_dict["total"], but we'll call `accuracy_imagenet()`
                # just to keep the submission process consistent.
                accuracy_str = accuracy.ImagenetAcc(
                    mlperf_accuracy_file=accuracy_log,
                    imagenet_val_file=os.path.join(
                        args.dataset_path, "val_map.txt")).get_accuracy() + "\n"
            f_acc.write(accuracy_str)
            log.info(accuracy_str)

    #
    # write final results
    #
    if args.output:
        # Relative path: the working directory is already the output directory.
        with open("results.json", "w") as f:
            json.dump(final_results, f, sort_keys=True, indent=4)

    # Only remove the audit.config that this run copied in.
    if audit_config_cp_loc is not None:
        os.remove(audit_config_cp_loc)

    backend_destroy = getattr(backend, "destroy", None)
    if callable(backend_destroy):
        backend.destroy()