def create_exec_infer_model(self, model_dir, labels, num_requests=2):
    model_xml = os.path.join(model_dir, 'frozen_inference_graph.xml')
    model_bin = os.path.join(model_dir, 'frozen_inference_graph.bin')
    exported_model = os.path.join(model_dir, 'exported_model')
    assert os.path.isfile(model_bin)
    assert os.path.isfile(model_xml)

    ie = IECore()
    net = IENetwork(model=model_xml, weights=model_bin)

    img_info_input_blob = None
    feed_dict = {}
    for blob_name in net.inputs:
        if len(net.inputs[blob_name].shape) == 4:
            input_blob = blob_name
        elif len(net.inputs[blob_name].shape) == 2:
            img_info_input_blob = blob_name
        else:
            raise RuntimeError("Unsupported {}D input layer '{}'. Only 2D and 4D input layers are supported"
                               .format(len(net.inputs[blob_name].shape), blob_name))

    assert len(net.outputs) == 1, "Demo supports only single output topologies"
    out_blob = next(iter(net.outputs))

    if os.path.isfile(exported_model):
        # Found a previously exported model: import it instead of recompiling
        print('found model to import')
        exec_net = ie.import_network(model_file=exported_model, device_name=self.device)
    else:
        print('creating exec model')
        exec_net = ie.load_network(network=net, num_requests=num_requests, device_name=self.device)
        exec_net.export(exported_model)

    n, c, h, w = net.inputs[input_blob].shape
    if img_info_input_blob:
        feed_dict[img_info_input_blob] = [h, w, 1]
    del net
    del ie
    return ExecInferModel(exec_net, input_blob, out_blob, feed_dict, n, c, h, w, num_requests, labels)
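
# Usage sketch, not part of the original source: `Detector` is an assumed wrapper class that
# exposes create_exec_infer_model() and a `device` attribute; the model directory and labels
# are placeholders. The first call compiles and exports the network; later calls import the
# exported blob, which avoids recompiling for the target device.
detector = Detector(device='MYRIAD')
infer_model = detector.create_exec_infer_model(model_dir='models/ssd_mobilenet',
                                               labels=['person', 'car'],
                                               num_requests=4)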
class Benchmark:
    def __init__(self, device: str, number_infer_requests: int = None, number_iterations: int = None,
                 duration_seconds: int = None, api_type: str = 'async'):
        self.device = device
        self.ie = IECore()
        self.nireq = number_infer_requests
        self.niter = number_iterations
        self.duration_seconds = get_duration_seconds(duration_seconds, self.niter, self.device)
        self.api_type = api_type

    def __del__(self):
        del self.ie

    def add_extension(self, path_to_extension: str = None, path_to_cldnn_config: str = None):
        if path_to_cldnn_config:
            self.ie.set_config({'CONFIG_FILE': path_to_cldnn_config}, GPU_DEVICE_NAME)
            logger.info('GPU extensions are loaded: {}'.format(path_to_cldnn_config))
        if path_to_extension:
            self.ie.add_extension(extension_path=path_to_extension, device_name=CPU_DEVICE_NAME)
            logger.info('CPU extensions are loaded: {}'.format(path_to_extension))

    def get_version_info(self) -> str:
        logger.info('InferenceEngine:\n{: <9}{:.<24} {}'.format('', 'API version', get_version()))
        version_string = 'Device info\n'
        for device, version in self.ie.get_versions(self.device).items():
            version_string += '{: <9}{}\n'.format('', device)
            version_string += '{: <9}{:.<24}{} {}.{}\n'.format('', version.description, ' version',
                                                               version.major, version.minor)
            version_string += '{: <9}{:.<24} {}\n'.format('', 'Build', version.build_number)
        return version_string

    def set_config(self, config={}):
        for device in config.keys():
            self.ie.set_config(config[device], device)

    def read_network(self, path_to_model: str):
        model_filename = os.path.abspath(path_to_model)
        head, ext = os.path.splitext(model_filename)
        weights_filename = os.path.abspath(head + BIN_EXTENSION) if ext == XML_EXTENSION else ""
        ie_network = self.ie.read_network(model_filename, weights_filename)
        return ie_network

    def load_network(self, ie_network: IENetwork, config={}):
        exe_network = self.ie.load_network(ie_network,
                                           self.device,
                                           config=config,
                                           num_requests=1 if self.api_type == 'sync' else self.nireq or 0)
        # Number of requests
        self.nireq = len(exe_network.requests)
        return exe_network

    def import_network(self, path_to_file: str, config={}):
        exe_network = self.ie.import_network(model_file=path_to_file,
                                             device_name=self.device,
                                             config=config,
                                             num_requests=1 if self.api_type == 'sync' else self.nireq or 0)
        # Number of requests
        self.nireq = len(exe_network.requests)
        return exe_network

    def first_infer(self, exe_network):
        infer_request = exe_network.requests[0]

        # Warming up - out of scope
        if self.api_type == 'sync':
            infer_request.infer()
        else:
            infer_request.async_infer()
            status = exe_network.wait()
            if status != StatusCode.OK:
                raise Exception("Waiting for all requests failed with status code {}!".format(status))
        return infer_request.latency

    def infer(self, exe_network, batch_size, progress_bar=None):
        progress_count = 0
        infer_requests = exe_network.requests

        start_time = datetime.utcnow()
        exec_time = 0
        iteration = 0

        times = []
        in_fly = set()
        # Start inference & calculate performance.
        # The loop is aligned to the number of iterations to guarantee that the last infer
        # requests are executed in the same conditions.
        while (self.niter and iteration < self.niter) or \
              (self.duration_seconds and exec_time < self.duration_seconds) or \
              (self.api_type == 'async' and iteration % self.nireq):
            if self.api_type == 'sync':
                infer_requests[0].infer()
                times.append(infer_requests[0].latency)
            else:
                infer_request_id = exe_network.get_idle_request_id()
                if infer_request_id < 0:
                    status = exe_network.wait(num_requests=1)
                    if status != StatusCode.OK:
                        raise Exception("Wait for idle request failed!")
                    infer_request_id = exe_network.get_idle_request_id()
                    if infer_request_id < 0:
                        raise Exception("Invalid request id!")
                if infer_request_id in in_fly:
                    times.append(infer_requests[infer_request_id].latency)
                else:
                    in_fly.add(infer_request_id)
                infer_requests[infer_request_id].async_infer()
            iteration += 1

            exec_time = (datetime.utcnow() - start_time).total_seconds()

            if progress_bar:
                if self.duration_seconds:
                    # Calculate how many progress intervals are covered by the current iteration.
                    # Depends on the current iteration time and the time of each progress interval.
                    # Previously covered progress intervals must be skipped.
                    progress_interval_time = self.duration_seconds / progress_bar.total_num
                    new_progress = int(exec_time / progress_interval_time - progress_count)
                    progress_bar.add_progress(new_progress)
                    progress_count += new_progress
                elif self.niter:
                    progress_bar.add_progress(1)

        # Wait for the latest inference executions
        status = exe_network.wait()
        if status != StatusCode.OK:
            raise Exception("Waiting for all requests failed with status code {}!".format(status))

        total_duration_sec = (datetime.utcnow() - start_time).total_seconds()
        for infer_request_id in in_fly:
            times.append(infer_requests[infer_request_id].latency)
        times.sort()
        latency_ms = median(times)
        fps = batch_size * 1000 / latency_ms if self.api_type == 'sync' else batch_size * iteration / total_duration_sec
        if progress_bar:
            progress_bar.finish()
        return fps, latency_ms, total_duration_sec, iteration
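
# Usage sketch, not part of the original code: the IR path 'model.xml' is a placeholder and
# batch_size=1 is an assumption. It chains the Benchmark methods in their intended order;
# with nireq unset, num_requests=0 lets the plugin pick the optimal request count.
benchmark = Benchmark(device='CPU', number_iterations=100, api_type='async')
print(benchmark.get_version_info())
ie_network = benchmark.read_network('model.xml')
exe_network = benchmark.load_network(ie_network)
benchmark.first_infer(exe_network)  # warm-up run, excluded from the statistics
fps, latency_ms, duration_sec, iterations = benchmark.infer(exe_network, batch_size=1)
print('Latency: {:.2f} ms, throughput: {:.2f} FPS over {} iterations'.format(latency_ms, fps, iterations))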
def main():
    log.basicConfig(format='[ %(levelname)s ] %(message)s', level=log.INFO, stream=sys.stdout)
    args = parse_args()

# ---------------------------Step 1. Initialize inference engine core--------------------------------------------------
    log.info('Creating Inference Engine')
    ie = IECore()

# ---------------------------Step 2. Read a model in OpenVINO Intermediate Representation---------------
    if args.model:
        log.info(f'Reading the network: {args.model}')
        # .xml and .bin files
        net = ie.read_network(model=args.model)

# ---------------------------Step 3. Configure input & output----------------------------------------------------------
        log.info('Configuring input and output blobs')
        # Mark layers from args.output_layers as outputs
        if args.output_layers:
            net.add_outputs(get_output_layer_list(net, args, with_ports=True))

        # Get names of input and output blobs
        input_blobs = get_input_layer_list(net, args)
        output_blobs = get_output_layer_list(net, args, with_ports=False)

        # Set input and output precision manually
        for blob_name in input_blobs:
            net.input_info[blob_name].precision = 'FP32'

        for blob_name in output_blobs:
            net.outputs[blob_name].precision = 'FP32'

        net.batch_size = args.batch_size

# ---------------------------Step 4. Loading model to the device-------------------------------------------------------
    devices = args.device.replace('HETERO:', '').split(',')
    plugin_config = {}

    if 'GNA' in args.device:
        gna_device_mode = devices[0] if '_' in devices[0] else 'GNA_AUTO'
        devices[0] = 'GNA'

        plugin_config['GNA_DEVICE_MODE'] = gna_device_mode
        plugin_config['GNA_PRECISION'] = f'I{args.quantization_bits}'

        # Set a GNA scale factor
        if args.import_gna_model:
            if args.scale_factor:
                log.warning(f'Custom scale factor will be used for imported GNA model: {args.import_gna_model}')
                set_scale_factors(plugin_config, parse_scale_factors(args))
            else:
                log.info(f'Using scale factor from the imported GNA model: {args.import_gna_model}')
        else:
            if args.scale_factor:
                set_scale_factors(plugin_config, parse_scale_factors(args))
            else:
                scale_factors = []

                for file_name in re.split(', |,', args.input):
                    first_utterance = next(iter(read_utterance_file(file_name).values()))
                    scale_factors.append(get_scale_factor(first_utterance))

                log.info('Using scale factor(s) calculated from first utterance')
                set_scale_factors(plugin_config, scale_factors)

        if args.export_embedded_gna_model:
            plugin_config['GNA_FIRMWARE_MODEL_IMAGE'] = args.export_embedded_gna_model
            plugin_config['GNA_FIRMWARE_MODEL_IMAGE_GENERATION'] = args.embedded_gna_configuration

        if args.performance_counter:
            plugin_config['PERF_COUNT'] = 'YES'

    device_str = f'HETERO:{",".join(devices)}' if 'HETERO' in args.device else devices[0]

    log.info('Loading the model to the plugin')
    if args.model:
        exec_net = ie.load_network(net, device_str, plugin_config)
    else:
        exec_net = ie.import_network(args.import_gna_model, device_str, plugin_config)
        input_blobs = get_input_layer_list(exec_net, args)
        output_blobs = get_output_layer_list(exec_net, args, with_ports=False)

    if args.input:
        input_files = re.split(', |,', args.input)

        if len(input_blobs) != len(input_files):
            log.error(f'Number of network inputs ({len(input_blobs)}) is not equal '
                      f'to number of ark files ({len(input_files)})')
            sys.exit(-3)

    if args.reference:
        reference_files = re.split(', |,', args.reference)

        if len(output_blobs) != len(reference_files):
            log.error('The number of reference files is not equal to the number of network outputs.')
            sys.exit(-5)

    if args.output:
        output_files = re.split(', |,', args.output)

        if len(output_blobs) != len(output_files):
            log.error('The number of output files is not equal to the number of network outputs.')
            sys.exit(-6)

    if args.export_gna_model:
        log.info(f'Writing GNA Model to {args.export_gna_model}')
        exec_net.export(args.export_gna_model)
        return 0

    if args.export_embedded_gna_model:
        log.info(f'Exported GNA embedded model to file {args.export_embedded_gna_model}')
        log.info(f'GNA embedded model export done for GNA generation {args.embedded_gna_configuration}')
        return 0

# ---------------------------Step 5. Create infer request--------------------------------------------------------------
# load_network() method of the IECore class with a specified number of requests (default 1) returns an ExecutableNetwork
# instance which stores infer requests. So you already created Infer requests in the previous step.

# ---------------------------Step 6. Prepare input---------------------------------------------------------------------
    file_data = [read_utterance_file(file_name) for file_name in input_files]
    input_data = {
        utterance_name: {
            input_blobs[i]: file_data[i][utterance_name] for i in range(len(input_blobs))
        }
        for utterance_name in file_data[0].keys()
    }

    if args.reference:
        references = {output_blobs[i]: read_utterance_file(reference_files[i]) for i in range(len(output_blobs))}

# ---------------------------Step 7. Do inference----------------------------------------------------------------------
    log.info('Starting inference in synchronous mode')
    results = {blob_name: {} for blob_name in output_blobs}
    total_infer_time = 0

    for i, key in enumerate(sorted(input_data)):
        start_infer_time = default_timer()

        # Reset states between utterance inferences to remove a memory impact
        for request in exec_net.requests:
            for state in request.query_state():
                state.reset()

        result = infer_data(input_data[key], exec_net, input_blobs, output_blobs)

        for blob_name in result.keys():
            results[blob_name][key] = result[blob_name]

        infer_time = default_timer() - start_infer_time
        total_infer_time += infer_time
        num_of_frames = file_data[0][key].shape[0]
        avg_infer_time_per_frame = infer_time / num_of_frames

# ---------------------------Step 8. Process output--------------------------------------------------------------------
        log.info('')
        log.info(f'Utterance {i} ({key}):')
        log.info(f'Total time in Infer (HW and SW): {infer_time * 1000:.2f}ms')
        log.info(f'Frames in utterance: {num_of_frames}')
        log.info(f'Average Infer time per frame: {avg_infer_time_per_frame * 1000:.2f}ms')

        for blob_name in output_blobs:
            log.info('')
            log.info(f'Output blob name: {blob_name}')
            log.info(f'Number scores per frame: {results[blob_name][key].shape[1]}')

            if args.reference:
                log.info('')
                compare_with_reference(results[blob_name][key], references[blob_name][key])

    if args.performance_counter:
        if 'GNA' in args.device:
            pc = exec_net.requests[0].get_perf_counts()
            total_cycles = int(pc['1.1 Total scoring time in HW']['real_time'])
            stall_cycles = int(pc['1.2 Stall scoring time in HW']['real_time'])
            active_cycles = total_cycles - stall_cycles
            frequency = 10**6
            if args.arch == 'CORE':
                frequency *= GNA_CORE_FREQUENCY
            else:
                frequency *= GNA_ATOM_FREQUENCY
            total_inference_time = total_cycles / frequency
            active_time = active_cycles / frequency
            stall_time = stall_cycles / frequency
            log.info('')
            log.info('Performance Statistics of GNA Hardware')
            log.info(f'   Total Inference Time: {(total_inference_time * 1000):.4f} ms')
            log.info(f'   Active Time: {(active_time * 1000):.4f} ms')
            log.info(f'   Stall Time: {(stall_time * 1000):.4f} ms')

    log.info('')
    log.info(f'Total sample time: {total_infer_time * 1000:.2f}ms')

    if args.output:
        for i, blob_name in enumerate(results):
            write_utterance_file(output_files[i], results[blob_name])
            log.info(f'File {output_files[i]} was created!')

# ----------------------------------------------------------------------------------------------------------------------
    log.info('This sample is an API example, '
             'for any performance measurements please use the dedicated benchmark_app tool\n')
    return 0
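
# The helper infer_data() called in Step 7 is not shown in this listing. A minimal sketch of
# what it plausibly does (an assumption, not the original implementation): feed the utterance
# through the executable network frame by frame and stack the per-frame scores.
import numpy as np

def infer_data(data, exec_net, input_blobs, output_blobs):
    """Sketch: infer one utterance frame by frame and return {output_blob: scores}."""
    results = {blob_name: [] for blob_name in output_blobs}
    num_frames = next(iter(data.values())).shape[0]
    for i in range(num_frames):
        # Keep a leading batch dimension of 1; stateful layers carry context between frames
        frame_input = {blob_name: data[blob_name][i:i + 1] for blob_name in input_blobs}
        frame_result = exec_net.infer(frame_input)
        for blob_name in output_blobs:
            results[blob_name].append(frame_result[blob_name])
    return {blob_name: np.vstack(frames) for blob_name, frames in results.items()}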
print('select infer requests (0 for synchronous)')
infer_req = int(input())

ie = IECore()
net = IENetwork(model=model_xml, weights=model_bin)

if infer_req == 0:  # Synchronous
    if os.path.isfile(exported_model):
        # Found a previously exported model to import
        print('found model to import')
        exec_net = ie.import_network(model_file=exported_model,
                                     device_name='MYRIAD',
                                     num_requests=1)
    else:
        print('creating exec model')
        exec_net = ie.load_network(network=net,
                                   num_requests=1,
                                   device_name='MYRIAD')
        exec_net.export(exported_model)

input_blob = None
feed_dict = {}
for blob_name in net.inputs:
    if len(net.inputs[blob_name].shape) == 4:
        input_blob = blob_name

output_blob = next(iter(net.outputs))
n, c, h, w = net.inputs[input_blob].shape
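
# Only the synchronous branch (infer_req == 0) is shown above. A sketch of the asynchronous
# counterpart (an assumption, mirroring the import/export logic of the branch above) would
# keep `infer_req` parallel infer requests in flight:
if infer_req > 0:  # Asynchronous
    if os.path.isfile(exported_model):
        print('found model to import')
        exec_net = ie.import_network(model_file=exported_model,
                                     device_name='MYRIAD',
                                     num_requests=infer_req)
    else:
        print('creating exec model')
        exec_net = ie.load_network(network=net,
                                   num_requests=infer_req,
                                   device_name='MYRIAD')
        exec_net.export(exported_model)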
def main():
    log.basicConfig(format='[ %(levelname)s ] %(message)s', level=log.INFO, stream=sys.stdout)
    args = parse_args()

# ---------------------------Step 1. Initialize inference engine core--------------------------------------------------
    log.info('Creating Inference Engine')
    ie = IECore()

# ---------------------------Step 2. Read a model in OpenVINO Intermediate Representation---------------
    if args.model:
        log.info(f'Reading the network: {args.model}')
        # .xml and .bin files
        net = ie.read_network(model=args.model)

# ---------------------------Step 3. Configure input & output----------------------------------------------------------
        log.info('Configuring input and output blobs')

        # Get names of input and output blobs
        if args.input_layers:
            input_blobs = re.split(', |,', args.input_layers)
        else:
            input_blobs = [next(iter(net.input_info))]

        if args.output_layers:
            output_name_port = [output.split(':') for output in re.split(', |,', args.output_layers)]
            try:
                output_name_port = [(blob_name, int(port)) for blob_name, port in output_name_port]
            except ValueError:
                log.error('Output parameter does not have a port.')
                sys.exit(-4)

            net.add_outputs(output_name_port)

            output_blobs = [blob_name for blob_name, port in output_name_port]
        else:
            output_blobs = [list(net.outputs.keys())[-1]]

        # Set input and output precision manually
        for blob_name in input_blobs:
            net.input_info[blob_name].precision = 'FP32'

        for blob_name in output_blobs:
            net.outputs[blob_name].precision = 'FP32'

        net.batch_size = args.batch_size

# ---------------------------Step 4. Loading model to the device-------------------------------------------------------
    devices = args.device.replace('HETERO:', '').split(',')
    plugin_config = {}

    if 'GNA' in args.device:
        gna_device_mode = devices[0] if '_' in devices[0] else 'GNA_AUTO'
        devices[0] = 'GNA'

        plugin_config['GNA_DEVICE_MODE'] = gna_device_mode
        plugin_config['GNA_PRECISION'] = f'I{args.quantization_bits}'

        # Get a GNA scale factor
        if args.import_gna_model:
            log.info(f'Using scale factor from the imported GNA model: {args.import_gna_model}')
        else:
            utterances = read_utterance_file(args.input.split(',')[0])
            key = sorted(utterances)[0]
            scale_factor = get_scale_factor(utterances[key])
            log.info(f'Using scale factor of {scale_factor:.7f} calculated from first utterance.')

            plugin_config['GNA_SCALE_FACTOR'] = str(scale_factor)

        if args.export_embedded_gna_model:
            plugin_config['GNA_FIRMWARE_MODEL_IMAGE'] = args.export_embedded_gna_model
            plugin_config['GNA_FIRMWARE_MODEL_IMAGE_GENERATION'] = args.embedded_gna_configuration

    device_str = f'HETERO:{",".join(devices)}' if 'HETERO' in args.device else devices[0]

    log.info('Loading the model to the plugin')
    if args.model:
        exec_net = ie.load_network(net, device_str, plugin_config)
    else:
        exec_net = ie.import_network(args.import_gna_model, device_str, plugin_config)
        input_blobs = [next(iter(exec_net.input_info))]
        output_blobs = [list(exec_net.outputs.keys())[-1]]

    if args.input:
        input_files = re.split(', |,', args.input)

        if len(input_blobs) != len(input_files):
            log.error(f'Number of network inputs ({len(input_blobs)}) is not equal '
                      f'to number of ark files ({len(input_files)})')
            sys.exit(-3)

    if args.reference:
        reference_files = re.split(', |,', args.reference)

        if len(output_blobs) != len(reference_files):
            log.error('The number of reference files is not equal to the number of network outputs.')
            sys.exit(-5)

    if args.output:
        output_files = re.split(', |,', args.output)

        if len(output_blobs) != len(output_files):
            log.error('The number of output files is not equal to the number of network outputs.')
            sys.exit(-6)

    if args.export_gna_model:
        log.info(f'Writing GNA Model to {args.export_gna_model}')
        exec_net.export(args.export_gna_model)
        return 0

    if args.export_embedded_gna_model:
        log.info(f'Exported GNA embedded model to file {args.export_embedded_gna_model}')
        log.info(f'GNA embedded model export done for GNA generation {args.embedded_gna_configuration}')
        return 0

# ---------------------------Step 5. Create infer request--------------------------------------------------------------
# load_network() method of the IECore class with a specified number of requests (default 1) returns an ExecutableNetwork
# instance which stores infer requests. So you already created Infer requests in the previous step.

# ---------------------------Step 6. Prepare input---------------------------------------------------------------------
    file_data = [read_utterance_file(file_name) for file_name in input_files]
    input_data = {
        utterance_name: {
            input_blobs[i]: file_data[i][utterance_name] for i in range(len(input_blobs))
        }
        for utterance_name in file_data[0].keys()
    }

    if args.reference:
        references = {output_blobs[i]: read_utterance_file(reference_files[i]) for i in range(len(output_blobs))}

# ---------------------------Step 7. Do inference----------------------------------------------------------------------
    log.info('Starting inference in synchronous mode')
    results = {blob_name: {} for blob_name in output_blobs}
    infer_times = []

    for key in sorted(input_data):
        start_infer_time = default_timer()

        # Reset states between utterance inferences to remove a memory impact
        for request in exec_net.requests:
            for state in request.query_state():
                state.reset()

        result = infer_data(input_data[key], exec_net, input_blobs, output_blobs)

        for blob_name in result.keys():
            results[blob_name][key] = result[blob_name]

        infer_times.append(default_timer() - start_infer_time)

# ---------------------------Step 8. Process output--------------------------------------------------------------------
    for blob_name in output_blobs:
        for i, key in enumerate(sorted(results[blob_name])):
            log.info(f'Utterance {i} ({key})')
            log.info(f'Output blob name: {blob_name}')
            log.info(f'Frames in utterance: {results[blob_name][key].shape[0]}')
            log.info(f'Total time in Infer (HW and SW): {infer_times[i] * 1000:.2f}ms')

            if args.reference:
                compare_with_reference(results[blob_name][key], references[blob_name][key])

            log.info('')

    log.info(f'Total sample time: {sum(infer_times) * 1000:.2f}ms')

    if args.output:
        for i, blob_name in enumerate(results):
            write_utterance_file(output_files[i], results[blob_name])
            log.info(f'File {output_files[i]} was created!')

# ----------------------------------------------------------------------------------------------------------------------
    log.info('This sample is an API example, '
             'for any performance measurements please use the dedicated benchmark_app tool\n')
    return 0
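
# read_utterance_file() and get_scale_factor() are not shown in this listing. A sketch of the
# scale-factor calculation, assuming it follows the OpenVINO speech sample: the factor maps
# the largest absolute feature value in the utterance onto the target GNA integer range.
import numpy as np

def get_scale_factor(matrix):
    """Sketch: derive a quantization scale factor from an utterance feature matrix."""
    target_max = 16384  # assumed target dynamic range for 16-bit GNA inputs
    max_val = np.max(np.abs(matrix))
    return 1.0 if max_val == 0 else target_max / max_val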