def __init__(self, data_location, subset, input_height, input_width,
             batch_size, num_cores, resize_method='crop',
             mean_value=[0.0, 0.0, 0.0], label_adjust=False):
    """dataloader generator

    Args:
        data_location (str): local path to the TFRecord data
        subset (str): training or validation part
        input_height (int): input image height
        input_width (int): input image width
        batch_size (int): dataloader batch size
        num_cores (int): number of cores used for parallel preprocessing
        resize_method (str, optional): data preprocessing method. Defaults to 'crop'.
        mean_value (list, optional): data mean value. Defaults to [0.0, 0.0, 0.0].
        label_adjust (bool, optional): adjust the label values. Defaults to False.
    """
    self.batch_size = batch_size
    self.subset = subset
    self.dataset = datasets.ImagenetData(data_location)
    self.total_image = self.dataset.num_examples_per_epoch(self.subset)
    self.preprocessor = self.dataset.get_image_preprocessor()(
        input_height, input_width, batch_size, num_cores, resize_method,
        mean_value)
    self.label_adjust = label_adjust
    # number of full batches in one epoch
    self.n = int(self.total_image / self.batch_size)
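# Usage sketch for the loader above. The enclosing class name
# (ImageNetDataloader) and the data path are assumptions for illustration;
# only the constructor signature comes from this file.
loader = ImageNetDataloader(
    data_location='/path/to/imagenet_tfrecords',  # hypothetical TFRecord dir
    subset='validation',
    input_height=224,
    input_width=224,
    batch_size=32,
    num_cores=4,
    resize_method='crop')
print('full batches per epoch: %d' % loader.n)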
def run(self):
    """run benchmark with optimized graph"""
    print("Run inference")

    data_config = tf.compat.v1.ConfigProto()
    data_config.intra_op_parallelism_threads = self.args.data_num_intra_threads
    data_config.inter_op_parallelism_threads = self.args.data_num_inter_threads
    data_config.use_per_session_threads = 1

    infer_config = tf.compat.v1.ConfigProto()
    infer_config.intra_op_parallelism_threads = self.args.num_intra_threads
    infer_config.inter_op_parallelism_threads = self.args.num_inter_threads
    infer_config.use_per_session_threads = 1

    data_graph = tf.Graph()
    with data_graph.as_default():
        if self.args.data_location:
            print("Inference with real data.")
            dataset = datasets.ImagenetData(self.args.data_location)
            preprocessor = dataset.get_image_preprocessor()(
                INCEPTION_V3_IMAGE_SIZE, INCEPTION_V3_IMAGE_SIZE,
                self.args.batch_size,
                num_cores=self.args.num_cores,
                resize_method='bilinear')
            images, labels = preprocessor.minibatch(dataset, subset='validation')
        else:
            print("Inference with dummy data.")
            input_shape = [
                self.args.batch_size, INCEPTION_V3_IMAGE_SIZE,
                INCEPTION_V3_IMAGE_SIZE, 3
            ]
            images = tf.random.uniform(input_shape, 0.0, 255.0,
                                       dtype=tf.float32,
                                       name='synthetic_images')

    infer_graph = tf.Graph()
    with infer_graph.as_default():
        graph_def = tf.compat.v1.GraphDef()
        with tf.compat.v1.gfile.FastGFile(self.args.input_graph, 'rb') as input_file:
            input_graph_content = input_file.read()
            graph_def.ParseFromString(input_graph_content)

        output_graph = optimize_for_inference(
            graph_def, [INPUTS], [OUTPUTS],
            dtypes.float32.as_datatype_enum, False)
        tf.import_graph_def(output_graph, name='')

    # Define input and output Tensors for the inference graph
    input_tensor = infer_graph.get_tensor_by_name('input:0')
    output_tensor = infer_graph.get_tensor_by_name('predict:0')

    data_sess = tf.compat.v1.Session(graph=data_graph, config=data_config)
    infer_sess = tf.compat.v1.Session(graph=infer_graph, config=infer_config)

    num_processed_images = 0
    num_remaining_images = datasets.IMAGENET_NUM_VAL_IMAGES

    if not self.args.accuracy_only:
        iteration = 0
        warm_up_iteration = self.args.warmup_steps
        total_run = self.args.steps
        total_time = 0

        while num_remaining_images >= self.args.batch_size and iteration < total_run:
            iteration += 1
            data_load_start = time.time()
            image_np = data_sess.run(images)
            data_load_time = time.time() - data_load_start

            num_processed_images += self.args.batch_size
            num_remaining_images -= self.args.batch_size

            start_time = time.time()
            infer_sess.run([output_tensor], feed_dict={input_tensor: image_np})
            time_consume = time.time() - start_time

            # only add data loading time for real data, not for dummy data
            if self.args.data_location:
                time_consume += data_load_time

            print('Iteration %d: %.6f sec' % (iteration, time_consume))
            if iteration > warm_up_iteration:
                total_time += time_consume

        time_average = total_time / (iteration - warm_up_iteration)
        print('Average time: %.6f sec' % (time_average))
        print('Batch size = %d' % self.args.batch_size)
        if self.args.batch_size == 1:
            print('Latency: %.3f ms' % (time_average * 1000))
        print('Throughput: %.3f images/sec' % (self.args.batch_size / time_average))
    else:  # accuracy check
        total_accuracy1, total_accuracy5 = (0.0, 0.0)

        while num_remaining_images >= self.args.batch_size:
            # Reads and preprocesses data
            np_images, np_labels = data_sess.run([images, labels])
            num_processed_images += self.args.batch_size
            num_remaining_images -= self.args.batch_size

            start_time = time.time()
            # Compute inference on the preprocessed data
            predictions = infer_sess.run(output_tensor,
                                         {input_tensor: np_images})
            elapsed_time = time.time() - start_time

            with tf.Graph().as_default() as accu_graph:
                accuracy1 = tf.reduce_sum(input_tensor=tf.cast(
                    tf.nn.in_top_k(predictions=tf.constant(predictions),
                                   targets=tf.constant(np_labels), k=1),
                    tf.float32))
                accuracy5 = tf.reduce_sum(input_tensor=tf.cast(
                    tf.nn.in_top_k(predictions=tf.constant(predictions),
                                   targets=tf.constant(np_labels), k=5),
                    tf.float32))
                with tf.compat.v1.Session() as accu_sess:
                    np_accuracy1, np_accuracy5 = accu_sess.run(
                        [accuracy1, accuracy5])
                total_accuracy1 += np_accuracy1
                total_accuracy5 += np_accuracy5

            # elapsed_time is measured in seconds, so label it as such
            print("Iteration time: %0.4f sec" % elapsed_time)
            print("Processed %d images. (Top1 accuracy, Top5 accuracy) = (%0.4f, %0.4f)" \
                  % (num_processed_images, total_accuracy1 / num_processed_images,
                     total_accuracy5 / num_processed_images))
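# The accuracy loop above builds a fresh TF graph and Session per batch just
# to run in_top_k. The same count can be computed in NumPy with no session at
# all; this is a sketch of an equivalent check, not the author's code.
import numpy as np

def topk_correct(predictions, labels, k):
    """Return how many rows of `predictions` rank their true label in the top k."""
    topk = np.argsort(predictions, axis=1)[:, -k:]  # indices of the k largest logits
    return float(sum(label in row for label, row in zip(labels, topk)))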
def __init__(self):
    self.model = FLAGS.model
    self.model_conf = model_config.get_model_config(self.model)
    self.trace_filename = FLAGS.trace_file
    self.data_format = FLAGS.data_format
    self.num_batches = FLAGS.num_batches
    autotune_threshold = FLAGS.autotune_threshold if (
        FLAGS.autotune_threshold) else 1
    min_autotune_warmup = 5 * autotune_threshold * autotune_threshold
    self.num_warmup_batches = FLAGS.num_warmup_batches if (
        FLAGS.num_warmup_batches) else max(10, min_autotune_warmup)
    self.graph_file = FLAGS.graph_file
    self.resize_method = FLAGS.resize_method
    self.sync_queue_counter = 0
    self.num_gpus = FLAGS.num_gpus

    # Use the batch size from the command line if specified, otherwise use the
    # model's default batch size. Scale the benchmark's batch size by the
    # number of GPUs.
    if FLAGS.batch_size > 0:
        self.model_conf.set_batch_size(FLAGS.batch_size)
    self.batch_size = self.model_conf.get_batch_size() * FLAGS.num_gpus

    # Use the learning rate from the command line if specified, otherwise use
    # the model's default learning rate, which must always be set.
    assert self.model_conf.get_learning_rate() > 0.0
    if FLAGS.learning_rate is not None:
        self.model_conf.set_learning_rate(FLAGS.learning_rate)

    self.job_name = FLAGS.job_name  # "" for local training
    self.ps_hosts = FLAGS.ps_hosts.split(',')
    self.worker_hosts = FLAGS.worker_hosts.split(',')
    self.dataset = None
    self.data_name = FLAGS.data_name
    if FLAGS.data_dir is not None:
        if self.data_name is None:
            if 'imagenet' in FLAGS.data_dir:
                self.data_name = 'imagenet'
            elif 'flowers' in FLAGS.data_dir:
                self.data_name = 'flowers'
            else:
                raise ValueError('Could not identify name of dataset. '
                                 'Please specify with --data_name option.')
        if self.data_name == 'imagenet':
            self.dataset = datasets.ImagenetData(FLAGS.data_dir)
        elif self.data_name == 'flowers':
            self.dataset = datasets.FlowersData(FLAGS.data_dir)
        else:
            raise ValueError(
                'Unknown dataset. Must be one of imagenet or flowers.')

    self.local_parameter_device_flag = FLAGS.local_parameter_device
    if self.job_name:
        self.task_index = FLAGS.task_index
        self.cluster = tf.train.ClusterSpec({
            'ps': self.ps_hosts,
            'worker': self.worker_hosts
        })
        self.server = None

        if not self.server:
            self.server = tf.train.Server(self.cluster,
                                          job_name=self.job_name,
                                          task_index=self.task_index,
                                          config=create_config_proto(),
                                          protocol=FLAGS.server_protocol)
        worker_prefix = '/job:worker/task:%s' % self.task_index
        self.param_server_device = tf.train.replica_device_setter(
            worker_device=worker_prefix + '/cpu:0', cluster=self.cluster)
        # The device on which the queues for managing synchronization between
        # servers should be stored.
        num_ps = len(self.ps_hosts)
        self.sync_queue_devices = [
            '/job:ps/task:%s/cpu:0' % i for i in range(num_ps)
        ]
    else:
        self.task_index = 0
        self.cluster = None
        self.server = None
        worker_prefix = ''
        self.param_server_device = '/%s:0' % FLAGS.local_parameter_device
        self.sync_queue_devices = [self.param_server_device]

    # Device to use for ops that need to always run on the local worker's CPU.
    self.cpu_device = '%s/cpu:0' % worker_prefix

    # Device to use for ops that need to always run on the local worker's
    # compute device, and never on a parameter server device.
    self.raw_devices = [
        '%s/%s:%i' % (worker_prefix, FLAGS.device, i)
        for i in xrange(FLAGS.num_gpus)
    ]

    if FLAGS.staged_vars and FLAGS.variable_update != 'parameter_server':
        raise ValueError('staged_vars for now is only supported with '
                         '--variable_update=parameter_server')

    if FLAGS.variable_update == 'parameter_server':
        if self.job_name:
            if not FLAGS.staged_vars:
                self.variable_mgr = variable_mgr.VariableMgrDistributedFetchFromPS(
                    self)
            else:
                self.variable_mgr = (
                    variable_mgr.VariableMgrDistributedFetchFromStagedPS(self))
        else:
            if not FLAGS.staged_vars:
                self.variable_mgr = variable_mgr.VariableMgrLocalFetchFromPS(self)
            else:
                self.variable_mgr = variable_mgr.VariableMgrLocalFetchFromStagedPS(
                    self)
    elif FLAGS.variable_update == 'replicated':
        if self.job_name:
            raise ValueError('Invalid --variable_update in distributed mode: %s' %
                             FLAGS.variable_update)
        self.variable_mgr = variable_mgr.VariableMgrLocalReplicated(
            self, FLAGS.use_nccl)
    elif FLAGS.variable_update == 'distributed_replicated':
        if not self.job_name:
            raise ValueError('Invalid --variable_update in local mode: %s' %
                             FLAGS.variable_update)
        self.variable_mgr = variable_mgr.VariableMgrDistributedReplicated(self)
    elif FLAGS.variable_update == 'independent':
        if self.job_name:
            raise ValueError('Invalid --variable_update in distributed mode: %s' %
                             FLAGS.variable_update)
        self.variable_mgr = variable_mgr.VariableMgrIndependent(self)
    else:
        raise ValueError('Invalid --variable_update: %s' % FLAGS.variable_update)

    # Device to use for running on the local worker's compute device, but
    # with variables assigned to parameter server devices.
    self.devices = self.variable_mgr.get_devices()
    if self.job_name:
        self.global_step_device = self.param_server_device
    else:
        self.global_step_device = self.cpu_device
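# Minimal sketch of the PS/worker placement configured above, with
# hypothetical host lists; TF1-style, matching this file's APIs.
cluster = tf.train.ClusterSpec({
    'ps': ['ps0.example:2222'],  # hypothetical hosts
    'worker': ['worker0.example:2222', 'worker1.example:2222'],
})
device_fn = tf.train.replica_device_setter(
    worker_device='/job:worker/task:0/cpu:0', cluster=cluster)
with tf.device(device_fn):
    # Variables land round-robin on the ps tasks; other ops stay on the worker.
    weights = tf.Variable(tf.zeros([1024, 1024]), name='weights')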
def run(self):
    """run benchmark with optimized graph"""
    print("Run inference")

    data_config = tf.ConfigProto()
    data_config.intra_op_parallelism_threads = self.args.data_num_intra_threads
    data_config.inter_op_parallelism_threads = self.args.data_num_inter_threads
    data_config.use_per_session_threads = 1

    infer_config = tf.ConfigProto()
    infer_config.intra_op_parallelism_threads = self.args.num_intra_threads
    infer_config.inter_op_parallelism_threads = self.args.num_inter_threads
    infer_config.use_per_session_threads = 1

    data_graph = tf.Graph()
    with data_graph.as_default():
        if self.args.data_location:
            print("Inference with real data.")
            dataset = datasets.ImagenetData(self.args.data_location)
            preprocessor = dataset.get_image_preprocessor()(
                RESNET_IMAGE_SIZE, RESNET_IMAGE_SIZE, self.args.batch_size,
                intra_threads=self.args.num_intra_threads,
                resize_method='crop')
            images, labels = preprocessor.minibatch(dataset, subset='validation')
        else:
            print("Inference with dummy data.")
            input_shape = [
                self.args.batch_size, RESNET_IMAGE_SIZE, RESNET_IMAGE_SIZE, 3
            ]
            images = tf.random.uniform(input_shape, 0.0, 255.0,
                                       dtype=tf.float32,
                                       name='synthetic_images')

    infer_graph = tf.Graph()
    with infer_graph.as_default():
        # convert the frozen graph to an optimized graph
        graph_def = tf.GraphDef()
        with tf.gfile.FastGFile(self.args.input_graph, 'rb') as input_file:
            input_graph_content = input_file.read()
            graph_def.ParseFromString(input_graph_content)

        output_graph = graph_transforms.TransformGraph(
            graph_def, [INPUTS], [OUTPUTS], [OPTIMIZATION])
        tf.import_graph_def(output_graph, name='')

    # Define input and output Tensors for the inference graph
    input_tensor = infer_graph.get_tensor_by_name('input:0')
    # output_tensor = infer_graph.get_tensor_by_name('resnet_v1_101/SpatialSqueeze:0')
    output_tensor = infer_graph.get_tensor_by_name(
        'resnet_v1_101/predictions/Reshape_1:0')
    # tf.global_variables_initializer()

    data_sess = tf.Session(graph=data_graph, config=data_config)
    infer_sess = tf.Session(graph=infer_graph, config=infer_config)

    num_processed_images = 0
    num_remaining_images = IMAGENET_VALIDATION_IMAGES

    if not self.args.accuracy_only:  # performance check
        iteration = 0
        warm_up_iteration = self.args.warmup_steps
        total_run = self.args.steps
        total_time = 0
        # options = tf.RunOptions(trace_level=tf.RunOptions.FULL_TRACE)
        # run_metadata = tf.RunMetadata()

        while num_remaining_images >= self.args.batch_size and iteration < total_run:
            iteration += 1

            # Reads and preprocesses data
            data_load_start = time.time()
            image_np = data_sess.run(images)
            data_load_time = time.time() - data_load_start

            num_processed_images += self.args.batch_size
            num_remaining_images -= self.args.batch_size

            start_time = time.time()
            infer_sess.run([output_tensor], feed_dict={input_tensor: image_np})
            time_consume = time.time() - start_time

            # only add data loading time for real data, not for dummy data
            if self.args.data_location:
                time_consume += data_load_time

            # trace = timeline.Timeline(step_stats=run_metadata.step_stats)
            # with gfile.Open('resnet101_fp32_int8_master', 'w') as trace_file:
            #     trace_file.write(trace.generate_chrome_trace_format(show_memory=False))

            print('Iteration %d: %.3f sec' % (iteration, time_consume))
            if iteration > warm_up_iteration:
                total_time += time_consume

        time_average = total_time / (iteration - warm_up_iteration)
        print('Average time: %.3f sec' % (time_average))
        print('Batch size = %d' % self.args.batch_size)
        if self.args.batch_size == 1:
            print('Latency: %.3f ms' % (time_average * 1000))
        # print throughput for both batch size 1 and 128
        print('Throughput: %.3f images/sec' % (self.args.batch_size / time_average))
    else:  # accuracy check
        total_accuracy1, total_accuracy5 = (0.0, 0.0)

        while num_remaining_images >= self.args.batch_size:
            # Reads and preprocesses data
            np_images, np_labels = data_sess.run([images, labels])
            # shift 1-based labels to the 0-based indexing the model uses
            np_labels -= 1

            num_processed_images += self.args.batch_size
            num_remaining_images -= self.args.batch_size

            start_time = time.time()
            # Compute inference on the preprocessed data
            predictions = infer_sess.run(output_tensor, {input_tensor: np_images})
            elapsed_time = time.time() - start_time

            # Putting all of this code inside one graph context makes things faster.
            with tf.Graph().as_default() as accu_graph:
                accuracy1 = tf.reduce_sum(
                    tf.cast(
                        tf.nn.in_top_k(tf.constant(predictions),
                                       tf.constant(np_labels), 1), tf.float32))
                accuracy5 = tf.reduce_sum(
                    tf.cast(
                        tf.nn.in_top_k(tf.constant(predictions),
                                       tf.constant(np_labels), 5), tf.float32))
                with tf.Session() as accu_sess:
                    np_accuracy1, np_accuracy5 = accu_sess.run(
                        [accuracy1, accuracy5])
                total_accuracy1 += np_accuracy1
                total_accuracy5 += np_accuracy5

            # elapsed_time is measured in seconds, so label it as such
            print("Iteration time: %0.4f sec" % elapsed_time)
            print("Processed %d images. (Top1 accuracy, Top5 accuracy) = (%0.4f, %0.4f)" \
                  % (num_processed_images, total_accuracy1 / num_processed_images,
                     total_accuracy5 / num_processed_images))
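# The TransformGraph call above takes node-name lists and a list of transform
# strings; INPUTS, OUTPUTS, and OPTIMIZATION are constants defined elsewhere
# in this script. A sketch with illustrative values (the transforms shown here
# are assumptions, not necessarily the ones this benchmark uses):
from tensorflow.tools.graph_transforms import TransformGraph

optimized_graph_def = TransformGraph(
    graph_def,
    ['input'],                                # graph input node names
    ['resnet_v1_101/predictions/Reshape_1'],  # graph output node names
    ['strip_unused_nodes', 'fold_constants(ignore_errors=true)'])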
data_config = tf.ConfigProto()
data_config.intra_op_parallelism_threads = args.data_num_intra_threads
data_config.inter_op_parallelism_threads = args.data_num_inter_threads
data_config.use_per_session_threads = 1

infer_config = tf.ConfigProto()
infer_config.intra_op_parallelism_threads = num_intra_threads
infer_config.inter_op_parallelism_threads = num_inter_threads
infer_config.use_per_session_threads = 1

data_graph = tf.Graph()
with data_graph.as_default():
    if args.data_location:
        print("inference with real data")
        # get the images from dataset
        dataset = datasets.ImagenetData(args.data_location)
        preprocessor = dataset.get_image_preprocessor(benchmark=True)(
            input_height, input_width, batch_size,
            num_cores=args.num_cores,
            resize_method='crop')
        images = preprocessor.minibatch(dataset, subset='validation')
    else:  # synthetic images
        print("inference with dummy data")
        input_shape = [batch_size, input_height, input_width, 3]
        images = tf.random.uniform(input_shape, 0.0, 255.0,
                                   dtype=tf.float32,
                                   name='synthetic_images')  # trailing args assumed; the call was truncated in the source
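# A sketch of pulling one batch out of the data graph built above; the session
# wiring mirrors the other benchmarks in this file and is shown only to make
# the snippet self-contained.
data_sess = tf.Session(graph=data_graph, config=data_config)
image_np = data_sess.run(images)  # shape: (batch_size, input_height, input_width, 3)
print(image_np.shape)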
sys.exit("Please provide a graph file.") if args.input_height: input_height = args.input_height else: input_height = 224 if args.input_width: input_width = args.input_width else: input_width = 224 batch_size = args.batch_size input_layer = args.input_layer output_layer = args.output_layer num_inter_threads = args.num_inter_threads num_intra_threads = args.num_intra_threads data_location = args.data_location dataset = datasets.ImagenetData(data_location) preprocessor = dataset.get_image_preprocessor()( input_height, input_width, batch_size, 1, # device count tf.float32, # data_type for input fed to the graph train=False, # doing inference resize_method='bilinear') images, labels = preprocessor.minibatch(dataset, subset='train', use_datasets=True, cache_data=False) graph = load_graph(model_file) input_tensor = graph.get_tensor_by_name(input_layer + ":0")
def run(self):
    """run benchmark with optimized graph"""
    print("Run inference")

    data_config = tf.ConfigProto()
    data_config.intra_op_parallelism_threads = self.args.data_num_intra_threads
    data_config.inter_op_parallelism_threads = self.args.data_num_inter_threads
    data_config.use_per_session_threads = 1

    infer_config = tf.ConfigProto()
    infer_config.intra_op_parallelism_threads = self.args.num_intra_threads
    infer_config.inter_op_parallelism_threads = self.args.num_inter_threads
    infer_config.use_per_session_threads = 1

    data_graph = tf.Graph()
    with data_graph.as_default():
        if self.args.data_location:
            print("Inference with real data.")
            if self.args.calibrate:
                subset = 'calibration'
            else:
                subset = 'validation'
            dataset = datasets.ImagenetData(self.args.data_location)
            preprocessor = dataset.get_image_preprocessor()(
                RESNET_IMAGE_SIZE, RESNET_IMAGE_SIZE, self.args.batch_size,
                num_cores=self.args.num_cores,
                resize_method='crop')
            images, labels, filenames = preprocessor.minibatch(dataset, subset=subset)

            # If a results file path is provided, then start the prediction output file
            if self.args.results_file_path:
                with open(self.args.results_file_path, "w+") as fp:
                    fp.write("filename,actual,prediction\n")
        else:
            print("Inference with dummy data.")
            input_shape = [self.args.batch_size, RESNET_IMAGE_SIZE,
                           RESNET_IMAGE_SIZE, 3]
            images = tf.random.uniform(input_shape, 0.0, 255.0,
                                       dtype=tf.float32,
                                       name='synthetic_images')

    infer_graph = tf.Graph()
    with infer_graph.as_default():
        graph_def = tf.GraphDef()
        with tf.gfile.FastGFile(self.args.input_graph, 'rb') as input_file:
            input_graph_content = input_file.read()
            graph_def.ParseFromString(input_graph_content)

        output_graph = graph_transforms.TransformGraph(
            graph_def, [INPUTS], [OUTPUTS], [OPTIMIZATION])
        tf.import_graph_def(output_graph, name='')

    # Define input and output Tensors for the inference graph
    input_tensor = infer_graph.get_tensor_by_name('input:0')
    output_tensor = infer_graph.get_tensor_by_name('predict:0')

    data_sess = tf.Session(graph=data_graph, config=data_config)
    infer_sess = tf.Session(graph=infer_graph, config=infer_config)

    num_processed_images = 0
    num_remaining_images = dataset.num_examples_per_epoch(subset=subset) - num_processed_images \
        if self.args.data_location else (self.args.batch_size * self.args.steps)

    if not self.args.accuracy_only:
        iteration = 0
        warm_up_iteration = self.args.warmup_steps
        total_run = self.args.steps
        total_time = 0

        while num_remaining_images >= self.args.batch_size and iteration < total_run:
            iteration += 1
            tf_filenames = None
            np_labels = None

            data_load_start = time.time()
            if self.args.results_file_path:
                image_np, np_labels, tf_filenames = data_sess.run([images, labels, filenames])
            else:
                image_np = data_sess.run(images)
            data_load_time = time.time() - data_load_start

            num_processed_images += self.args.batch_size
            num_remaining_images -= self.args.batch_size

            start_time = time.time()
            predictions = infer_sess.run(output_tensor, feed_dict={input_tensor: image_np})
            time_consume = time.time() - start_time

            # Write out the file name, expected label, and top prediction
            self.write_results_output(predictions, tf_filenames, np_labels)

            # only add data loading time for real data, not for dummy data
            if self.args.data_location:
                time_consume += data_load_time

            print('Iteration %d: %.6f sec' % (iteration, time_consume))
            if iteration > warm_up_iteration:
                total_time += time_consume

        time_average = total_time / (iteration - warm_up_iteration)
        print('Average time: %.6f sec' % (time_average))
        print('Batch size = %d' % self.args.batch_size)
        if self.args.batch_size == 1:
            print('Latency: %.3f ms' % (time_average * 1000))
        # print throughput for both batch size 1 and 128
        print('Throughput: %.3f images/sec' % (self.args.batch_size / time_average))
    else:  # accuracy check
        total_accuracy1, total_accuracy5 = (0.0, 0.0)

        while num_remaining_images >= self.args.batch_size:
            # Reads and preprocesses data
            tf_filenames = None
            if self.args.results_file_path:
                np_images, np_labels, tf_filenames = data_sess.run([images, labels, filenames])
            else:
                np_images, np_labels = data_sess.run([images, labels])
            num_processed_images += self.args.batch_size
            num_remaining_images -= self.args.batch_size

            start_time = time.time()
            # Compute inference on the preprocessed data
            predictions = infer_sess.run(output_tensor, {input_tensor: np_images})
            elapsed_time = time.time() - start_time

            # Write out the file name, expected label, and top prediction
            self.write_results_output(predictions, tf_filenames, np_labels)

            with tf.Graph().as_default() as accu_graph:
                accuracy1 = tf.reduce_sum(
                    tf.cast(tf.nn.in_top_k(tf.constant(predictions),
                                           tf.constant(np_labels), 1), tf.float32))
                accuracy5 = tf.reduce_sum(
                    tf.cast(tf.nn.in_top_k(tf.constant(predictions),
                                           tf.constant(np_labels), 5), tf.float32))
                with tf.Session() as accu_sess:
                    np_accuracy1, np_accuracy5 = accu_sess.run([accuracy1, accuracy5])
                total_accuracy1 += np_accuracy1
                total_accuracy5 += np_accuracy5

            # elapsed_time is measured in seconds, so label it as such
            print("Iteration time: %0.4f sec" % elapsed_time)
            print("Processed %d images. (Top1 accuracy, Top5 accuracy) = (%0.4f, %0.4f)" \
                  % (num_processed_images, total_accuracy1 / num_processed_images,
                     total_accuracy5 / num_processed_images))
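# write_results_output is called above but not shown in this excerpt. Based on
# the "filename,actual,prediction" header written earlier, a plausible
# implementation (an assumption, not the file's code) is:
def write_results_output(self, predictions, filenames, labels):
    """Append one CSV row per image: filename, true label, top-1 prediction."""
    if not self.args.results_file_path:
        return
    top_predictions = np.argmax(predictions, 1)
    with open(self.args.results_file_path, "a") as fp:
        for filename, actual, prediction in zip(filenames, labels, top_predictions):
            fp.write("{},{},{}\n".format(filename, actual, prediction))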
def run(self):
    """run benchmark with optimized graph"""
    with tf.Graph().as_default() as graph:
        config = tf.ConfigProto()
        config.allow_soft_placement = True
        config.intra_op_parallelism_threads = self.args.num_intra_threads
        config.inter_op_parallelism_threads = self.args.num_inter_threads

        with tf.Session(config=config) as sess:
            # convert the frozen graph to an optimized graph
            graph_def = tf.GraphDef()
            with tf.gfile.FastGFile(self.args.input_graph, 'rb') as input_file:
                input_graph_content = input_file.read()
                graph_def.ParseFromString(input_graph_content)

            output_graph = graph_transforms.TransformGraph(
                graph_def, [INPUTS], [OUTPUTS], [OPTIMIZATION])

            sess.graph.as_default()
            tf.import_graph_def(output_graph, name='')

            # Define input and output Tensors for the inference graph
            input_tensor = graph.get_tensor_by_name('input:0')
            output_tensor = graph.get_tensor_by_name('predict:0')
            # note: this only creates the init op and never runs it; it is
            # harmless for a frozen graph, which has no variables to initialize
            tf.global_variables_initializer()

            num_processed_images = 0
            num_remaining_images = IMAGENET_VALIDATION_IMAGES

            if self.args.data_location:
                print("Inference with real data.")
                dataset = datasets.ImagenetData(self.args.data_location)
                preprocessor = preprocessing.ImagePreprocessor(
                    RESNET_IMAGE_SIZE, RESNET_IMAGE_SIZE, self.args.batch_size,
                    1,  # device count
                    tf.float32,  # data_type for input fed to the graph
                    train=False,  # doing inference
                    resize_method='crop')
                images, labels, filenames = preprocessor.minibatch(
                    dataset, subset='validation')
                num_remaining_images = dataset.num_examples_per_epoch(
                    subset='validation') - num_processed_images
            else:
                print("Inference with dummy data.")
                input_shape = [self.args.batch_size, RESNET_IMAGE_SIZE,
                               RESNET_IMAGE_SIZE, 3]
                images = tf.random.uniform(input_shape, 0.0, 255.0,
                                           dtype=tf.float32,
                                           name='synthetic_images')

            if not self.args.accuracy_only:  # performance check
                iteration = 0
                warm_up_iteration = 10
                total_run = 40
                total_time = 0

                while num_remaining_images >= self.args.batch_size and iteration < total_run:
                    iteration += 1

                    # Reads and preprocesses data
                    if self.args.data_location:
                        preprocessed_images = sess.run([images[0]])
                        image_np = preprocessed_images[0]
                    else:
                        image_np = sess.run(images)

                    num_processed_images += self.args.batch_size
                    num_remaining_images -= self.args.batch_size

                    start_time = time.time()
                    predicts = sess.run([output_tensor],
                                        feed_dict={input_tensor: image_np})
                    time_consume = time.time() - start_time

                    print('Iteration %d: %.3f sec' % (iteration, time_consume))
                    if iteration > warm_up_iteration:
                        total_time += time_consume

                time_average = total_time / (iteration - warm_up_iteration)
                print('Average time: %.3f sec' % (time_average))
                print('Batch size = %d' % self.args.batch_size)
                if self.args.batch_size == 1:
                    print('Latency: %.3f ms' % (time_average * 1000))
                # print throughput for both batch size 1 and 128
                print('Throughput: %.3f images/sec' % (self.args.batch_size / time_average))
            else:  # accuracy check
                total_accuracy1, total_accuracy5 = (0.0, 0.0)

                # If a results file path is provided, then start the prediction output file
                if self.args.results_file_path:
                    with open(self.args.results_file_path, "w+") as fp:
                        fp.write("filename,actual,prediction\n")

                while num_remaining_images >= self.args.batch_size:
                    # Reads and preprocesses data
                    np_images, np_labels, tf_filenames = sess.run(
                        [images[0], labels[0], filenames[0]])
                    num_processed_images += self.args.batch_size
                    num_remaining_images -= self.args.batch_size

                    # Compute inference on the preprocessed data
                    predictions = sess.run(output_tensor, {input_tensor: np_images})

                    # Write out the file name, expected label, and top prediction
                    if self.args.results_file_path:
                        top_predictions = np.argmax(predictions, 1)
                        with open(self.args.results_file_path, "a") as fp:
                            for filename, expected_label, top_prediction in \
                                    zip(tf_filenames, np_labels, top_predictions):
                                fp.write("{},{},{}\n".format(
                                    filename, expected_label, top_prediction))

                    accuracy1 = tf.reduce_sum(
                        tf.cast(tf.nn.in_top_k(tf.constant(predictions),
                                               tf.constant(np_labels), 1), tf.float32))
                    accuracy5 = tf.reduce_sum(
                        tf.cast(tf.nn.in_top_k(tf.constant(predictions),
                                               tf.constant(np_labels), 5), tf.float32))
                    np_accuracy1, np_accuracy5 = sess.run([accuracy1, accuracy5])
                    total_accuracy1 += np_accuracy1
                    total_accuracy5 += np_accuracy5
                    print("Processed %d images. (Top1 accuracy, Top5 accuracy) = (%0.4f, %0.4f)" \
                          % (num_processed_images, total_accuracy1 / num_processed_images,
                             total_accuracy5 / num_processed_images))
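# Semantics of the tf.nn.in_top_k check used above, on a tiny hand-made batch;
# the numbers are illustrative only.
import numpy as np

preds = np.array([[0.1, 0.7, 0.2],
                  [0.5, 0.3, 0.2]], dtype=np.float32)
labels = np.array([1, 2], dtype=np.int32)
with tf.Session() as s:
    print(s.run(tf.nn.in_top_k(tf.constant(preds), tf.constant(labels), 2)))
    # -> [ True False ]: label 1 is in row 0's top-2, label 2 is not in row 1's.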
def eval_inference(self, infer_graph):
    """run benchmark with optimized graph"""
    print("Run inference")

    data_config = tf.compat.v1.ConfigProto()
    data_config.intra_op_parallelism_threads = self.args.data_num_intra_threads
    data_config.inter_op_parallelism_threads = self.args.data_num_inter_threads
    data_config.use_per_session_threads = 1

    infer_config = tf.compat.v1.ConfigProto()
    if self.args.env == 'mkl':
        print("Set inter and intra for mkl")
        infer_config.intra_op_parallelism_threads = self.args.num_intra_threads
        infer_config.inter_op_parallelism_threads = self.args.num_inter_threads
    infer_config.use_per_session_threads = 1

    data_graph = tf.Graph()
    with data_graph.as_default():
        if self.args.data_location:
            print("Inference with real data.")
            dataset = datasets.ImagenetData(self.args.data_location)
            preprocessor = dataset.get_image_preprocessor()(
                self.args.image_size, self.args.image_size,
                self.args.batch_size,
                num_cores=self.args.num_cores,
                resize_method=self.args.resize_method,
                mean_value=[self.args.r_mean, self.args.g_mean, self.args.b_mean],
                scale=self.args.scale)
            images, labels = preprocessor.minibatch(dataset, subset='validation')
        else:
            print("Inference with dummy data.")
            input_shape = [
                self.args.batch_size, self.args.image_size,
                self.args.image_size, 3
            ]
            images = tf.random.uniform(input_shape, 0.0, 255.0,
                                       dtype=tf.float32,
                                       name='synthetic_images')

    # Define input and output Tensors for the inference graph
    input_tensor = infer_graph.get_tensor_by_name(self.args.input + ':0')
    output_tensor = infer_graph.get_tensor_by_name(self.args.output + ':0')

    data_sess = tf.compat.v1.Session(graph=data_graph, config=data_config)
    infer_sess = tf.compat.v1.Session(graph=infer_graph, config=infer_config)

    num_processed_images = 0
    num_remaining_images = datasets.IMAGENET_NUM_VAL_IMAGES

    if not self.args.accuracy_only:
        iteration = 0
        warm_up_iteration = self.args.warmup_steps
        total_run = self.args.steps
        total_time = 0

        while num_remaining_images >= self.args.batch_size and iteration < total_run:
            iteration += 1
            data_load_start = time.time()
            image_np = data_sess.run(images)
            data_load_time = time.time() - data_load_start

            num_processed_images += self.args.batch_size
            num_remaining_images -= self.args.batch_size

            start_time = time.time()
            infer_sess.run([output_tensor], feed_dict={input_tensor: image_np})
            time_consume = time.time() - start_time

            # # only add data loading time for real data, not for dummy data
            # if self.args.data_location:
            #     time_consume += data_load_time

            print('Iteration %d: %.6f sec' % (iteration, time_consume))
            if iteration > warm_up_iteration:
                total_time += time_consume

        time_average = total_time / (iteration - warm_up_iteration)
        print('Average time: %.6f sec' % (time_average))
        print('Batch size = %d' % self.args.batch_size)
        if self.args.batch_size == 1:
            print('Latency: %.3f ms' % (time_average * 1000))
        print('Throughput: %.3f images/sec' % (self.args.batch_size / time_average))
    else:  # accuracy check
        total_accuracy1, total_accuracy5 = (0.0, 0.0)

        while num_remaining_images >= self.args.batch_size:
            # Reads and preprocesses data
            np_images, np_labels = data_sess.run([images, labels])
            if self.args.label_adjust:
                # shift 1-based labels to 0-based indexing
                np_labels -= 1
            num_processed_images += self.args.batch_size
            num_remaining_images -= self.args.batch_size

            start_time = time.time()
            # Compute inference on the preprocessed data
            predictions = infer_sess.run(output_tensor, {input_tensor: np_images})
            elapsed_time = time.time() - start_time

            # DenseNet output dim reshape (4 -> 2) for accuracy test
            if len(predictions.shape) > 2:
                predictions = predictions.reshape(predictions.shape[0], -1)

            with tf.Graph().as_default() as accu_graph:
                accuracy1 = tf.reduce_sum(input_tensor=tf.cast(
                    tf.nn.in_top_k(predictions=tf.constant(predictions),
                                   targets=tf.constant(np_labels), k=1),
                    tf.float32))
                accuracy5 = tf.reduce_sum(input_tensor=tf.cast(
                    tf.nn.in_top_k(predictions=tf.constant(predictions),
                                   targets=tf.constant(np_labels), k=5),
                    tf.float32))
                with tf.compat.v1.Session() as accu_sess:
                    np_accuracy1, np_accuracy5 = accu_sess.run(
                        [accuracy1, accuracy5])
                total_accuracy1 += np_accuracy1
                total_accuracy5 += np_accuracy5

            # elapsed_time is measured in seconds, so label it as such
            print("Iteration time: %0.4f sec" % elapsed_time)
            print("Processed %d images. (Top1 accuracy, Top5 accuracy) = (%0.4f, %0.4f)" \
                  % (num_processed_images, total_accuracy1 / num_processed_images,
                     total_accuracy5 / num_processed_images))

        print("Accuracy: %.5f" % (total_accuracy1 / num_processed_images))
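# The DenseNet reshape above flattens 4-D logits (N, 1, 1, C) into the 2-D
# (N, C) shape that in_top_k expects; a tiny illustration with an assumed
# output shape:
import numpy as np

predictions = np.zeros((8, 1, 1, 1001), dtype=np.float32)  # hypothetical model output
if len(predictions.shape) > 2:
    predictions = predictions.reshape(predictions.shape[0], -1)
print(predictions.shape)  # (8, 1001)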