def bench_v1(n: int):
    """Benchmark an ``n x n`` matrix multiplication with the TensorFlow v1 API.

    Two ``n x n`` variables filled with ones are multiplied inside a session.
    Each of the ``_ARGS_REPS`` timed repetitions runs the matmul op
    ``_ARGS_MINI_BATCH`` times; the median repetition time is reported.

    Args:
        n: Side length of the square matrices.

    Returns:
        A ``(rate, elapsed_ms)`` tuple: the achieved GFLOPS and the median
        time of one repetition (a whole mini-batch) in milliseconds.
    """
    times = []
    tf.reset_default_graph()

    with tf.device("/%s:0" % (_ARGS_DEVICE)):
        matrix1 = tf.Variable(tf.ones((n, n), dtype=_ARGS_DTYPE))
        matrix2 = tf.Variable(tf.ones((n, n), dtype=_ARGS_DTYPE))
        product = tf.matmul(matrix1, matrix2)

    config = tf.ConfigProto()
    with tf.Session(config=config) as sess:
        sess.run(tf.global_variables_initializer())
        # Warmup run, excluded from the measurements.
        sess.run(product.op)

        for _ in range(_ARGS_REPS):
            start = time.monotonic()
            for _ in range(_ARGS_MINI_BATCH):
                sess.run(product.op)
            times.append(time.monotonic() - start)

    times_ms = 1000 * np.array(times)  # in seconds, convert to ms
    elapsed_ms = np.median(times_ms)

    # One matmul costs n^3 multiplications plus n^2*(n-1) additions. The timed
    # interval covers _ARGS_MINI_BATCH matmuls, so the *whole* per-matmul cost
    # must be scaled by the mini-batch size (not only the additions term).
    ops = (n**3 + (n - 1) * n**2) * _ARGS_MINI_BATCH
    rate = ops / elapsed_ms / 10**6  # in GFLOPS. (/ milli / 10**6) == (/ 10 ** 9)
    print(
        '%d x %d matmul took: \t%.4f ms,\t %.2f GFLOPS' % (n, n, elapsed_ms, rate),
        file=sys.stderr,
    )
    return rate, elapsed_ms
def bench(n):
    """Benchmark an ``n x n`` matrix multiplication with PyTorch.

    Two ``n x n`` tensors of ones are multiplied ``_ARGS_REPS`` times on the
    device selected by ``_ARGS_DEVICE``; the median time is reported.

    Args:
        n: Side length of the square matrices.

    Returns:
        A ``(rate, elapsed_ms)`` tuple: the achieved GFLOPS and the median
        time of a single multiplication in milliseconds.

    Raises:
        Exception: If ``_ARGS_DEVICE`` is ``'gpu'`` but CUDA is not available.
    """
    if _ARGS_DEVICE == 'gpu':
        if not torch.cuda.is_available():
            raise Exception("No GPU available")
        device = torch.device('cuda')
    else:
        device = torch.device('cpu')

    dtype = _ARGS_DTYPE_MAP[_ARGS_DTYPE]["dtype"]
    matrix1 = torch.ones((n, n), dtype=dtype, device=device)
    matrix2 = torch.ones((n, n), dtype=dtype, device=device)

    # CUDA kernels are launched asynchronously; without synchronizing around
    # the timed region only the launch overhead would be measured.
    on_cuda = device.type == 'cuda'

    times = []
    for _ in range(_ARGS_REPS):
        if on_cuda:
            torch.cuda.synchronize()
        start = time.monotonic()
        torch.mm(matrix1, matrix2)
        if on_cuda:
            torch.cuda.synchronize()
        times.append(time.monotonic() - start)

    times_ms = 1000 * np.array(times)  # in seconds, convert to ms
    elapsed_ms = np.median(times_ms)

    ops = n ** 3 + (n - 1) * n ** 2  # n^2*(n-1) additions, n^3 multiplications
    rate = ops / elapsed_ms / 10 ** 6  # in GFLOPS. (/ milli / 10**6) == (/ 10 ** 9)
    print(
        '%d x %d matmul took: \t%.4f ms,\t %.2f GFLOPS' % (n, n, elapsed_ms, rate),
        file=sys.stderr,
    )
    return rate, elapsed_ms
def bench_v2(n: int):
    """Benchmark an ``n x n`` matrix multiplication with the TensorFlow v2 API.

    Two ``n x n`` variables filled with ones are multiplied directly (no
    session). Each of the ``_ARGS_REPS`` timed repetitions performs
    ``_ARGS_MINI_BATCH`` multiplications; the median repetition time is
    reported.

    Args:
        n: Side length of the square matrices.

    Returns:
        A ``(rate, elapsed_ms)`` tuple: the achieved GFLOPS and the median
        time of one repetition (a whole mini-batch) in milliseconds.
    """
    times = []

    with tf.device("/%s:0" % (_ARGS_DEVICE)):
        matrix1 = tf.Variable(tf.ones((n, n), dtype=_ARGS_DTYPE))
        matrix2 = tf.Variable(tf.ones((n, n), dtype=_ARGS_DTYPE))

        for _ in range(_ARGS_REPS):
            start = time.monotonic()
            for _ in range(_ARGS_MINI_BATCH):
                tf.matmul(matrix1, matrix2)
            times.append(time.monotonic() - start)

    times_ms = 1000 * np.array(times)  # in seconds, convert to ms
    elapsed_ms = np.median(times_ms)

    # One matmul costs n^3 multiplications plus n^2*(n-1) additions. The timed
    # interval covers _ARGS_MINI_BATCH matmuls, so the *whole* per-matmul cost
    # must be scaled by the mini-batch size (not only the additions term).
    ops = (n**3 + (n - 1) * n**2) * _ARGS_MINI_BATCH
    rate = ops / elapsed_ms / 10**6  # in GFLOPS. (/ milli / 10**6) == (/ 10 ** 9)
    print(
        '%d x %d matmul took: \t%.4f ms,\t %.2f GFLOPS' % (n, n, elapsed_ms, rate),
        file=sys.stderr,
    )
    return rate, elapsed_ms
def bench(
    batch: int,
    tensor_input_height: int,
    tensor_input_width: int,
    tensor_input_channels: int,
    tensor_output_channels: int,
    filter_height: int,
    filter_width: int,
):
    """Benchmark a 2D convolution with PyTorch.

    An all-ones input of shape ``(batch, channels, height, width)`` is passed
    through a ``torch.nn.Conv2d`` layer ``_ARGS_REPS`` times on the device
    selected by ``_ARGS_DEVICE``; the median time is reported.

    Args:
        batch: Number of samples in the input tensor.
        tensor_input_height: Height of the input tensor.
        tensor_input_width: Width of the input tensor.
        tensor_input_channels: Number of input channels.
        tensor_output_channels: Number of output channels of the convolution.
        filter_height: Kernel height.
        filter_width: Kernel width.

    Returns:
        A ``(rate, elapsed_ms)`` tuple: the achieved GFLOPS and the median
        time of a single convolution in milliseconds.

    Raises:
        Exception: If ``_ARGS_DEVICE`` is ``"gpu"`` but CUDA is not available.
    """
    if _ARGS_DEVICE == "gpu":
        if not torch.cuda.is_available():
            raise Exception("No GPU available")
        device = torch.device("cuda")
    else:
        device = torch.device("cpu")

    dtype = _ARGS_DTYPE_MAP[_ARGS_DTYPE]["dtype"]
    input_tensor = torch.ones(
        (batch, tensor_input_channels, tensor_input_height, tensor_input_width),
        dtype=dtype,
        device=device,
    )
    # The layer's parameters must live on the same device and use the same
    # dtype as the input, otherwise the forward pass raises a mismatch error.
    convolution = torch.nn.Conv2d(
        in_channels=tensor_input_channels,
        out_channels=tensor_output_channels,
        kernel_size=(filter_height, filter_width),
        stride=(_ARGS_STRIDES, _ARGS_STRIDES),
        padding=(_ARGS_PADDING, _ARGS_PADDING),
    ).to(device=device, dtype=dtype)

    # CUDA kernels are launched asynchronously; without synchronizing around
    # the timed region only the launch overhead would be measured.
    on_cuda = device.type == "cuda"

    times = []
    for _ in range(_ARGS_REPS):
        if on_cuda:
            torch.cuda.synchronize()
        start = time.monotonic()
        convolution(input_tensor)
        if on_cuda:
            torch.cuda.synchronize()
        times.append(time.monotonic() - start)

    times_ms = 1000 * np.array(times)  # in seconds, convert to ms
    elapsed_ms = np.median(times_ms)

    # Source:
    #   https://github.com/tensorflow/tensorflow/blob/c19e29306ce1777456b2dbb3a14f511edf7883a8/tensorflow/python/profiler/internal/flops_registry.py#L381
    # Formula:
    #   batch_size * image_x_dim * image_y_dim * kernel_x_dim * kernel_y_dim
    #   * input_depth * output_depth * 2 / (image_x_stride * image_x_stride)
    ops = (
        batch
        * tensor_input_height
        * tensor_input_width
        * filter_height
        * filter_width
        * tensor_input_channels
        * tensor_output_channels
        * 2
    ) / (_ARGS_STRIDES * _ARGS_STRIDES)
    rate = ops / elapsed_ms / 10**6  # in GFLOPS. (/ milli / 10**6) == (/ 10 ** 9)
    print("conv took: \t%.4f ms,\t %.2f GFLOPS" % (elapsed_ms, rate), file=sys.stderr)
    return rate, elapsed_ms
def bench(
    batch: int,
    tensor_input_width: int,
    tensor_input_channels: int,
    filter_width: int,
    filter_input_channels: int,
    filter_output_channels: int,
):
    """Benchmark a 1D convolution with PyTorch.

    An all-ones input of shape ``(batch, channels, width)`` is passed through
    a ``torch.nn.Conv1d`` layer ``_ARGS_REPS`` times on the device selected by
    ``_ARGS_DEVICE``; the median time is reported.

    Args:
        batch: Number of samples in the input tensor.
        tensor_input_width: Width of the input tensor.
        tensor_input_channels: Number of input channels.
        filter_width: Kernel width.
        filter_input_channels: Accepted for interface compatibility; not used
            by this function (the layer takes ``tensor_input_channels``).
        filter_output_channels: Number of output channels of the convolution.

    Returns:
        A ``(rate, elapsed_ms)`` tuple: the achieved GFLOPS and the median
        time of a single convolution in milliseconds.

    Raises:
        Exception: If ``_ARGS_DEVICE`` is ``'gpu'`` but CUDA is not available.
    """
    if _ARGS_DEVICE == 'gpu':
        if not torch.cuda.is_available():
            raise Exception("No GPU available")
        device = torch.device('cuda')
    else:
        device = torch.device('cpu')

    dtype = _ARGS_DTYPE_MAP[_ARGS_DTYPE]["dtype"]
    input_tensor = torch.ones(
        (batch, tensor_input_channels, tensor_input_width),
        dtype=dtype,
        device=device,
    )
    # The layer's parameters must live on the same device and use the same
    # dtype as the input, otherwise the forward pass raises a mismatch error.
    convolution = torch.nn.Conv1d(
        in_channels=tensor_input_channels,
        out_channels=filter_output_channels,
        kernel_size=filter_width,
        stride=_ARGS_STRIDE,
        padding=_ARGS_PADDING,
    ).to(device=device, dtype=dtype)

    # CUDA kernels are launched asynchronously; without synchronizing around
    # the timed region only the launch overhead would be measured.
    on_cuda = device.type == 'cuda'

    times = []
    for _ in range(_ARGS_REPS):
        if on_cuda:
            torch.cuda.synchronize()
        start = time.monotonic()
        convolution(input_tensor)
        if on_cuda:
            torch.cuda.synchronize()
        times.append(time.monotonic() - start)

    times_ms = 1000 * np.array(times)  # in seconds, convert to ms
    elapsed_ms = np.median(times_ms)

    # Formula:
    #   batch_size * x_dim * kernel_x_dim
    #   * input_depth * output_depth * 2 / (x_stride)
    ops = (
        batch
        * tensor_input_width
        * filter_width
        * tensor_input_channels
        * filter_output_channels
        * 2
    ) / (_ARGS_STRIDE)
    rate = ops / elapsed_ms / 10 ** 6  # in GFLOPS. (/ milli / 10**6) == (/ 10 ** 9)
    print('conv took: \t%.4f ms,\t %.2f GFLOPS' % (elapsed_ms, rate), file=sys.stderr)
    return rate, elapsed_ms
def bench_v2():
    """Print the hello-world constant ``_ARGS_REPS`` times via the v2 API.

    Every iteration creates a fresh string constant, prints it to stderr
    with ``tf.print`` and drops the reference again.

    Returns:
        Wall-clock seconds (monotonic clock) spent on all iterations.
    """
    began = time.monotonic()

    for _ in range(_ARGS_REPS):
        greeting = tf.constant("Hello, TensorFlow by Thoth!")
        tf.print(greeting, output_stream=sys.stderr)
        del greeting

    elapsed = time.monotonic() - began
    return elapsed
def bench_v1():
    """Use v1 API for printing hello world.

    Every one of the ``_ARGS_REPS`` iterations creates a string constant,
    evaluates it in a fresh session and prints the result to stderr.

    Returns:
        Wall-clock seconds (monotonic clock) spent on all iterations.
    """
    start = time.monotonic()
    for _ in range(_ARGS_REPS):
        hello = tf.constant("Hello, TensorFlow by Thoth!")
        # Use the session as a context manager so its resources are released
        # deterministically, even if ``run`` raises, instead of relying on
        # garbage collection of the dropped reference.
        with tf.Session() as sess:
            print(sess.run(hello), file=sys.stderr)
        del hello

    return time.monotonic() - start
def bench_v1(batch: int, tensor_input_height: int, tensor_input_width: int,
             tensor_input_channels: int, filter_height: int, filter_width: int,
             filter_input_channels: int, filter_output_channels: int):
    """Benchmark a 2D convolution with the TensorFlow v1 graph API.

    The input tensor and strides come from ``create_initial_tensor``; the
    filter is an all-ones variable. After one warmup run the convolution op
    is executed ``_ARGS_REPS`` times and the median time is reported.

    Args:
        batch: Number of samples in the input tensor.
        tensor_input_height: Height of the input tensor.
        tensor_input_width: Width of the input tensor.
        tensor_input_channels: Number of input channels.
        filter_height: Kernel height.
        filter_width: Kernel width.
        filter_input_channels: Number of input channels of the filter.
        filter_output_channels: Number of output channels of the filter.

    Returns:
        A ``(rate, elapsed_ms)`` tuple: the achieved GFLOPS and the median
        time of a single convolution in milliseconds.
    """
    times = []
    # Start from an empty default graph so that repeated calls do not pile up
    # variables/ops from earlier runs (and re-initialize all of them below).
    tf.reset_default_graph()

    with tf.device("/%s:0" % (_ARGS_DEVICE)):
        init_tensor, stride = create_initial_tensor(
            batch=batch,
            tensor_input_height=tensor_input_height,
            tensor_input_width=tensor_input_width,
            tensor_input_channels=tensor_input_channels)

        init_filter = tf.Variable(
            tf.ones([
                filter_height,
                filter_width,
                filter_input_channels,
                filter_output_channels,
            ]),
            dtype=_ARGS_DTYPE,
        )
        convolution = tf.nn.conv2d(
            init_tensor,
            filter=init_filter,
            strides=stride,
            padding=_ARGS_PADDING,
            data_format=_ARGS_DATA_FORMAT,
        )

    config = tf.ConfigProto()
    with tf.Session(config=config) as sess:
        sess.run(tf.global_variables_initializer())
        # Warmup run, excluded from the measurements.
        sess.run(convolution.op)

        for _ in range(_ARGS_REPS):
            start = time.monotonic()
            sess.run(convolution.op)
            times.append(time.monotonic() - start)

    times_ms = 1000 * np.array(times)  # in seconds, convert to ms
    elapsed_ms = np.median(times_ms)

    # Source: https://github.com/tensorflow/tensorflow/blob/c19e29306ce1777456b2dbb3a14f511edf7883a8/tensorflow/python/profiler/internal/flops_registry.py#L381
    # Formula:
    #   batch_size * image_x_dim * image_y_dim * kernel_x_dim * kernel_y_dim
    #   * input_depth * output_depth * 2 / (image_x_stride * image_x_stride)
    ops = (batch * tensor_input_height * tensor_input_width * filter_height
           * filter_width * tensor_input_channels * filter_output_channels
           * 2) / (_ARGS_STRIDES * _ARGS_STRIDES)
    rate = ops / elapsed_ms / 10**6  # in GFLOPS. (/ milli / 10**6) == (/ 10 ** 9)
    print('conv took: \t%.4f ms,\t %.2f GFLOPS' % (elapsed_ms, rate), file=sys.stderr)
    return rate, elapsed_ms
def bench_v1(batch: int, tensor_input_width: int, tensor_input_channels: int,
             filter_width: int, filter_input_channels: int,
             filter_output_channels: int):
    """Time a 1D convolution built with the TensorFlow v1 graph API.

    The default graph is reset, the input comes from
    ``create_initial_tensor`` and the filter is an all-ones variable. After a
    single warmup run, ``_ARGS_REPS`` repetitions are timed, each executing
    the convolution op ``_ARGS_MINI_BATCH`` times.

    Returns:
        ``(rate, elapsed_ms)``: GFLOPS and the median duration of one
        repetition (a whole mini-batch) in milliseconds.
    """
    tf.reset_default_graph()

    with tf.device("/%s:0" % (_ARGS_DEVICE)):
        signal = create_initial_tensor(
            batch=batch,
            tensor_input_width=tensor_input_width,
            tensor_input_channels=tensor_input_channels,
        )
        kernel_shape = [filter_width, filter_input_channels, filter_output_channels]
        kernel = tf.Variable(tf.ones(kernel_shape), dtype=_ARGS_DTYPE)
        convolution = tf.nn.conv1d(
            signal,
            filters=kernel,
            stride=_ARGS_STRIDE,
            padding=_ARGS_PADDING,
            data_format=_ARGS_DATA_FORMAT,
        )

    durations = []
    with tf.Session(config=tf.ConfigProto()) as session:
        session.run(tf.global_variables_initializer())
        # One untimed run to warm things up.
        session.run(convolution.op)

        remaining = _ARGS_REPS
        while remaining > 0:
            began = time.monotonic()
            for _ in range(_ARGS_MINI_BATCH):
                session.run(convolution.op)
            durations.append(time.monotonic() - began)
            remaining -= 1

    # Seconds -> milliseconds, then take the median repetition.
    elapsed_ms = np.median(1000 * np.array(durations))

    # batch_size * x_dim * kernel_x_dim * input_depth * output_depth
    # (* mini-batch size) * 2, divided by the stride.
    weighted = (batch * tensor_input_width * filter_width
                * tensor_input_channels * filter_output_channels
                * _ARGS_MINI_BATCH * 2)
    ops = weighted / _ARGS_STRIDE
    # ops per millisecond / 10**6 == ops per second / 10**9, i.e. GFLOPS.
    rate = ops / elapsed_ms / 10**6

    print("conv took: \t%.4f ms,\t %.2f GFLOPS" % (elapsed_ms, rate), file=sys.stderr)
    return rate, elapsed_ms
def bench_v2(batch: int, tensor_input_width: int, tensor_input_channels: int,
             filter_width: int, filter_input_channels: int,
             filter_output_channels: int):
    """Time a 1D convolution called directly through the TensorFlow v2 API.

    The input comes from ``create_initial_tensor`` and the filter is an
    all-ones variable. ``tf.nn.conv1d`` is invoked ``_ARGS_REPS`` times and
    each call is timed individually.

    Returns:
        ``(rate, elapsed_ms)``: GFLOPS and the median duration of a single
        convolution call in milliseconds.
    """
    durations = []

    with tf.device("/%s:0" % (_ARGS_DEVICE)):
        signal = create_initial_tensor(
            batch=batch,
            tensor_input_width=tensor_input_width,
            tensor_input_channels=tensor_input_channels,
        )
        kernel_shape = [filter_width, filter_input_channels, filter_output_channels]
        kernel = tf.Variable(tf.ones(kernel_shape), dtype=_ARGS_DTYPE)

        remaining = _ARGS_REPS
        while remaining > 0:
            began = time.monotonic()
            tf.nn.conv1d(
                signal,
                filters=kernel,
                stride=_ARGS_STRIDE,
                padding=_ARGS_PADDING,
                data_format=_ARGS_DATA_FORMAT,
            )
            durations.append(time.monotonic() - began)
            remaining -= 1

    # Seconds -> milliseconds, then take the median call.
    elapsed_ms = np.median(1000 * np.array(durations))

    # batch_size * x_dim * kernel_x_dim * input_depth * output_depth * 2,
    # divided by the stride.
    weighted = (batch * tensor_input_width * filter_width
                * tensor_input_channels * filter_output_channels * 2)
    ops = weighted / (_ARGS_STRIDE)
    # ops per millisecond / 10**6 == ops per second / 10**9, i.e. GFLOPS.
    rate = ops / elapsed_ms / 10**6

    print("conv took: \t%.4f ms,\t %.2f GFLOPS" % (elapsed_ms, rate), file=sys.stderr)
    return rate, elapsed_ms
def main():
    """Measure how long ``import tensorflow`` takes and report it as JSON.

    The version and package path are logged to stderr; the result document
    (elapsed import time plus build information) is written to stdout.
    """
    import_began = time.monotonic()
    import tensorflow as tf
    import_finished = time.monotonic()

    version = tf.__version__
    print("# Version: %s, path: %s" % (version, tf.__path__), file=sys.stderr)

    measurement = {"elapsed": import_finished - import_began}
    report = {
        "component": "tensorflow",
        "name": "PiImport",
        "@parameters": {},
        "@result": measurement,
        "tensorflow_aicoe_buildinfo": _get_aicoe_tensorflow_build_info(tf),
        "tensorflow_upstream_buildinfo": _get_tensorflow_build_info(tf),
    }
    json.dump(report, sys.stdout, indent=2, sort_keys=True)