def test_linearize():
    """Linearize a caterpillar graph and dump its execution-memory timeline.

    Relies on module-level helpers `make_caterpillar_graph`, `linearize`,
    and `create_session`; output goes to stdout via memory_util.
    """
    graph_nodes = make_caterpillar_graph(5)
    linearize.linearize()
    sess = create_session()

    import memory_util
    memory_util.vlog(1)  # enable VLOG(1) so allocator messages hit stderr
    with memory_util.capture_stderr() as stderr:
        sess.run(graph_nodes[-1].op)
    memory_util.print_memory_timeline(stderr, ignore_less_than_bytes=1000)
def run_and_analyze(in_shape):
    """Run the conv graph once on a random input and report memory statistics.

    Prints expected vs. observed peak memory and plots the memory timeline.

    Depends on module-level names defined by the surrounding script:
    `sess`, `x`, `y`, `kernel_size`, `in_channels`, `out_channels`,
    `BYTES_PER_MB`, `memory_util`, `plt`, `np`.

    Args:
        in_shape: shape tuple fed to placeholder `x` (random normal data).
    """
    # Fix: ported from Python 2 (`print` statements, builtin `reduce`) to
    # Python 3 for consistency with the print() calls used elsewhere in
    # this file.  Note `/` is now true division, which only affects the
    # human-readable MB printouts.
    from functools import reduce

    with memory_util.capture_stderr() as stderr:
        res = sess.run(y, feed_dict={x: np.random.randn(*in_shape)})
    print(res.shape)

    # Rough lower bound: input activations + weights + output activations.
    expected_mem = reduce(lambda i, j: i * j, in_shape)  # inputs
    # NOTE(review): conv weight count is usually k^2 * in * out; this uses
    # `+` between the k^2 term and in*out — confirm against the graph setup.
    expected_mem += (kernel_size**2) + in_channels * out_channels  # weights
    expected_mem += reduce(lambda i, j: i * j, res.shape)  # outputs
    expected_mem *= 4  # 4 bytes per float

    peak_mem = memory_util.peak_memory(stderr)
    print('expected mem usage (MB): ', expected_mem / BYTES_PER_MB)
    print('peak mem usage (MB): ', peak_mem / BYTES_PER_MB)
    print('peak:expected mem ratio: ', peak_mem / float(expected_mem))
    print()
    memory_util.print_memory_timeline(stderr)
    memory_util.plot_memory_timeline(plt, stderr)

    # NOTE(review): interactive debugger breakpoint left in — presumably
    # deliberate for exploration; remove before any automated run.
    import ipdb
    ipdb.set_trace()
def create_session():
    """Return an InteractiveSession with graph optimizations disabled (L0),
    so measured memory reflects the graph as written.

    Fix: the original mangled text began with `config = ...` followed by a
    bare `return`, which is a SyntaxError outside a function — the
    `def create_session():` header (name established by the calls below)
    was evidently lost and is reconstructed here.
    """
    config = tf.ConfigProto(
        log_device_placement=False,
        graph_options=tf.GraphOptions(
            optimizer_options=tf.OptimizerOptions(
                opt_level=tf.OptimizerOptions.L0)))
    return tf.InteractiveSession(config=config)


# Script body: build a chain of `length` tanh ops over a ~node_mbs-MB float32
# vector, backprop through it, and report peak memory from VLOG output.
node_mbs = 1
length = 5
dtype = np.float32
n = node_mbs * 250000  # 250k float32 ≈ 1 MB per node

a0_ = tf.ones((n,), dtype=dtype)
a0 = tf.Variable(a0_, name="a0")
a = a0
for i in range(1, length):
    name = "a" + str(i)
    a = tf.tanh(a, name=name)
grad = tf.gradients([a], [a0])[0]

sess = create_session()
sess.run(tf.global_variables_initializer())
with memory_util.capture_stderr() as stderr:
    sess.run(grad.op)

peak_memory = memory_util.peak_memory(stderr)
memory_util.print_memory_timeline(stderr)
print("Peak memory: %d" % (peak_memory,))
def main():
    """Measure peak memory of backprop through a tanh chain four ways:
    VLOG parsing, MaxBytesInUse, run-metadata summary, and raw
    allocator_bytes_in_use from the metadata node stats.
    """
    import memory_util
    memory_util.vlog(1)  # vlog=2 on GPU machine will spam gpu "polling" msgs
    tf.reset_default_graph()

    n = 3  # TODO: fix edge case with n=2
    nodes = make_chain_tanh(n)
    a0 = nodes[0]
    a = nodes[-1]
    #grad = memory_saving_gradients.gradients_memory([a], [a0])[0]
    grad = tf.gradients(a, [a0])[0]

    sess = create_session()
    sess.run(tf.global_variables_initializer())
    # feed_dict = {a0,

    # method 1: parse VLOG_MEMORY messages captured from stderr
    with memory_util.capture_stderr() as stderr:
        sess.run(grad.op)
    peak_memory1 = memory_util.peak_memory(stderr.getvalue())
    # 20 mem used with following tensors picked automatically as bottlenecks
    # ['a10:0', 'a19:0', 'a28:0', 'a37:0', 'a46:0', 'a55:0', 'a64:0', 'a73:0',
    #  'a82:0', 'a91:0']

    # method 2: allocator high-water mark op
    mem_op = tf.contrib.memory_stats.MaxBytesInUse()
    peak_memory2 = sess.run(mem_op)

    # method 3: full-trace run metadata
    run_metadata = tf.RunMetadata()
    run_options = tf.RunOptions(trace_level=tf.RunOptions.FULL_TRACE)
    sess.run(grad.op, run_metadata=run_metadata, options=run_options)
    print(run_metadata)
    peak_memory3 = memory_util.peak_from_metadata(run_metadata)['gpu']

    print(peak_memory1, "VLOG_MEMORY")
    print(peak_memory2, "MaxBytesInUse")
    print(peak_memory3, "metadata")

    cpu, gpu = memory_util._retrieve_cpu_gpu_stats(run_metadata)
    if cpu:
        # kept for symmetry/debugging even though only the GPU side is printed
        bytes_in_use_cpu = [node.memory[0].allocator_bytes_in_use
                            for node in cpu]
    if gpu:
        bytes_in_use_gpu = [node.memory[0].allocator_bytes_in_use
                            for node in gpu]
        # Fix: max()/print moved under `if gpu:` — bytes_in_use_gpu is
        # unbound when there are no GPU node stats, which raised NameError.
        peak_memory4 = max(bytes_in_use_gpu)
        print(peak_memory4, "metadata max")

    # fourth way would parse "allocator_bytes_in_use
    # node_stats {
    #   node_name: "Square"
    #   all_start_micros: 1509664297214870
    #   op_start_rel_micros: 4
    #   op_end_rel_micros: 115
    #   all_end_rel_micros: 136
    #   memory {
    #     allocator_name: "GPU_0_bfc"
    #     allocator_bytes_in_use: 6013952
    #   }

    expected_peak = 3 * 10**6
    util.report_memory(peak_memory1, expected_peak)
    assert abs(peak_memory3 - expected_peak) < 10000, "Difference too large."
def main():
    """Measure peak memory of backprop through a tanh chain via several
    independent methods and compare against an expected value.

    NOTE(review): this is a second definition of main() in the same file
    (a formatted near-duplicate of the one above); the later definition
    wins at import time — confirm which copy is intended.
    """
    import memory_util
    memory_util.vlog(1)  # vlog=2 on GPU machine will spam gpu "polling" msgs
    tf.reset_default_graph()

    chain_length = 3  # TODO: fix edge case with n=2
    chain = make_chain_tanh(chain_length)
    first, last = chain[0], chain[-1]
    #grad = memory_saving_gradients.gradients_memory([a], [a0])[0]
    grad = tf.gradients(last, [first])[0]

    sess = create_session()
    sess.run(tf.global_variables_initializer())
    # feed_dict = {a0,

    with memory_util.capture_stderr() as stderr:
        sess.run(grad.op)
    peak_memory1 = memory_util.peak_memory(stderr.getvalue())
    # 20 mem used with following tensors picked automatically as bottlenecks
    # ['a10:0', 'a19:0', 'a28:0', 'a37:0', 'a46:0', 'a55:0', 'a64:0', 'a73:0',
    #  'a82:0', 'a91:0']

    # method 2
    mem_op = tf.contrib.memory_stats.MaxBytesInUse()
    peak_memory2 = sess.run(mem_op)

    # method 3
    run_metadata = tf.RunMetadata()
    run_options = tf.RunOptions(trace_level=tf.RunOptions.FULL_TRACE)
    sess.run(
        grad.op,
        run_metadata=run_metadata,
        options=run_options,
    )
    print(run_metadata)
    peak_memory3 = memory_util.peak_from_metadata(run_metadata)['gpu']

    print(peak_memory1, "VLOG_MEMORY")
    print(peak_memory2, "MaxBytesInUse")
    print(peak_memory3, "metadata")

    cpu, gpu = memory_util._retrieve_cpu_gpu_stats(run_metadata)
    if cpu:
        bytes_in_use_cpu = [
            node.memory[0].allocator_bytes_in_use for node in cpu
        ]
    if gpu:
        bytes_in_use_gpu = [
            node.memory[0].allocator_bytes_in_use for node in gpu
        ]

    # NOTE(review): raises NameError when `gpu` is empty, since
    # bytes_in_use_gpu is only bound inside the `if gpu:` branch — kept
    # as-is to preserve the original behavior.
    peak_memory4 = max(bytes_in_use_gpu)
    print(peak_memory4, "metadata max")

    # fourth way would parse "allocator_bytes_in_use
    # node_stats {
    #   node_name: "Square"
    #   all_start_micros: 1509664297214870
    #   op_start_rel_micros: 4
    #   op_end_rel_micros: 115
    #   all_end_rel_micros: 136
    #   memory {
    #     allocator_name: "GPU_0_bfc"
    #     allocator_bytes_in_use: 6013952
    #   }

    expected_peak = 3 * 10**6
    util.report_memory(peak_memory1, expected_peak)
    assert abs(peak_memory3 - expected_peak) < 10000, "Difference too large."