def _check_devices(self):
  """
  Checks the configured devices against what is actually available
  on this machine, and prints the available devices.
  """
  from TFUtil import print_available_devices, is_gpu_available
  print_available_devices()
  # Only a single device is supported with the TF backend so far.
  assert len(self.devices_config) == 1, "multiple devices not supported yet for TF"
  if self.is_requesting_for_gpu():
    # A GPU was requested explicitly, so one must be available.
    assert is_gpu_available(), "no GPU available"
  elif is_gpu_available():
    # CPU was configured although a GPU exists -- just inform the user.
    print("Note: There is a GPU available but you have set device=cpu.", file=log.v2)
def main():
  """
  Entry point of the LSTM benchmark script: parses command-line options,
  applies setting overrides, runs the benchmarks, and prints a summary.
  """
  global LstmCellTypes
  print("Benchmarking LSTMs.")
  better_exchook.install()
  print("Args:", " ".join(sys.argv))
  arg_parser = ArgumentParser()
  arg_parser.add_argument("cfg", nargs="*", help="opt=value, opt in %r" % sorted(base_settings.keys()))
  arg_parser.add_argument("--no-cpu", action="store_true")
  arg_parser.add_argument("--no-gpu", action="store_true")
  arg_parser.add_argument("--selected", help="comma-separated list from %r" % LstmCellTypes)
  arg_parser.add_argument("--no-setup-tf-thread-pools", action="store_true")
  args = arg_parser.parse_args()
  # Apply "opt=value" overrides, coercing each value to the type of its default.
  for override in args.cfg:
    name, raw_value = override.split("=", 1)
    assert name in base_settings
    base_settings[name] = type(base_settings[name])(raw_value)
  print("Settings:")
  pprint(base_settings)
  log.initialize(verbosity=[4])
  print("Returnn:", describe_crnn_version(), file=log.v3)
  print("TensorFlow:", describe_tensorflow_version(), file=log.v3)
  print("Python:", sys.version.replace("\n", ""), sys.platform)
  if args.no_setup_tf_thread_pools:
    print("Not setting up the TF thread pools. Will be done automatically by TF to number of CPU cores.")
  else:
    setup_tf_thread_pools(log_file=log.v2)
  if args.no_gpu:
    print("GPU will not be used.")
  else:
    print("GPU available: %r" % is_gpu_available())
  print_available_devices()
  if args.selected:
    LstmCellTypes = args.selected.split(",")
  benchmarks = {}
  if not args.no_gpu and is_gpu_available():
    for cell_name in LstmCellTypes:
      benchmarks["GPU:" + cell_name] = benchmark(lstm_unit=cell_name, use_gpu=True)
  if not args.no_cpu:
    for cell_name in LstmCellTypes:
      if cell_name in GpuOnlyCellTypes:
        continue  # cannot run on CPU
      benchmarks["CPU:" + cell_name] = benchmark(lstm_unit=cell_name, use_gpu=False)
  print("-" * 20)
  print("Settings:")
  pprint(base_settings)
  print("Final results:")
  # Report fastest first; ties fall back to the cell name.
  for duration, cell_name in sorted((t, name) for (name, t) in sorted(benchmarks.items())):
    print(" %s: %s" % (cell_name, hms_fraction(duration)))
  print("Done.")
def main():
  """
  Benchmark entry point: parse the command line, apply setting overrides,
  run the selected LSTM benchmarks on CPU and/or GPU, and print a summary.
  """
  global LstmCellTypes
  print("Benchmarking LSTMs.")
  better_exchook.install()
  print("Args:", " ".join(sys.argv))
  arg_parser = ArgumentParser()
  arg_parser.add_argument("cfg", nargs="*", help="opt=value, opt in %r" % sorted(base_settings.keys()))
  arg_parser.add_argument("--no-cpu", action="store_true")
  arg_parser.add_argument("--no-gpu", action="store_true")
  arg_parser.add_argument("--selected", help="comma-separated list from %r" % LstmCellTypes)
  arg_parser.add_argument("--no-setup-tf-thread-pools", action="store_true")
  args = arg_parser.parse_args()
  for setting in args.cfg:
    # Each positional arg is "opt=value"; cast the value to the type of the default.
    opt_name, opt_value = setting.split("=", 1)
    assert opt_name in base_settings
    base_settings[opt_name] = type(base_settings[opt_name])(opt_value)
  print("Settings:")
  pprint(base_settings)
  log.initialize(verbosity=[4])
  print("Returnn:", describe_returnn_version(), file=log.v3)
  print("TensorFlow:", describe_tensorflow_version(), file=log.v3)
  print("Python:", sys.version.replace("\n", ""), sys.platform)
  if not args.no_setup_tf_thread_pools:
    setup_tf_thread_pools(log_file=log.v2)
  else:
    print("Not setting up the TF thread pools. Will be done automatically by TF to number of CPU cores.")
  if args.no_gpu:
    print("GPU will not be used.")
  else:
    print("GPU available: %r" % is_gpu_available())
  print_available_devices()
  if args.selected:
    LstmCellTypes = args.selected.split(",")
  benchmarks = {}
  run_gpu = not args.no_gpu and is_gpu_available()
  if run_gpu:
    for unit in LstmCellTypes:
      benchmarks["GPU:" + unit] = benchmark(lstm_unit=unit, use_gpu=True)
  if not args.no_cpu:
    for unit in LstmCellTypes:
      if unit in GpuOnlyCellTypes:
        continue  # this cell type only works on GPU
      benchmarks["CPU:" + unit] = benchmark(lstm_unit=unit, use_gpu=False)
  print("-" * 20)
  print("Settings:")
  pprint(base_settings)
  print("Final results:")
  # Sort by runtime (fastest first), name as tie-breaker.
  results = sorted([(t, unit) for (unit, t) in sorted(benchmarks.items())])
  for t, unit in results:
    print(" %s: %s" % (unit, hms_fraction(t)))
  print("Done.")
def _get_devices_config(self):
  """
  Resolves the device configuration, defaulting to GPU when one is available
  and no device was configured explicitly.

  :rtype: list[dict[str]]
  """
  from Device import getDevicesInitArgs
  explicit_device = self.config.value("device", None)
  if not explicit_device:
    # Better default: Use GPU if available.
    from TFUtil import is_gpu_available
    if is_gpu_available():
      print("Device not set explicitly, and we found a GPU, which we will use.", file=log.v2)
      self.config.set("device", "gpu")
    else:
      print("Device not set explicitly, and no GPU found.", file=log.v2)
  return getDevicesInitArgs(self.config)
def test_GradOfLstmGenericBase_simple_nan():
  """
  Builds the native LSTM op and its gradient op, runs the gradient on
  demo arguments, and asserts that all gradient outputs are finite
  (i.e. no nan/inf).
  """
  print("test_GradOfLstmGenericBase_simple_nan()")
  print("GPU available:", is_gpu_available())
  print("Create LSTM op...")
  from TFNativeOp import make_lstm_op
  # verbose=True so that native-op compilation output shows up in the test log.
  op_func = make_lstm_op(compiler_opts=dict(verbose=True))
  print("op_func:", op_func)

  def dummy_call():
    # Minimal forward call (all sizes 1) just to instantiate/compile the op.
    n_time = 1
    n_batch = 1
    n_out = 1
    Z = tf.zeros((n_time, n_batch, n_out * 4))  # input gates; 4 parts per output unit
    V_h = tf.zeros((n_out, n_out * 4))  # recurrent weights
    c = tf.zeros((n_batch, n_out))  # initial cell state
    i = tf.ones((n_time, n_batch))  # index/mask (all frames active)
    return op_func(Z, V_h, c, i)

  dummy = dummy_call()
  with tf.Session() as session:
    print("dummy out:", session.run(list(dummy)))
    grad_op = _lstm_grad_op(session)
    args = _demo_lstm_grad_args()
    # One placeholder per demo argument, matching its dtype.
    placeholders = [tf.placeholder(v.dtype) for v in args]
    lstm_grad_t = list(grad_op(*placeholders))
    for kwargs in [{}]:  # [{"factor": 0}, {"ones_like": True}, {"ones_like": True, "factor": -1}, {}]:
      print("Testing lstm grad args %r." % kwargs)
      args = _demo_lstm_grad_args(**kwargs)
      outs = session.run(lstm_grad_t, feed_dict=dict(zip(placeholders, args)))
      for out, descr, i in zip(outs, ["z", "out_v_h", "out_c", "dummy_out"], range(4)):
        assert isinstance(out, numpy.ndarray)
        print("(%i) %s:" % (i, descr))
        print(out)
      # The actual check: no nan/inf anywhere in the gradient outputs.
      for out in outs:
        assert numpy.all(numpy.isfinite(out))
      print("Seems ok.")
    print("All ok!")
print(expected) fsa = Fsa.fast_bw_fsa_staircase(seq_lens=[num_classes], with_loop=with_loop) with TFCompat.v1.Session().as_default(): res = tf_baum_welch(fsa, num_classes=num_classes, out_seq_len=out_seq_len) print("baum-welch:") print(res) is_close = numpy.isclose(expected, res).all() print("close:", is_close) assert is_close # Note: we could replace tf_baum_welch by some CPU/Python code... @unittest.skipIf(not is_gpu_available(), "no gpu on this system; needed for tf_baum_welch") def test_fast_bw_fsa_staircase(): check_fast_bw_fsa_staircase(2, 2, with_loop=False) check_fast_bw_fsa_staircase(2, 2, with_loop=True) check_fast_bw_fsa_staircase(3, 2, with_loop=False) check_fast_bw_fsa_staircase(3, 2, with_loop=True) check_fast_bw_fsa_staircase(3, 3, with_loop=False) check_fast_bw_fsa_staircase(3, 3, with_loop=True) if __name__ == "__main__": import better_exchook better_exchook.install() if len(sys.argv) <= 1: for k, v in sorted(globals().items()):
config = Config() config.update({ "num_outputs": 3, "num_inputs": 4, "network": { "output": {"class": "rec", "target": "classes", "unit": { "prob": {"class": "softmax", "from": ["prev:output"], "loss": "ce", "target": "classes"}, "output": {"class": "choice", "beam_size": 4, "from": ["prob"], "target": "classes", "initial_output": 0} }}, } }) network = TFNetwork(config=config, train_flag=True) network.construct_from_dict(config.typed_dict["network"]) @unittest.skipIf(not is_gpu_available(), "no gpu on this system") def test_RecLayer_get_cudnn_params_size(): from tensorflow.contrib.cudnn_rnn.ops.gen_cudnn_rnn_ops import cudnn_rnn_params_size def check(num_units, input_size, rnn_mode="lstm", num_layers=1, direction="unidirectional", input_mode="linear_input", T=tf.float32, S=tf.int32): common_kwargs = dict( rnn_mode=rnn_mode, num_units=num_units, input_size=input_size, num_layers=num_layers, direction=direction, input_mode=input_mode) cu_size = cudnn_rnn_params_size(T=T, S=S, **common_kwargs)[0] my_size = RecLayer._get_cudnn_param_size(**common_kwargs) assert_equal(cu_size.eval(), my_size) with tf.Session(): check(rnn_mode="lstm", num_units=5, input_size=3)
for t in range(n_time): ta = ta.write(index=t, value=x[t]) y = ta.stack() y.set_shape(tf.TensorShape((n_time, n_dim))) # y = y[::1] -- if you add this, the test passes dx, = tf.gradients(ys=[y], grad_ys=[dy], xs=[x]) vx, vdy, vy, vdx = session.run([x, dy, y, dx]) print("x:", vx) print("y:", vy) print("dy:", vdy) print("dx:", vdx) assert_allclose(vx, vy) assert_allclose(vdy, vdx) @unittest.skipIf(not is_gpu_available(), "no gpu on this system") def test_FastBaumWelch(): print("Make op...") op = make_fast_baum_welch_op(compiler_opts=dict(verbose=True)) # will be cached, used inside :func:`fast_baum_welch` print("Op:", op) n_batch = 3 seq_len = 5 n_classes = 10 from Fsa import FastBwFsaShared fsa = FastBwFsaShared() fsa.add_inf_loop(state_idx=0, num_emission_labels=n_classes) fast_bw_fsa = fsa.get_fast_bw_fsa(n_batch=n_batch) edges = tf.constant(fast_bw_fsa.edges, dtype=tf.int32) weights = tf.constant(fast_bw_fsa.weights, dtype=tf.float32) start_end_states = tf.constant(fast_bw_fsa.start_end_states, dtype=tf.int32) am_scores = tf.constant(numpy.random.normal(size=(seq_len, n_batch, n_classes)), dtype=tf.float32) # in -log space
print("check_fast_bw_fsa_staircase(%i, %i, with_loop=%r)" % (num_classes, out_seq_len, with_loop)) expected = slow_full_sum_staircase_uniform(num_classes=num_classes, out_seq_len=out_seq_len, with_loop=with_loop) print("expected full sum:") print(expected) fsa = Fsa.fast_bw_fsa_staircase(seq_lens=[num_classes], with_loop=with_loop) with tf.Session().as_default(): res = tf_baum_welch(fsa, num_classes=num_classes, out_seq_len=out_seq_len) print("baum-welch:") print(res) is_close = numpy.isclose(expected, res).all() print("close:", is_close) assert is_close # Note: we could replace tf_baum_welch by some CPU/Python code... @unittest.skipIf(not is_gpu_available(), "no gpu on this system; needed for tf_baum_welch") def test_fast_bw_fsa_staircase(): check_fast_bw_fsa_staircase(2, 2, with_loop=False) check_fast_bw_fsa_staircase(2, 2, with_loop=True) check_fast_bw_fsa_staircase(3, 2, with_loop=False) check_fast_bw_fsa_staircase(3, 2, with_loop=True) check_fast_bw_fsa_staircase(3, 3, with_loop=False) check_fast_bw_fsa_staircase(3, 3, with_loop=True) if __name__ == "__main__": import better_exchook better_exchook.install() if len(sys.argv) <= 1: for k, v in sorted(globals().items()): if k.startswith("test_"):
def test_RecLayer_NativeLstm_Nan():
  """
  Trains a small NativeLSTM network with Adam on random data and asserts
  that the loss stays finite. If a non-finite value shows up (in the loss or
  in the LSTM gradient), extensive debug information is dumped before failing:
  recent summaries/weights, the Adam update, and the raw inputs/outputs of the
  native LSTM gradient op.
  """
  print("test_RecLayer_NativeLstm_Nan()")
  print("GPU available:", is_gpu_available())
  numpy.set_printoptions(precision=15)
  num_inputs = 4
  num_outputs = 3

  config = Config()
  config.update({
    "num_inputs": num_inputs,
    "num_outputs": {"data": [num_inputs, 2], "classes": [num_outputs, 2]},  # dense output
    "network": {
      "output": {"class": "rec", "unit": "NativeLSTM", "loss": "mse"}},
    "adam": True,
    "debug_grad_summaries": True,
    "debug_save_updater_vars": True,
    "debug_add_check_numerics_ops": True,
  })

  print("Reset default graph...")
  tf.reset_default_graph()
  print("Create network...")
  network = TFNetwork(config=config, train_flag=True)
  network.construct_from_dict(config.typed_dict["network"])

  # Depending on the seed, I get nan earlier, later, or not at all.
  # limit=5.0: seed=3 -> nan in step 4094. seed=1 -> nan in step 2463.
  random = numpy.random.RandomState(seed=1)
  limit = 10.0  # The higher, the more likely you get nan.

  def make_feed_dict(seq_len=10):
    # Random dense inputs/targets in [-limit, limit], batch size 1.
    return {
      network.extern_data.data["data"].placeholder: random.uniform(-limit, limit, (1, seq_len, num_inputs)),
      network.extern_data.data["data"].size_placeholder[0]: numpy.array([seq_len]),
      network.extern_data.data["classes"].placeholder: random.uniform(-limit, limit, (1, seq_len, num_outputs)),
      network.extern_data.data["classes"].size_placeholder[0]: numpy.array([seq_len])}

  print("Creating session...")
  with tf.Session() as session:
    print("Init params...")
    network.initialize_params(session=session)
    print("Test run...")
    output_data1 = session.run(
      network.get_default_output_layer().output.placeholder,
      feed_dict=make_feed_dict(5))
    assert_equal(output_data1.shape, (5, 1, num_outputs))  # (time, batch, dim)

    layer = network.layers["output"]
    loss_t = network.get_total_loss() * layer.get_loss_normalization_factor()
    weights_t = layer.params["W"]
    weights_grad_t, = tf.gradients(network.get_objective(), weights_t)

    def find_op_by_type(type_name):
      # Returns the first op of the given type in the graph, or None.
      for op in session.graph.get_operations():
        assert isinstance(op, tf.Operation)
        if op.type == type_name:
          return op

    lstm_grad_op = find_op_by_type("GradOfLstmGenericBase")
    assert lstm_grad_op is not None
    lstm_grad_ins_t = list(lstm_grad_op.inputs)
    lstm_grad_outs_t = list(lstm_grad_op.outputs)
    # Separate callable gradient op, used below to replay the grad computation
    # on captured inputs when debugging a failure.
    lstm_grad_func = _lstm_grad_op(session=session)
    demo_grad_t = lstm_grad_func(*_demo_lstm_grad_args())
    demo_grad2_input_placeholders = [tf.placeholder(v.dtype) for v in lstm_grad_ins_t]
    demo_grad2_t = lstm_grad_func(*demo_grad2_input_placeholders)[1]

    print("Create updater...")
    from TFUpdater import Updater
    updater = Updater(config=config, network=network, tf_session=session)
    updater.set_trainable_vars(network.get_trainable_params())
    updater.set_learning_rate(0.1)
    optim_op = updater.get_optim_op()
    assert isinstance(updater.optimizer, tf.train.AdamOptimizer)
    adam_weights_m_t = updater.optimizer.get_slot(var=weights_t, name="m")
    adam_weights_v_t = updater.optimizer.get_slot(var=weights_t, name="v")
    assert isinstance(adam_weights_m_t, tf.Variable)
    assert isinstance(adam_weights_v_t, tf.Variable)
    summaries_t = tf.summary.merge_all()

    # Mirrors the Adam update term as in TF's own kernel (see link):
    # https://github.com/tensorflow/tensorflow/blob/03beb65cecbc1e49ea477bca7f54543134b31d53/tensorflow/core/kernels/training_ops_gpu.cu.cc
    adam_update_t = adam_weights_m_t / (tf.sqrt(adam_weights_v_t) + 1e-8)

    import tempfile
    tmp_tf_logdir = tempfile.mkdtemp("tmp-tf-log")
    print("Write TF logs to:", tmp_tf_logdir)
    writer = tf.summary.FileWriter(tmp_tf_logdir)
    writer.add_graph(session.graph)

    print("Training...")
    recent_info = []  # type: list[dict[str]]
    for i in range(10000):
      feed_dict = make_feed_dict(5)
      # Capture the LSTM grad inputs/outputs for this step, so we can dump
      # them if the subsequent train step produces non-finite values.
      weights_grad, lstm_grad_ins, lstm_grad_outs = session.run(
        [weights_grad_t, lstm_grad_ins_t, lstm_grad_outs_t],
        feed_dict=feed_dict)
      try:
        if not numpy.all(numpy.isfinite(weights_grad)):
          raise Exception("weights_grad has inf or nan.")
        loss, _opt, summaries, weights, adam_update = session.run(
          [loss_t, optim_op, summaries_t, weights_t, adam_update_t],
          feed_dict=feed_dict)
      except Exception as exc:
        # Failure path: dump everything we know about the state around the bad step.
        print("Exception in step %i." % i)
        print(exc)
        print("Most recent summaries:")
        summary_proto = tf.Summary()
        summary_proto.ParseFromString(recent_info[-1]["summaries"])
        for val in summary_proto.value:
          # Assuming all summaries are scalars.
          print(" %s: %r" % (val.tag, val.simple_value))
        print("Most recent weights:")
        print(recent_info[-1]["weights"])
        print("Current weights:")
        print(session.run(weights_t))
        print("Most recent Adam update:")
        print(recent_info[-1]["adam_update"])
        print("Current Adam update:")
        print(session.run(adam_update_t))
        print("Used weights grad:")
        print(weights_grad)
        print("GradOfLstmGenericBase inputs:")
        for t, v in zip(lstm_grad_ins_t, lstm_grad_ins):
          print("%r:" % t)
          print(repr(v))
        print("GradOfLstmGenericBase outputs:")
        for t, v in zip(lstm_grad_outs_t, lstm_grad_outs):
          print("%r:" % t)
          print(repr(v))
        print("Demo grad:")
        print(session.run(demo_grad_t))
        # Replay the grad op on the captured inputs directly...
        print("Demo grad2:")
        print(session.run(
          demo_grad2_t,
          feed_dict={k: v for (k, v) in zip(demo_grad2_input_placeholders, lstm_grad_ins)}))
        # ...and on a repr/eval round-trip of them (checks print/parse fidelity).
        print("Demo grad2 via eval:")
        print(session.run(
          demo_grad2_t,
          feed_dict={k: eval(repr(v), vars(numpy)) for (k, v) in zip(demo_grad2_input_placeholders, lstm_grad_ins)}))
        print("Demo grad2 via args:")
        print(session.run(
          demo_grad2_t,
          feed_dict={k: v for (k, v) in zip(demo_grad2_input_placeholders, _demo_lstm_grad_args())}))
        raise Exception("Exception in step %i." % i)
      writer.add_summary(summaries, global_step=i)
      # Keep a bounded history (~1000 steps) for the failure dump above.
      if len(recent_info) > 1000:
        recent_info.pop(0)
      recent_info.append({
        "step": i, "loss": loss, "summaries": summaries,
        "weights": weights, "adam_update": adam_update})
      if not numpy.isfinite(loss) or i % 100 == 0:
        print("step %i, loss: %r" % (i, loss))
      assert numpy.isfinite(loss)
    print("Done.")
    import shutil
    shutil.rmtree(tmp_tf_logdir)