def _layer_params(self, info, sources, mask, reverse=False):
  """
  :param dict[str] info: self.hidden_info[i]
  :param list[str] sources: 'from' entry
  :param None|str mask: mask
  :param bool reverse: reverse or not
  :rtype: dict[str]
  """
  from returnn.util.basic import BackendEngine, getargspec
  if BackendEngine.is_theano_selected():
    from returnn.theano.layers.basic import get_layer_class
  elif BackendEngine.is_tensorflow_selected():
    from returnn.tf.layers.basic import get_layer_class
  else:
    raise NotImplementedError
  params = dict(self.default_layer_info)
  params.update(info)
  params["from"] = sources
  if mask:
    params["mask"] = mask
  layer_class = get_layer_class(params["layer_class"])
  if layer_class.recurrent:
    params['truncation'] = self.truncation
    if self.bidirectional:
      if not reverse:
        params['name'] += "_fw"
      else:
        params['name'] += "_bw"
        params['reverse'] = True
    if 'sharpgates' in getargspec(layer_class.__init__).args[1:]:
      params['sharpgates'] = self.sharpgates
  return params
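
# Illustrative sketch (not from the original module): what _layer_params()
# yields for a recurrent layer when self.bidirectional is set. The layer name
# "lstm1" and the dims are hypothetical example values.
#
#   info = {"name": "lstm1", "layer_class": "lstm", "n_out": 512}
#   fw = self._layer_params(info, sources=["data"], mask=None)
#   bw = self._layer_params(info, sources=["data"], mask=None, reverse=True)
#   # fw["name"] == "lstm1_fw"
#   # bw["name"] == "lstm1_bw" and bw["reverse"] is True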
def init_backend_engine():
  """
  Initializes ``engine``, which is either :class:`TFEngine.Engine` or Theano :class:`Engine.Engine`.
  """
  BackendEngine.select_engine(config=config)
  if BackendEngine.is_theano_selected():
    print("Theano:", describe_theano_version(), file=log.v3)
    import returnn.theano.util
    returnn.theano.util.monkey_patches()
  elif BackendEngine.is_tensorflow_selected():
    print("TensorFlow:", describe_tensorflow_version(), file=log.v3)
    if get_tensorflow_version_tuple()[0] == 0:
      print("Warning: TF <1.0 is not supported and likely broken.", file=log.v2)
    if os.environ.get("TF_DEVICE"):
      print("Devices: Use %s via TF_DEVICE instead of %s." % (
        os.environ.get("TF_DEVICE"), config.opt_typed_value("device")), file=log.v4)
      config.set("device", os.environ.get("TF_DEVICE"))
    if config.is_true("use_horovod"):
      import returnn.tf.horovod
      hvd = returnn.tf.horovod.get_ctx(config=config)
      import socket
      if "gpu" in config.value("device", "") or os.environ.get("CUDA_VISIBLE_DEVICES", ""):
        # We assume that we want to use a GPU.
        gpu_opts = config.typed_dict.setdefault("tf_session_opts", {}).setdefault("gpu_options", {})
        assert "visible_device_list" not in gpu_opts
        gpu_opts["visible_device_list"] = str(hvd.local_rank())
        print("Horovod: Hostname %s, pid %i, using GPU %s." % (
          socket.gethostname(), os.getpid(), gpu_opts["visible_device_list"]), file=log.v3)
      else:
        if hvd.rank() == 0:  # Don't spam in all ranks.
          print("Horovod: Not using GPU.", file=log.v3)
      if hvd.rank() == 0:  # Don't spam in all ranks.
        print("Horovod: Reduce type:", hvd.get_reduce_type(), file=log.v3)
    from returnn.tf.util.basic import debug_register_better_repr, setup_tf_thread_pools, print_available_devices
    tf_session_opts = config.typed_value("tf_session_opts", {})
    assert isinstance(tf_session_opts, dict)
    # This must be done after the Horovod logic, such that we only touch the devices we are supposed to touch.
    setup_tf_thread_pools(log_file=log.v3, tf_session_opts=tf_session_opts)
    # Print available devices. Also make sure that get_tf_list_local_devices uses the correct TF session opts.
    print_available_devices(tf_session_opts=tf_session_opts, file=log.v2)
    from returnn.tf.native_op import OpMaker
    OpMaker.log_stream = log.v3
    debug_register_better_repr()
    if config.is_true("distributed_tf"):
      import returnn.tf.distributed
      returnn.tf.distributed.init_distributed_tf(config)
  else:
    raise NotImplementedError
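
# Illustrative config sketch (example values, not from the original source):
# the kind of config / environment entries init_backend_engine() reads.
#
#   use_tensorflow = True
#   device = "gpu"
#   tf_session_opts = {"gpu_options": {"allow_growth": True}}
#   use_horovod = False    # if True, one GPU per rank is picked via hvd.local_rank()
#   distributed_tf = False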
def init_engine(devices):
  """
  Initializes the global ``engine``.

  :type devices: list[Device.Device]|None
  """
  global engine
  if BackendEngine.is_theano_selected():
    from returnn.theano.engine import Engine
    engine = Engine(devices)
  elif BackendEngine.is_tensorflow_selected():
    from returnn.tf.engine import Engine
    engine = Engine(config=config)
  else:
    raise NotImplementedError
def finalize(error_occurred=False):
  """
  Cleanup at the end.

  :param bool error_occurred:
  """
  print("Quitting", file=getattr(log, "v4", sys.stderr))
  global quit_returnn
  quit_returnn = True
  sys.exited = True
  if engine:
    if BackendEngine.is_theano_selected():
      for device in engine.devices:
        device.terminate()
    elif BackendEngine.is_tensorflow_selected():
      engine.finalize(error_occurred=error_occurred)
def get_existing_models(cls, config):
  """
  :param Config.Config config:
  :return: dict epoch -> model filename
  :rtype: dict[int,str]
  """
  model_filename = config.value('model', '')
  if not model_filename:
    return {}
  # Automatically search the filesystem for existing models.
  file_list = {}
  for epoch in range(1, cls.config_get_final_epoch(config) + 1):
    for is_pretrain in [False, True]:
      fn = cls.epoch_model_filename(model_filename, epoch, is_pretrain)
      if os.path.exists(fn):
        file_list[epoch] = fn
        break
      if BackendEngine.is_tensorflow_selected():
        if os.path.exists(fn + ".index"):
          file_list[epoch] = fn
          break
  return file_list
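
# Hedged usage sketch (config setup is hypothetical): picking the most recent
# existing checkpoint to resume training from.
#
#   existing = cls.get_existing_models(config)
#   if existing:
#     last_epoch = max(existing)
#     print("Resume from epoch %i: %s" % (last_epoch, existing[last_epoch]))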
def num_inputs_outputs_from_config(cls, config):
  """
  :type config: Config.Config
  :returns: (num_inputs, num_outputs),
    where num_inputs is like num_outputs["data"][0],
    and num_outputs is a dict of data_key -> (dim, ndim),
    where data_key is e.g. "classes" or "data",
    dim is the feature dimension or the number of classes,
    and ndim is the ndim counted without batch-dim,
    i.e. ndim=1 means usually sparse data and ndim=2 means dense data.
  :rtype: (int,dict[str,(int,int)])
  """
  from returnn.util.basic import BackendEngine
  num_inputs = config.int('num_inputs', 0)
  target = config.value('target', 'classes')
  if config.is_typed('num_outputs'):
    num_outputs = config.typed_value('num_outputs')
    if not isinstance(num_outputs, dict):
      num_outputs = {target: num_outputs}
    num_outputs = num_outputs.copy()
    from returnn.datasets.basic import convert_data_dims
    num_outputs = convert_data_dims(num_outputs, leave_dict_as_is=BackendEngine.is_tensorflow_selected())
    if "data" in num_outputs:
      num_inputs = num_outputs["data"]
      if isinstance(num_inputs, (list, tuple)):
        num_inputs = num_inputs[0]
      elif isinstance(num_inputs, dict):
        if "dim" in num_inputs:
          num_inputs = num_inputs["dim"]
        else:
          num_inputs = num_inputs["shape"][-1]
      else:
        raise TypeError("data key %r" % num_inputs)
  elif config.has('num_outputs'):
    num_outputs = {target: [config.int('num_outputs', 0), 1]}
  else:
    num_outputs = None

  dataset = None
  if config.list('train') and ":" not in config.value('train', ''):
    dataset = config.list('train')[0]
  if not config.is_typed('num_outputs') and dataset:
    # noinspection PyBroadException
    try:
      _num_inputs = hdf5_dimension(dataset, 'inputCodeSize') * config.int('window', 1)
    except Exception:
      _num_inputs = hdf5_dimension(dataset, 'inputPattSize') * config.int('window', 1)
    # noinspection PyBroadException
    try:
      _num_outputs = {target: [hdf5_dimension(dataset, 'numLabels'), 1]}
    except Exception:
      _num_outputs = hdf5_group(dataset, 'targets/size')
      for k in _num_outputs:
        _num_outputs[k] = [_num_outputs[k], len(hdf5_shape(dataset, 'targets/data/' + k))]
    if num_inputs:
      assert num_inputs == _num_inputs
    if num_outputs:
      assert num_outputs == _num_outputs
    num_inputs = _num_inputs
    num_outputs = _num_outputs

  if not num_inputs and not num_outputs and config.has("load") and BackendEngine.is_theano_selected():
    from returnn.theano.network import LayerNetwork
    import h5py
    model = h5py.File(config.value("load", ""), "r")
    # noinspection PyProtectedMember
    num_inputs, num_outputs = LayerNetwork._n_in_out_from_hdf_model(model)

  assert num_inputs and num_outputs, "provide num_inputs/num_outputs directly or via train"
  return num_inputs, num_outputs
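
# Illustrative sketch (example dims, not from the original source): config
# forms of `num_outputs` that this function normalizes, with the default
# target "classes":
#
#   num_outputs = {"data": [80, 2],       # dense input: 80-dim features, ndim=2
#                  "classes": [5000, 1]}  # sparse targets: 5000 classes, ndim=1
#
# A non-dict value such as `num_outputs = [5000, 1]` is wrapped as
# {"classes": [5000, 1]}. In the dict form, the "data" entry also
# determines num_inputs (here 80).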
def _forward(segment_name, features):
  """
  :param str segment_name:
  :param numpy.ndarray features: format (input-feature,time) (via Sprint)
  :return: format (output-dim,time)
  :rtype: numpy.ndarray
  """
  print("Sprint forward", segment_name, features.shape)
  start_time = time.time()
  assert engine is not None, "not initialized"
  assert sprintDataset
  # Features are in Sprint format (feature,time).
  num_time = features.shape[1]
  assert features.shape == (InputDim, num_time)
  dataset, seq_idx = features_to_dataset(features=features, segment_name=segment_name)
  if BackendEngine.is_theano_selected():
    # Prepare data for device.
    device = engine.devices[0]
    from returnn.theano.engine_util import assign_dev_data_single_seq
    success = assign_dev_data_single_seq(device, dataset=dataset, seq=seq_idx)
    assert success, "failed to allocate & assign data for seq %i, %s" % (seq_idx, segment_name)
    # Do the actual forwarding and collect result.
    device.run("extract")
    result, _ = device.result()
    assert result is not None, "Device crashed."
    assert len(result) == 1
    posteriors = result[0]
  elif BackendEngine.is_tensorflow_selected():
    posteriors = engine.forward_single(dataset=dataset, seq_idx=seq_idx)
  else:
    raise NotImplementedError("unknown backend engine")
  # If we have a sequence training criterion, posteriors might be in format (time,seq|batch,emission).
  if posteriors.ndim == 3:
    assert posteriors.shape == (num_time, 1, OutputDim * MaxSegmentLength)
    posteriors = posteriors[:, 0]
  # Posteriors are in format (time,emission).
  assert posteriors.shape == (num_time, OutputDim * MaxSegmentLength)
  # Reformat to Sprint expected format (emission,time).
  posteriors = posteriors.transpose()
  assert posteriors.shape == (OutputDim * MaxSegmentLength, num_time)
  stats = (numpy.min(posteriors), numpy.max(posteriors), numpy.mean(posteriors), numpy.std(posteriors))
  print("posteriors min/max/mean/std:", stats, "time:", time.time() - start_time)
  if numpy.isinf(posteriors).any() or numpy.isnan(posteriors).any():
    print("posteriors:", posteriors)
    debug_feat_fn = "/tmp/returnn.pid%i.sprintinterface.debug.features.txt" % os.getpid()
    debug_post_fn = "/tmp/returnn.pid%i.sprintinterface.debug.posteriors.txt" % os.getpid()
    numpy.savetxt(debug_feat_fn, features)
    numpy.savetxt(debug_post_fn, posteriors)
    print("Wrote to files %s, %s" % (debug_feat_fn, debug_post_fn))
    assert False, "Error, posteriors contain invalid numbers."
  return posteriors
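
# Shape-flow sketch for _forward(), with hypothetical dims InputDim=40,
# OutputDim=9001, MaxSegmentLength=1 and T time frames:
#
#   Sprint features:    (40, T)    (feature, time)
#   network output:     (T, 9001), or (T, 1, 9001) for sequence criteria
#   returned to Sprint: (9001, T)  (emission, time), via transpose()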
def _init_base(configfile=None, target_mode=None, epoch=None, sprint_opts=None):
  """
  :param str|None configfile: filename, via init(), this is set
  :param str|None target_mode: "forward" or so. via init(), this is set
  :param int|None epoch: via init(), this is set
  :param dict[str,str]|None sprint_opts: optional parameters to override values in configfile
  """
  global isInitialized
  isInitialized = True
  # Run through in any case. Maybe just to set targetMode.

  if not getattr(sys, "argv", None):
    # Set some dummy. Some code might want this (e.g. TensorFlow).
    sys.argv = [__file__]

  global Engine
  global config
  if not config:
    # Some subset of what we do in rnn.init().
    rnn.init_better_exchook()
    rnn.init_thread_join_hack()
    if configfile is None:
      configfile = DefaultSprintCrnnConfig
    assert os.path.exists(configfile)
    rnn.init_config(config_filename=configfile, extra_updates={"task": target_mode})
    assert rnn.config
    config = rnn.config
    if sprint_opts is not None:
      config.update(sprint_opts)
    rnn.init_log()
    rnn.returnn_greeting(config_filename=configfile)
    rnn.init_backend_engine()
    rnn.init_faulthandler(sigusr1_chain=True)
    rnn.init_config_json_network()
    if BackendEngine.is_tensorflow_selected():
      # Use TFEngine.Engine class instead of Engine.Engine.
      from returnn.tf.engine import Engine
    elif BackendEngine.is_theano_selected():
      from returnn.theano.engine import Engine
    import atexit
    atexit.register(_at_exit_handler)

  if target_mode:
    set_target_mode(target_mode)

  _init_dataset()

  if target_mode and target_mode == "forward" and epoch:
    model_filename = config.value('model', '')
    fns = [EngineBase.epoch_model_filename(model_filename, epoch, is_pretrain)
           for is_pretrain in [False, True]]
    fn_postfix = ""
    if BackendEngine.is_tensorflow_selected():
      fn_postfix += ".meta"
    fns_existing = [fn for fn in fns if os.path.exists(fn + fn_postfix)]
    assert len(fns_existing) == 1, "%s not found" % fns
    model_epoch_filename = fns_existing[0]
    config.set('load', model_epoch_filename)
    assert EngineBase.get_epoch_model(config)[1] == model_epoch_filename, (
      "%r != %r" % (EngineBase.get_epoch_model(config), model_epoch_filename))

  global engine
  if not engine:
    devices = rnn.init_theano_devices()
    rnn.print_task_properties(devices)
    rnn.init_engine(devices)
    engine = rnn.engine
    assert isinstance(engine, Engine)
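
# Illustrative call sketch (filename and epoch are hypothetical example
# values): the Sprint side typically reaches this via init(), ending up in
# something like
#
#   _init_base(configfile="returnn.config", target_mode="forward", epoch=80)
#
# after which the module globals `config` and `engine` are initialized.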