def __init__(self, network_proto, graph, session, graph_type="train", batch_size=1,
             reuse_weights=False, input_dataset=None, codec=None):
    super().__init__(network_proto, graph_type, batch_size,
                     input_dataset=input_dataset, codec=codec)
    self.graph = graph
    self.session = session
    self.gpu_available = any(d.device_type == "GPU" for d in self.session.list_devices())

    # load the fuzzy ctc module if available
    if len(network_proto.backend.fuzzy_ctc_library_path) > 0 and network_proto.ctc == NetworkParams.CTC_FUZZY:
        from calamari_ocr.ocr.backends.tensorflow_backend.tensorflow_fuzzy_ctc_loader import load as load_fuzzy
        self.fuzzy_module = load_fuzzy(network_proto.backend.fuzzy_ctc_library_path)
    else:
        self.fuzzy_module = None

    # create the graph
    with self.graph.as_default():
        tf.set_random_seed(self.network_proto.backend.random_seed)

        # inputs as a data set (faster)
        self.inputs, self.input_seq_len, self.targets, self.dropout_rate, self.data_iterator, self.serialized_params = \
            self.create_dataset_inputs(batch_size, network_proto.features)

        # the network itself is identical for all graph types
        self.output_seq_len, self.time_major_logits, self.time_major_softmax, self.logits, self.softmax, \
            self.decoded, self.sparse_decoded, self.scale_factor = \
            self.create_network(self.inputs, self.input_seq_len, self.dropout_rate,
                                reuse_variables=reuse_weights)

        # add the solver (train) or the error metric (test); prediction graphs need neither
        if graph_type == "train":
            self.train_op, self.loss, self.cer = self.create_solver(
                self.targets, self.time_major_logits, self.logits, self.output_seq_len, self.decoded)
        elif graph_type == "test":
            self.cer = self.create_cer(self.decoded, self.targets)
def from_proto(network_proto):
    reuse_variables = False
    intra_threads = network_proto.backend.num_intra_threads
    inter_threads = network_proto.backend.num_inter_threads

    # load the fuzzy ctc module if available
    if len(network_proto.backend.fuzzy_ctc_library_path) > 0 and network_proto.ctc == NetworkParams.CTC_FUZZY:
        from calamari_ocr.ocr.backends.tensorflow_backend.tensorflow_fuzzy_ctc_loader import load as load_fuzzy
        fuzzy_module = load_fuzzy(network_proto.backend.fuzzy_ctc_library_path)
    else:
        fuzzy_module = None

    graph = tf.Graph()
    with graph.as_default():
        tf.set_random_seed(network_proto.backend.random_seed)
        session = tf.Session(graph=graph,
                             config=tf.ConfigProto(
                                 intra_op_parallelism_threads=intra_threads,
                                 inter_op_parallelism_threads=inter_threads,
                             ))
        gpu_enabled = any(d.device_type == "GPU" for d in session.list_devices())

        # graph inputs
        inputs = tf.placeholder(tf.float32, shape=(None, None, network_proto.features), name="inputs")
        batch_size = tf.shape(inputs)[0]
        seq_len = tf.placeholder(tf.int32, shape=(None,), name="seq_len")
        targets = tf.sparse_placeholder(tf.int32, shape=(None, None), name="targets")
        dropout_rate = tf.placeholder(tf.float32, shape=(), name="dropout_rate")

        with tf.variable_scope("", reuse=reuse_variables) as scope:
            no_layers = len(network_proto.layers) == 0
            if not no_layers:
                has_conv_or_pool = network_proto.layers[0].type != LayerParams.LSTM
            else:
                has_conv_or_pool = False

            if has_conv_or_pool:
                cnn_inputs = tf.reshape(inputs, [batch_size, -1, network_proto.features, 1])
                shape = seq_len, network_proto.features

                layers = [cnn_inputs]
                last_num_filters = 1

                for layer in [l for l in network_proto.layers if l.type != LayerParams.LSTM]:
                    if layer.type == LayerParams.CONVOLUTIONAL:
                        layers.append(tf.layers.conv2d(
                            inputs=layers[-1],
                            filters=layer.filters,
                            kernel_size=(layer.kernel_size.x, layer.kernel_size.y),
                            padding="same",
                            activation=tf.nn.relu,
                        ))
                        last_num_filters = layer.filters
                    elif layer.type == LayerParams.MAX_POOLING:
                        layers.append(tf.layers.max_pooling2d(
                            inputs=layers[-1],
                            pool_size=(layer.kernel_size.x, layer.kernel_size.y),
                            strides=(layer.stride.x, layer.stride.y),
                            padding="same",
                        ))
                        shape = (tf.to_int32(shape[0] // layer.stride.x),
                                 shape[1] // layer.stride.y)
                    else:
                        raise Exception("Unknown layer of type %s" % layer.type)

                lstm_seq_len, lstm_num_features = shape
                rnn_inputs = tf.reshape(layers[-1],
                                        [batch_size, tf.shape(layers[-1])[1],
                                         last_num_filters * lstm_num_features])
                lstm_num_features = last_num_filters * lstm_num_features
            else:
                rnn_inputs = inputs
                lstm_seq_len = seq_len
                lstm_num_features = network_proto.features

            lstm_layers = [l for l in network_proto.layers if l.type == LayerParams.LSTM]

            # time major inputs are required for the lstm backends
            time_major_inputs = tf.transpose(rnn_inputs, [1, 0, 2])

            if len(lstm_layers) > 0:
                for i, lstm in enumerate(lstm_layers):
                    if lstm.hidden_nodes != lstm_layers[0].hidden_nodes:
                        raise Exception("Currently all lstm layers must have an equal number of hidden nodes. "
                                        "Got {} != {}".format(lstm.hidden_nodes, lstm_layers[0].hidden_nodes))

                def cpu_cudnn_compatible_lstm_backend(time_major_inputs, hidden_nodes):
                    def get_lstm_cell(num_hidden):
                        return cudnn_rnn.CudnnCompatibleLSTMCell(num_hidden, reuse=reuse_variables)

                    fw, bw = zip(*[(get_lstm_cell(hidden_nodes), get_lstm_cell(hidden_nodes))
                                   for lstm in lstm_layers])

                    time_major_outputs, output_fw, output_bw = tf.contrib.rnn.stack_bidirectional_dynamic_rnn(
                        list(fw), list(bw), time_major_inputs,
                        sequence_length=lstm_seq_len,
                        dtype=tf.float32,
                        scope="{}cudnn_lstm/stack_bidirectional_rnn".format(scope.name),
                        time_major=True,
                    )
                    return time_major_outputs

                def gpu_cudnn_lstm_backend(time_major_inputs, hidden_nodes):
                    # create the cudnn lstm factory
                    rnn_lstm = cudnn_rnn.CudnnLSTM(len(lstm_layers), hidden_nodes, direction='bidirectional',
                                                   kernel_initializer=tf.initializers.random_uniform(-0.1, 0.1))

                    # TODO: check if the models are loadable from the meta graph, maybe the next line fixed this
                    rnn_lstm._saveable_cls = cudnn_rnn.CudnnLSTMSaveable

                    # apply the lstm to the inputs
                    time_major_outputs, (output_h, output_c) = rnn_lstm(time_major_inputs)
                    return time_major_outputs

                if network_proto.backend.cudnn:
                    if gpu_enabled:
                        print("Using CUDNN LSTM backend on GPU")
                        time_major_outputs = gpu_cudnn_lstm_backend(time_major_inputs,
                                                                    lstm_layers[0].hidden_nodes)
                    else:
                        print("Using CUDNN compatible LSTM backend on CPU")
                        time_major_outputs = cpu_cudnn_compatible_lstm_backend(time_major_inputs,
                                                                               lstm_layers[0].hidden_nodes)
                else:
                    raise Exception("Only cudnn based backend supported yet.")

                # set the output size
                output_size = lstm_layers[-1].hidden_nodes * 2
            else:
                output_size = lstm_num_features
                time_major_outputs = time_major_inputs

            # flatten to (T * N, F) for the matrix multiplication. This will be reversed later
            time_major_outputs = tf.reshape(time_major_outputs,
                                            [-1, time_major_outputs.shape.as_list()[2]])

            if network_proto.dropout > 0:
                time_major_outputs = tf.nn.dropout(time_major_outputs, 1 - dropout_rate, name="dropout")

            # we need to turn off validate_shape so we can resize the variable on a codec resize
            W = tf.get_variable('W', validate_shape=False,
                                initializer=tf.random_uniform([output_size, network_proto.classes], -0.1, 0.1))
            b = tf.get_variable('B', validate_shape=False,
                                initializer=tf.constant(0., shape=[network_proto.classes]))

            # the output layer
            time_major_logits = tf.matmul(time_major_outputs, W) + b

            # reshape back to (T, N, C)
            time_major_logits = tf.reshape(time_major_logits, [-1, batch_size, tf.shape(W)[-1]],
                                           name="time_major_logits")
            time_major_softmax = tf.nn.softmax(time_major_logits, -1, "time_major_softmax")

            logits = tf.transpose(time_major_logits, [1, 0, 2], name="logits")
            softmax = tf.transpose(time_major_softmax, [1, 0, 2], name="softmax")

            # ctc predictions
            # Note for codec change: the codec size is derived upon creation, therefore the ctc ops must be
            # created using the true codec size (the W/B matrix may change its shape during loading/codec
            # change to match the true codec size)
            if network_proto.ctc == NetworkParams.CTC_DEFAULT:
                loss = ctc_ops.ctc_loss(targets, time_major_logits, lstm_seq_len,
                                        time_major=True,
                                        ctc_merge_repeated=network_proto.ctc_merge_repeated,
                                        ignore_longer_outputs_than_inputs=True)
                decoded, log_prob = ctc_ops.ctc_greedy_decoder(time_major_logits, lstm_seq_len,
                                                               merge_repeated=network_proto.ctc_merge_repeated)
                # decoded, log_prob = ctc_ops.ctc_beam_search_decoder(time_major_logits, lstm_seq_len, merge_repeated=model_settings["merge_repeated"])
            elif network_proto.ctc == NetworkParams.CTC_FUZZY:
                loss, deltas = fuzzy_module['module'].fuzzy_ctc_loss(
                    logits, targets.indices, targets.values, lstm_seq_len,
                    ignore_longer_outputs_than_inputs=True)
                decoded, log_prob = fuzzy_module['decoder_op'](softmax, lstm_seq_len)
            else:
                raise Exception("Unknown ctc model: '%s'. Supported are Default and Fuzzy" % network_proto.ctc)

            decoded = decoded[0]
            sparse_decoded = (
                tf.identity(decoded.indices, name="decoded_indices"),
                tf.identity(decoded.values, name="decoded_values"),
                tf.identity(decoded.dense_shape, name="decoded_shape"),
            )

            cost = tf.reduce_mean(loss, name='cost')
            if network_proto.solver == NetworkParams.MOMENTUM_SOLVER:
                optimizer = tf.train.MomentumOptimizer(network_proto.learning_rate, network_proto.momentum)
            elif network_proto.solver == NetworkParams.ADAM_SOLVER:
                optimizer = tf.train.AdamOptimizer(network_proto.learning_rate)
            else:
                raise Exception("Unknown solver of type '%s'" % network_proto.solver)

            gvs = optimizer.compute_gradients(cost)

            training_ops = []
            if network_proto.clipping_mode == NetworkParams.CLIP_NONE:
                pass
            elif network_proto.clipping_mode == NetworkParams.CLIP_AUTO:
                # exponentially follow the global average of gradients to set clipping
                ema = tf.train.ExponentialMovingAverage(decay=0.999)
                max_l2 = 1000
                max_grads = 1000

                grads = [grad for grad, _ in gvs]
                l2 = tf.minimum(tf.global_norm(grads), max_l2)
                l2_ema_op, l2_ema = ema.apply([l2]), ema.average(l2)
                grads, _ = tf.clip_by_global_norm(grads,
                                                  clip_norm=tf.minimum(l2_ema / max_l2 * max_grads, max_grads))
                gvs = zip(grads, [var for _, var in gvs])
                training_ops.append(l2_ema_op)
            elif network_proto.clipping_mode == NetworkParams.CLIP_CONSTANT:
                clip = network_proto.clipping_constant
                if clip <= 0:
                    raise Exception("Invalid clipping constant. Must be greater than 0, but got {}".format(clip))

                grads = [grad for grad, _ in gvs]
                grads, _ = tf.clip_by_global_norm(grads, clip_norm=clip)
                gvs = zip(grads, [var for _, var in gvs])
            else:
                raise Exception("Unsupported clipping mode {}".format(network_proto.clipping_mode))

            training_ops.append(optimizer.apply_gradients(gvs, name='grad_update_op'))
            train_op = tf.group(training_ops, name="train_op")

            ler = tf.reduce_mean(tf.edit_distance(tf.cast(decoded, tf.int32), targets), name='ler')

            lstm_seq_len = tf.identity(lstm_seq_len, "seq_len_out")

        return TensorflowModel(network_proto, graph, session,
                               inputs, seq_len, lstm_seq_len, targets,
                               train_op, cost, ler, sparse_decoded, softmax, dropout_rate)
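# Usage sketch (not part of the backend): a minimal, hedged example of how from_proto
# might be driven. It assumes the NetworkParams / LayerParams protobufs from
# calamari_ocr.proto expose the fields referenced above (features, classes, layers,
# ctc, solver, learning_rate, dropout, backend.cudnn); the import path and the chosen
# example values are assumptions, not guaranteed by this file.
#
#     from calamari_ocr.proto import NetworkParams, LayerParams
#
#     params = NetworkParams()
#     params.features = 48                      # height of the input line image
#     params.classes = 100                      # codec size incl. the CTC blank
#     params.ctc = NetworkParams.CTC_DEFAULT
#     params.solver = NetworkParams.ADAM_SOLVER
#     params.learning_rate = 1e-3
#     params.dropout = 0.5
#     params.backend.cudnn = True
#
#     lstm = params.layers.add()                # a single bidirectional LSTM layer
#     lstm.type = LayerParams.LSTM
#     lstm.hidden_nodes = 200
#
#     model = from_proto(params)                # builds graph, session, and train/eval ops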