    def __init__(self, network_proto, graph, session, graph_type="train", batch_size=1, reuse_weights=False,
                 input_dataset=None, codec=None):
        super().__init__(network_proto, graph_type, batch_size, input_dataset=input_dataset, codec=codec)
        self.graph = graph
        self.session = session
        self.gpu_available = any([d.device_type == "GPU" for d in self.session.list_devices()])

        # load fuzzy ctc module if available
        if len(network_proto.backend.fuzzy_ctc_library_path) > 0 and network_proto.ctc == NetworkParams.CTC_FUZZY:
            from calamari_ocr.ocr.backends.tensorflow_backend.tensorflow_fuzzy_ctc_loader import load as load_fuzzy
            self.fuzzy_module = load_fuzzy(network_proto.backend.fuzzy_ctc_library_path)
        else:
            self.fuzzy_module = None

        # create graph
        with self.graph.as_default():
            tf.set_random_seed(self.network_proto.backend.random_seed)

            # inputs as data set (faster)
            self.inputs, self.input_seq_len, self.targets, self.dropout_rate, self.data_iterator, self.serialized_params = \
                self.create_dataset_inputs(batch_size, network_proto.features)

            # create network and solver (if train)
            if graph_type == "train":
                self.output_seq_len, self.time_major_logits, self.time_major_softmax, self.logits, self.softmax, self.decoded, self.sparse_decoded, self.scale_factor = \
                    self.create_network(self.inputs, self.input_seq_len, self.dropout_rate, reuse_variables=reuse_weights)
                self.train_op, self.loss, self.cer = self.create_solver(self.targets, self.time_major_logits, self.logits, self.output_seq_len, self.decoded)
            elif graph_type == "test":
                self.output_seq_len, self.time_major_logits, self.time_major_softmax, self.logits, self.softmax, self.decoded, self.sparse_decoded, self.scale_factor = \
                    self.create_network(self.inputs, self.input_seq_len, self.dropout_rate, reuse_variables=reuse_weights)
                self.cer = self.create_cer(self.decoded, self.targets)
            else:
                self.output_seq_len, self.time_major_logits, self.time_major_softmax, self.logits, self.softmax, self.decoded, self.sparse_decoded, self.scale_factor = \
                    self.create_network(self.inputs, self.input_seq_len, self.dropout_rate, reuse_variables=reuse_weights)
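
    # A hedged construction sketch (assumptions, not from this file): `TensorflowModel` is taken to be the
    # enclosing class, and `net_params` / `codec` stand for a parsed NetworkParams proto and a Codec instance.
    #
    #   graph = tf.Graph()
    #   with graph.as_default():
    #       session = tf.Session(graph=graph)
    #   model = TensorflowModel(net_params, graph, session, graph_type="train", batch_size=16, codec=codec)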

    @staticmethod
    def from_proto(network_proto):
        reuse_variables = False
        intra_threads = network_proto.backend.num_intra_threads
        inter_threads = network_proto.backend.num_inter_threads

        # load fuzzy ctc module if available
        if (len(network_proto.backend.fuzzy_ctc_library_path) > 0
                and network_proto.ctc == NetworkParams.CTC_FUZZY):
            from calamari_ocr.ocr.backends.tensorflow_backend.tensorflow_fuzzy_ctc_loader import load as load_fuzzy
            fuzzy_module = load_fuzzy(
                network_proto.backend.fuzzy_ctc_library_path)
        else:
            fuzzy_module = None

        graph = tf.Graph()
        with graph.as_default():
            tf.set_random_seed(network_proto.backend.random_seed)
            session = tf.Session(
                graph=graph,
                config=tf.ConfigProto(
                    intra_op_parallelism_threads=intra_threads,
                    inter_op_parallelism_threads=inter_threads,
                ))
            gpu_enabled = False
            for d in session.list_devices():
                if d.device_type == "GPU":
                    gpu_enabled = True
                    break

            inputs = tf.placeholder(tf.float32,
                                    shape=(None, None, network_proto.features),
                                    name="inputs")
            batch_size = tf.shape(inputs)[0]
            seq_len = tf.placeholder(tf.int32, shape=(None, ), name="seq_len")
            targets = tf.sparse_placeholder(tf.int32,
                                            shape=(None, None),
                                            name="targets")
            dropout_rate = tf.placeholder(tf.float32,
                                          shape=(),
                                          name="dropout_rate")

            with tf.variable_scope("", reuse=reuse_variables) as scope:
                no_layers = len(network_proto.layers) == 0
                if not no_layers:
                    has_conv_or_pool = network_proto.layers[
                        0].type != LayerParams.LSTM
                else:
                    has_conv_or_pool = False

                if has_conv_or_pool:
                    cnn_inputs = tf.reshape(
                        inputs, [batch_size, -1, network_proto.features, 1])
                    shape = seq_len, network_proto.features

                    layers = [cnn_inputs]
                    last_num_filters = 1

                    for layer in [
                            l for l in network_proto.layers
                            if l.type != LayerParams.LSTM
                    ]:
                        if layer.type == LayerParams.CONVOLUTIONAL:
                            layers.append(
                                tf.layers.conv2d(
                                    inputs=layers[-1],
                                    filters=layer.filters,
                                    kernel_size=(layer.kernel_size.x,
                                                 layer.kernel_size.y),
                                    padding="same",
                                    activation=tf.nn.relu,
                                ))
                            last_num_filters = layer.filters
                        elif layer.type == LayerParams.MAX_POOLING:
                            layers.append(
                                tf.layers.max_pooling2d(
                                    inputs=layers[-1],
                                    pool_size=(layer.kernel_size.x,
                                               layer.kernel_size.y),
                                    strides=(layer.stride.x, layer.stride.y),
                                    padding="same",
                                ))

                            shape = (tf.to_int32(shape[0] // layer.stride.x),
                                     shape[1] // layer.stride.y)
                        else:
                            raise Exception("Unknown layer of type %s" %
                                            layer.type)

                    lstm_seq_len, lstm_num_features = shape
                    rnn_inputs = tf.reshape(layers[-1], [
                        batch_size,
                        tf.shape(layers[-1])[1],
                        last_num_filters * lstm_num_features
                    ])

                    lstm_num_features = last_num_filters * lstm_num_features
                else:
                    rnn_inputs = inputs
                    lstm_seq_len = seq_len
                    lstm_num_features = network_proto.features
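
                # Shape note (derived from the reshapes above): at this point rnn_inputs is batch major with
                # shape [batch_size, time, lstm_num_features], and lstm_seq_len holds the per-example sequence
                # lengths after any pooling strides have been applied.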

                lstm_layers = [
                    l for l in network_proto.layers
                    if l.type == LayerParams.LSTM
                ]

                # Time major inputs required for lstm
                time_major_inputs = tf.transpose(rnn_inputs, [1, 0, 2])

                if len(lstm_layers) > 0:
                    for i, lstm in enumerate(lstm_layers):
                        if lstm.hidden_nodes != lstm_layers[0].hidden_nodes:
                            raise Exception(
                                "Currently all lstm layers must have an equal number of hidden nodes. "
                                "Got {} != {}".format(
                                    lstm.hidden_nodes,
                                    lstm_layers[0].hidden_nodes))

                    def cpu_cudnn_compatible_lstm_backend(
                            time_major_inputs, hidden_nodes):
                        def get_lstm_cell(num_hidden):
                            return cudnn_rnn.CudnnCompatibleLSTMCell(
                                num_hidden, reuse=reuse_variables)

                        fw, bw = zip(*[(get_lstm_cell(hidden_nodes),
                                        get_lstm_cell(hidden_nodes))
                                       for lstm in lstm_layers])

                        time_major_outputs, output_fw, output_bw \
                            = tf.contrib.rnn.stack_bidirectional_dynamic_rnn(list(fw), list(bw), time_major_inputs,
                                                                             sequence_length=lstm_seq_len,
                                                                             dtype=tf.float32,
                                                                             scope="{}cudnn_lstm/stack_bidirectional_rnn".format(scope.name),
                                                                             time_major=True,
                                                                             )

                        return time_major_outputs

                    def gpu_cudnn_lstm_backend(time_major_inputs,
                                               hidden_nodes):
                        # Create the Cudnn LSTM factory
                        rnn_lstm = cudnn_rnn.CudnnLSTM(
                            len(lstm_layers),
                            hidden_nodes,
                            direction='bidirectional',
                            kernel_initializer=tf.initializers.random_uniform(
                                -0.1, 0.1))

                        # TODO: Check if the models are loadable from meta Graph, maybe the next line fixed this
                        rnn_lstm._saveable_cls = cudnn_rnn.CudnnLSTMSaveable

                        # Apply the lstm to the inputs
                        time_major_outputs, (
                            output_h, output_c) = rnn_lstm(time_major_inputs)
                        return time_major_outputs

                    if network_proto.backend.cudnn:
                        if gpu_enabled:
                            print("Using CUDNN LSTM backend on GPU")
                            time_major_outputs = gpu_cudnn_lstm_backend(
                                time_major_inputs, lstm_layers[0].hidden_nodes)
                        else:
                            print("Using CUDNN compatible LSTM backend on CPU")
                            time_major_outputs = cpu_cudnn_compatible_lstm_backend(
                                time_major_inputs, lstm_layers[0].hidden_nodes)
                    else:
                        raise Exception(
                            "Only the cudnn-based LSTM backend is supported so far.")

                    # Set the output size
                    output_size = lstm_layers[-1].hidden_nodes * 2
                else:
                    output_size = lstm_num_features
                    time_major_outputs = time_major_inputs

                # flatten to (T * N, F) for matrix multiplication. This will be reversed later
                time_major_outputs = tf.reshape(
                    time_major_outputs,
                    [-1, time_major_outputs.shape.as_list()[2]])

                if network_proto.dropout > 0:
                    time_major_outputs = tf.nn.dropout(time_major_outputs,
                                                       1 - dropout_rate,
                                                       name="dropout")

                # we need to turn off validate_shape so we can resize the variable on a codec resize
                W = tf.get_variable('W',
                                    validate_shape=False,
                                    initializer=tf.random_uniform(
                                        [output_size, network_proto.classes],
                                        -0.1, 0.1))
                b = tf.get_variable('B',
                                    validate_shape=False,
                                    initializer=tf.constant(
                                        0., shape=[network_proto.classes]))
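
                # With validate_shape=False the variables keep no fixed static shape, so a value with a
                # different number of classes can be assigned when the codec is resized; this is also why the
                # reshape below reads the class count dynamically via tf.shape(W)[-1].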

                # the output layer
                time_major_logits = tf.matmul(time_major_outputs, W) + b

                # reshape back
                time_major_logits = tf.reshape(
                    time_major_logits,
                    [-1, batch_size, tf.shape(W)[-1]],
                    name="time_major_logits")

                time_major_softmax = tf.nn.softmax(time_major_logits, -1,
                                                   "time_major_softmax")

                logits = tf.transpose(time_major_logits, [1, 0, 2],
                                      name="logits")
                softmax = tf.transpose(time_major_softmax, [1, 0, 2],
                                       name="softmax")

                # ctc predictions
                # Note on codec changes: the codec size is fixed when these ops are created, so the ctc ops
                # must be built with the true codec size (the W/B matrices may still change their shape during
                # loading or a codec change to match the true codec size).
                if network_proto.ctc == NetworkParams.CTC_DEFAULT:
                    loss = ctc_ops.ctc_loss(
                        targets,
                        time_major_logits,
                        lstm_seq_len,
                        time_major=True,
                        ctc_merge_repeated=network_proto.ctc_merge_repeated,
                        ignore_longer_outputs_than_inputs=True)
                    decoded, log_prob = ctc_ops.ctc_greedy_decoder(
                        time_major_logits,
                        lstm_seq_len,
                        merge_repeated=network_proto.ctc_merge_repeated)
                    # decoded, log_prob = ctc_ops.ctc_beam_search_decoder(time_major_logits, lstm_seq_len, merge_repeated=model_settings["merge_repeated"])
                elif network_proto.ctc == NetworkParams.CTC_FUZZY:
                    loss, deltas = fuzzy_module['module'].fuzzy_ctc_loss(
                        logits,
                        targets.indices,
                        targets.values,
                        lstm_seq_len,
                        ignore_longer_outputs_than_inputs=True)
                    decoded, log_prob = fuzzy_module['decoder_op'](
                        softmax, lstm_seq_len)
                else:
                    raise Exception(
                        "Unknown ctc model: '%s'. Supported are Default and Fuzzy"
                        % network_proto.ctc)

                decoded = decoded[0]
                sparse_decoded = (
                    tf.identity(decoded.indices, name="decoded_indices"),
                    tf.identity(decoded.values, name="decoded_values"),
                    tf.identity(decoded.dense_shape, name="decoded_shape"),
                )

                cost = tf.reduce_mean(loss, name='cost')
                if network_proto.solver == NetworkParams.MOMENTUM_SOLVER:
                    optimizer = tf.train.MomentumOptimizer(
                        network_proto.learning_rate, network_proto.momentum)
                elif network_proto.solver == NetworkParams.ADAM_SOLVER:
                    optimizer = tf.train.AdamOptimizer(
                        network_proto.learning_rate)
                else:
                    raise Exception("Unknown solver of type '%s'" %
                                    network_proto.solver)

                gvs = optimizer.compute_gradients(cost)

                training_ops = []
                if network_proto.clipping_mode == NetworkParams.CLIP_NONE:
                    pass
                elif network_proto.clipping_mode == NetworkParams.CLIP_AUTO:
                    # exponentially follow the global average of gradients to set clipping
                    ema = tf.train.ExponentialMovingAverage(decay=0.999)

                    max_l2 = 1000
                    max_grads = 1000

                    grads = [grad for grad, _ in gvs]
                    l2 = tf.minimum(tf.global_norm(grads), max_l2)
                    l2_ema_op, l2_ema = ema.apply([l2]), ema.average(l2)
                    grads, _ = tf.clip_by_global_norm(
                        grads,
                        clip_norm=tf.minimum(l2_ema / max_l2 * max_grads,
                                             max_grads))
                    gvs = zip(grads, [var for _, var in gvs])
                    training_ops.append(l2_ema_op)
                elif network_proto.clipping_mode == NetworkParams.CLIP_CONSTANT:
                    clip = network_proto.clipping_constant
                    if clip <= 0:
                        raise Exception(
                            "Invalid clipping constant. Must be greater than 0, but got {}"
                            .format(clip))

                    grads = [grad for grad, _ in gvs]
                    grads, _ = tf.clip_by_global_norm(grads, clip_norm=clip)
                    gvs = zip(grads, [var for _, var in gvs])
                else:
                    raise Exception("Unsupported clipping mode {}".format(
                        network_proto.clipping_mode))
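
                # Note on CLIP_AUTO (an observation, not original code): with max_l2 == max_grads == 1000 the
                # expression l2_ema / max_l2 * max_grads reduces to l2_ema, i.e. gradients are clipped to the
                # moving average of the recent (capped) global norm, never exceeding max_grads.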

                training_ops.append(
                    optimizer.apply_gradients(gvs, name='grad_update_op'))
                train_op = tf.group(training_ops, name="train_op")

                ler = tf.reduce_mean(tf.edit_distance(
                    tf.cast(decoded, tf.int32), targets),
                                     name='ler')

                lstm_seq_len = tf.identity(lstm_seq_len, "seq_len_out")

                return TensorflowModel(network_proto, graph, session, inputs,
                                       seq_len, lstm_seq_len, targets,
                                       train_op, cost, ler, sparse_decoded,
                                       softmax, dropout_rate)
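
    # A hedged usage sketch (assumptions, not from this file): `net_params` stands for a populated
    # NetworkParams proto, from_proto is taken to be a staticmethod of the TensorflowModel class it
    # constructs, and the returned model is assumed to expose the graph and session it was built with.
    #
    #   model = TensorflowModel.from_proto(net_params)
    #   with model.graph.as_default():
    #       model.session.run(tf.global_variables_initializer())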