Пример #1
0
 def makeProto(self, GLOBAL):
     """Return ``Props.proto`` extended with this object's descriptors.

     Six ``PD`` entries are appended: ``i``, ``m3`` and ``D3`` are tied via
     ``equalto`` to ``i``, ``m2`` and ``D2`` of ``GLOBAL`` respectively, while
     ``j``, ``k`` and ``l`` are ``integerOrNone()`` entries whose last
     argument is 2 (presumably the default value - confirm against ``PD``).

     :param GLOBAL: object handed to ``equalto`` as the source of the
         referenced values.
     :return: the concatenation of ``Props.proto`` and the new entries.
     """
     # Read the inherited prototype first, exactly as the original expression did.
     inherited = Props.proto
     additions = (
         PD('i', '', integer(), equalto('i', GLOBAL)),
         PD('m3', '', integer(), equalto('m2', GLOBAL)),
         PD('D3', '', integer(), equalto('D2', GLOBAL)),
         PD('j', '', integerOrNone(), 2),
         PD('k', '', integerOrNone(), 2),
         PD('l', '', integerOrNone(), 2),
     )
     return inherited + additions
Пример #2
0
 def makeProto(self, GLOBAL):
     """Return ``Props.proto`` extended with this object's descriptors.

     Five ``PD`` entries are appended: ``i`` is computed lazily as
     ``GLOBAL.m + GLOBAL.D`` through a ``LambdaVal``; ``m2`` and ``D2`` are
     tied via ``equalto`` to ``m`` and ``D`` of ``GLOBAL``; ``j`` and ``k``
     are ``integerOrNone()`` entries whose last argument is ``None`` and 1
     (presumably their default values - confirm against ``PD``).

     :param GLOBAL: object providing ``m`` and ``D``; captured by the lambda
         and handed to ``equalto``.
     :return: the concatenation of ``Props.proto`` and the new entries.
     """
     # Read the inherited prototype first, exactly as the original expression did.
     inherited = Props.proto
     additions = (
         PD('i', '', integer(),
            LambdaVal(lambda _, __: GLOBAL.m + GLOBAL.D)),
         PD('m2', '', integer(), equalto('m', GLOBAL)),
         PD('D2', '', integer(), equalto('D', GLOBAL)),
         PD('j', '', integerOrNone(), None),
         PD('k', '', integerOrNone(), 1),
     )
     return inherited + additions
Пример #3
0
class Im2LatexModelParams(dlc.HyperParams):
    """Hyper-parameter set for the Im2Latex model.

    ``proto`` extends ``GlobalParams.proto`` with the descriptors below. Each
    ``PD`` is given a name, a description, a validator (or a tuple of allowed
    values) and optionally a default, which may be a ``LambdaVal`` /
    ``equalto`` derived from other parameters.
    NOTE(review): the argument meanings are read off the usage here; confirm
    against the ``PD`` definition.
    """
    proto = GlobalParams.proto + (
        ### Training Parameters ####
        PD(
            'assert_whole_batch',
            '(boolean): Disallow batch size that is not integral factor '
            'of the bin-size',
            boolean,
        ),
        PD(
            'squash_input_seq',
            '(boolean): Remove whitespace from target sequences',
            boolean,
        ),
        PD('input_queue_capacity', 'Capacity of input queue.', integer(1),
           LambdaVal(lambda _, d: d.num_towers * 1)),
        PD(
            'DecodingSlack',
            "Since we ignore blanks/spaces in loss and accuracy measurement, the network is free "
            "to insert blanks into the decoded/predicted sequence. Therefore the predicted sequence "
            "can be arbitrarily long. However, we need to limit the max decoded sequence length. We "
            "do so by determining the extra slack to give to the network - the more slack we give it "
            "presumably that much easier the learning will be. This parameter includes that slack. In "
            "other words, MaxDecodeLen = MaxSeqLen + DecodingSlack",
            integer(0), 20),
        PD('MaxDecodeLen',
           "See the description for MaxSeqLen and DecodingSlack", integer(151),
           LambdaVal(lambda _, p: p.MaxSeqLen + p.DecodingSlack)),
        PD(
            'SFactor',
            'Applicable to Scanning LSTM only: Multiplier to derive MaxS from MaxSeqLen',
            decimal(1.0),
            LambdaVal(lambda _, p: 1.5 if p.build_scanning_RNN else None)),
        PD(
            'MaxS',
            'Applicable to Scanning LSTM only: Max value of S for the given data-set',
            integer(1),
            LambdaVal(lambda _, p: int(p.MaxSeqLen * p.SFactor)
                      if p.build_scanning_RNN else None)),
        PD(
            'no_ctc_merge_repeated',
            "(boolean): Negated value of ctc_merge_repeated beamsearch_length_penatly for ctc operations",
            boolean, True),
        PD(
            'ctc_beam_width',
            'Beam Width to use for ctc_beamsearch_decoder, which is different from the seq2seq.BeamSearchDecoder',
            integer(1)),
        PD(
            'seq2seq_beam_width',
            'Beam Width to use for seq2seq.BeamSearchDecoder, which is different from the ctc_beamsearch_decoder',
            integer(1)),
        PD(
            'beamsearch_length_penalty',
            # Trailing space keeps the URL from fusing with the next sentence.
            'length_penalty_weight used by beamsearch decoder. Same as alpha value of length-penalty described in https://arxiv.org/pdf/1609.08144.pdf '
            'In the paper they used a value of alpha in the range [0.6,0.7]. A value of 0 turns length-penalty off.',
            decimal(0., 1.),
            # 0.6
        ),
        PD('swap_memory', 'swap_memory option to tf.scan', boolean, False),
        PD(
            'tf_session_allow_growth',
            'tf ConfigProto.gpu_option_allow_growth. Setting this will allow the gpu memory to be allocated incrementally instead of all at once.',
            boolean,
            # False
        ),
        PD(
            'adam_alpha',
            '(float or None): alpha value (step, learning_rate) of adam optimizer.',
            instanceof(float),
            # 0.0001 # default in tf.train.AdamOptimizer is 0.001
        ),
        PD(
            'adam_beta1',
            'beta1 value of adam-optimizer. If undefined here, the default in tf.train.AdamOptimizer is is 0.9.',
            decimal(0., 1.),
        ),
        PD(
            'adam_beta2',
            'beta2 value of adam-optimizer. If undefined here, the default in tf.train.AdamOptimizer is is 0.999.',
            decimal(0., 1.),
            # 0.9
        ),
        PD('optimizer', 'tensorflow optimizer function (e.g. AdamOptimizer).',
           ('adam', ), 'adam'),
        PD(
            'no_towers',
            'Should be always set to False. Indicates code-switch to build without towers which will not work',
            (False, ), False),
        PD('num_gpus', 'Number of GPUs employed in parallel', integer(1)),
        PD(
            'towers_per_gpu', """
            Number of towers per GPU running concurrently. Multiple towers per gpu are 
            needed in order to circumvent OOM errors.""", integer(1)),
        PD(
            # Description corrected: it used to repeat the towers_per_gpu text,
            # whereas the value is the product computed by the lambda below.
            'num_towers', """
            Total number of towers running concurrently across all GPUs
            (= num_gpus * towers_per_gpu).""", integer(1),
            LambdaVal(lambda _, p: p.num_gpus * p.towers_per_gpu)),
        PD('data_reader_B', 'batch_size for the data_reader', integer(1),
           LambdaVal(lambda _, d: d.B * d.num_towers)),
        ### Embedding Layer ###
        PD(
            'embeddings_initializer',
            'Initializer for embedding weights',
            iscallable(),
            ## tf.contrib.layers.xavier_initializer()
            equalto('weights_initializer')),
        PD('embeddings_regularizer', 'L1 / L2 norm regularization',
           iscallableOrNone(), equalto('weights_regularizer')),
        ### ConvNet Params ###
        PD(
            'CONVNET',
            'ConvStackParams for the convent',
            instanceofOrNone(ConvStackParams),
            ## Value is set dynamically inside make_hyper
        ),
        PD('image_frame_width',
           'Width of an extra padding frame around the (possibly already padded) image. This extra padding is used '
           'in order to ensure that there is enough whites-space around the edges of the image, so as to enable VALID padding '
           'in the first conv-net layer without losing any information. The effect of doing this is to simulate SAME padding '
           'but using custom padding values (background color in this case) instead of zeroes (which is what SAME padding would do). '
           'This value should be equal to (kernel_size)//2 using kernel_size of the first convolution layer.',
           integer(),
           LambdaVal(lambda _, p: 0 if (p.build_image_context != 2) else
                     (p.CONVNET.layers[0].kernel_shape[0]) // 2)
           ## Dynamically set to = (kernel_size-1)/2 given kernel_size of first conv-net layer
           ),
        PD('image_shape', 'Shape of input images. Should be a python sequence. '
           '= image_shape_unframed + image_frame_width around it',
           issequenceof(int),
           LambdaVal(lambda _, p: pad_image_shape(p.image_shape_unframed, p.
                                                  image_frame_width))
           ## = get_image_shape(raw_data_folder, num_channels, image_frame_width)
           ),
        ### Decoder CALSTM Params ###
        PD(
            'CALSTM_STACK',
            'sequence of CALSTMParams, one for each CALSTM layer in the stack. The paper '
            "has code for more than one layer, but mentions that it is not well-tested. I take that to mean "
            "that the published results are based on one layer alone.",
            issequenceof(CALSTMParams)),
        ### Output MLP
        PD(
            'output_reuse_embeddings',
            '(boolean): Output layer in the paper has a special first layer which considers embedding weights as part of the first-layer weight matrix. '
            'Setting this value to True (default) will follow the paper"s logic. Otherwise '
            "a straight MLP will be used wherein all inputs (including Ey(t-1)) are first concatenated and fed into an MLP. "
            "Including the softmax layer, the paper uses a minimum of 2 layers.",
            boolean,
            # True
        ),
        PD(
            'outputMLP_skip_connections',
            '(boolean): Applicable only when output_reuse_embeddings==False. Setting this value to False will cause '
            'image context (z_t) and sequence input (Ex_t) to not be fed into the output MLP. If True (Default), the '
            'output MLP receives a concatenation of Ex_t, h_t and z_t as input. If set to False, only h_t is input.',
            boolean, True),
        PD('output_first_layer',
           "Some params of first layer of output MLP if output_reuse_embeddings==True",
           instanceof(Properties)
           ## Value set dynamically inside self._trickledown() iff output_reuse_embeddings==True
           ),
        PD(
            'output_layers',
            "(MLPParams): Parameters for the output MLP. The last layer outputs the logits and therefore "
            "must have num_units = K. If output_reuse_embeddings==True, an additional initial layer is created "
            "with num_units = m and activtion tanh. Therefore the min number of layers is 2 in that case. "
            "Note: In the paper all layers have num_units=m except the last(softmax) layer.",
            instanceof(MLPParams),
            ## Value set dynamically inside self._trickledown()
        ),
        ### Initializer MLP ###
        PD(
            'build_init_model', """ 
            Boolean parameter specifying whether or not to build the LSTM init_state model. If set to False zero-state
            will be used for init-state, otherwise a init-state model will be created based on other init_model_*
            params.
            """, boolean),
        PD(
            'init_model_input_transform', """
            Transform to apply to the image-context input to the init model. Only applies if build_init_model == True.
            'mean' implies take a mean across the 'L' image-locations and produce an input of size (batchsize, D).
            'full' implies take in all the 'L' features and produce an input tensor of shape (batchsize, L*D).
                Note that with this option the # of parameters in the first layer will go up by a factor of L i.e.
                around 100x.
            """, ('mean', 'full')),
        PD(
            'init_model_hidden',
            'MLP stack for hidden layers of the init_state model. In addition to the stack specified here, an additional FC '
            "layer will be forked off at the top for each 'c' and 'h' state in the RNN Im2LatexDecoderRNN state. "
            "Hence, this is a 'multi-headed' MLP because it has multiple top-layers. "
            "By default their implementation has num_hidden_layers==0 (i.e. n_layers_init==1).",
            instanceof(MLPParams),
            ## Value set dynamically inside self._trickledown()
        ),
        PD(
            'init_model_final_layers',
            '',
            instanceof(FCLayerParams),
            ## Value set dynamically inside self._trickledown()
        ),
        ### Loss / Cost Layer ###
        PD(
            'sum_logloss',
            'Whether to normalize log-loss per sample as in standard log perplexity '
            'calculation or whether to just sum up log-losses as in the paper. Defaults '
            'to True in conformance with the paper.',
            boolean,
            # True
        ),
        PD(
            'MeanSumAlphaEquals1',
            '(boolean): When calculating the alpha penalty, the paper uses the term: '
            'square{1 - sum_over_t{alpha_t_i}}). This assumes that the mean sum_over_t should be 1. '
            "However, that's not true, since the mean of sum_over_t term should be C/L. This "
            "variable if set to True, causes the term to change to square{C/L - sum_over_t{alpha_t_i}}). "
            "The default value is True in conformance with the paper.",
            boolean,
            # True
        ),
        PD(
            'pLambda',
            'Lambda value for alpha penalty, Setting this to zero turns off alpha_penalty.',
            (0.0, 0.0005, 0.005, 0.0001, 0.05),
            # LambdaVal(lambda _, p: 0.005 if p.build_scanning_RNN else 0.000)
        ),  # default in the show-and-tell paper is .00001?
        PD(
            'target_aae',
            """
            Target mean_norm_AAE value to shoot for. Varies with data-set. Value discovered by experimentation.
            """,
            (0., 51.42, 51.79),
            # LambdaVal(lambda _, p: None if (p.pLambda == 0) else 51.42)
        ),
        PD(
            'target_ase',
            """
            Target mean_norm_ASE value to shoot for. Varies with data-set. Value discovered by experimentation.
            """,
            (0.0, 5.27, 5.35, 10.0),
            # LambdaVal(lambda _, p: None if (p.pLambda == 0) else (10.0 if p.build_scanning_RNN else 5.27))
            LambdaVal(lambda _, p: None if (p.pLambda == 0) else 5.27)),
        PD(
            'k',
            'Number of top-scoring beams to consider for best-of-k metrics.',
            integer(1),
            # Value specified in run.py
        ))

    def __init__(self, initVals):
        """Initialise the parameter set and derive the dependent sub-trees.

        Delegates to ``dlc.HyperParams.__init__`` with this class's ``proto``
        and the supplied ``initVals``, then calls ``_trickledown()``.

        :param initVals: initial values forwarded unchanged to the base
            initialiser.
        """
        # seal=False: presumably leaves the object open so _trickledown() can
        # still assign the derived parameters - confirm against dlc.HyperParams.
        base_initialiser = dlc.HyperParams.__init__
        base_initialiser(self, self.proto, initVals, seal=False)
        self._trickledown()

    def _trickledown(self):
        """
        Trickle changes down to dependent parameters in sub-tree(s).
        (For same level dependencies use LambdaFunctions instead.)
        Call at the end of __init__ and end of update.

        Assigns:
          * ``output_first_layer`` - only when ``output_reuse_embeddings`` is
            true.
          * ``output_layers`` - always; the layer stack differs between the
            two ``output_reuse_embeddings`` branches.
          * ``init_model_hidden`` and ``init_model_final_layers`` - only when
            ``build_init_model`` is true.

        Raises:
            AssertionError: if ``output_reuse_embeddings`` is combined with
                ``build_scanning_RNN``, if the second-to-last output layer has
                fewer than ``K`` units, if the last output layer has an
                activation function, or if the non-reuse output stack has
                fewer than two layers.
        """
        ######## Output Model ########
        if self.output_reuse_embeddings:
            assert not self.build_scanning_RNN, 'Scanning RNN cannot reuse-embeddings because there are no embeddings'
            self.output_first_layer = FCLayerParams(self).updated({
                'num_units': self.m,
                'activation_fn': tf.nn.tanh,  # Shouldn't this be None?
                # dropout imported from outer scope
            }).freeze()

            self.output_layers = MLPParams(self).updated({
                # One layer with num_units = m is added if output_reuse_embeddings == True
                'op_name': 'yLogits_MLP',
                # dropout imported from outer scope
                'layers': (
                    ## paper has activation set to relu for all but the softmax layer
                    ## paper has all hidden layers with num_units = m.
                    # TODO: num_units should probably be self.K otherwise the model is a reverse pyramid
                    FCLayerParams(self).updated({
                        'num_units': 64,
                        'activation_fn': tf.nn.relu
                    }).freeze(),
                    ## Last layer must have num_units = K and activation_fn=None because it outputs logits.
                    FCLayerParams(self).updated({
                        'num_units': self.K,
                        'activation_fn': None,
                        'dropout': None
                    }).freeze(),
                )
            }).freeze()
        else:
            self.output_layers = MLPParams(self).updated({
                'op_name': 'yLogits_MLP',
                'layers': (
                    # paper has activation set to relu for all but the softmax layer
                    # paper has all hidden layers with num_units = m.
                    FCLayerParams(self).updated({
                        'num_units': 358,
                        'activation_fn': tf.nn.tanh
                    }).freeze(),
                    FCLayerParams(self).updated({
                        'num_units': 358,
                        'activation_fn': tf.nn.tanh
                    }).freeze(),
                    # Last layer must have num_units = K and activation_fn=None because it outputs logits.
                    FCLayerParams(self).updated({
                        'num_units': self.K,
                        'activation_fn': None,
                        'dropout': None
                    }).freeze(),
                )
            }).freeze()

        # Sanity checks on the stack that was just built.
        assert self.output_layers.layers[-2].num_units >= self.K
        # Identity test: None is a singleton, and `is` cannot be hijacked by a
        # custom __eq__ on whatever object is stored as the activation.
        assert self.output_layers.layers[
            -1].activation_fn is None, 'The last layer must have linear activation because softmax is added later (since we need logits for efficient cross-entropy calculation)'
        if not self.output_reuse_embeddings:
            assert len(
                self.output_layers.layers
            ) >= 2, "Need one hidden layer at least to match the paper's complexity."

        ######## Init Model ########
        if self.build_init_model:
            # Note: There are no hidden init layers by default in the Show&Tell paper
            self.init_model_hidden = MLPParams(self).updated({
                'layers': (
                    # Show&Tell paper sets hidden activations=relu
                    # The Show&Tell paper's source sets all hidden units to D
                    FCLayerParams(self).updated({
                        'num_units': min(self.D, 100),
                        'activation_fn': tf.nn.tanh
                    }).freeze(), )
            }).freeze()

            self.init_model_final_layers = FCLayerParams(self).updated({
                # Show&Tell paper sets final=tanh
                'activation_fn': tf.nn.tanh,
                'dropout': None
            }).freeze()

    def __copy__(self):
        """Support ``copy.copy``: build a new instance of the same class from self.

        The original author labels this a shallow copy; the new object is
        produced by passing ``self`` to the class constructor as its initial
        values.
        """
        same_class = self.__class__
        return same_class(self)

    def copy(self, override_vals=None):
        """Return a copy of this parameter set with optional overrides applied.

        A new instance of the same class is built from ``self`` (labelled a
        shallow copy by the original author) and ``updated`` is called on it
        with ``override_vals``.

        :param override_vals: mapping of values passed to ``updated`` on the
            new instance. ``None`` (the default) means no overrides. A fresh
            empty dict is created per call, so no dict object is shared
            between calls as it was with the former ``{}`` default.
        :return: whatever ``updated`` returns for the new instance.
        """
        if override_vals is None:
            override_vals = {}
        return self.__class__(self).updated(override_vals)