def makeProto(self, GLOBAL):
    """Return ``Props.proto`` extended with this level's ``PD`` entries.

    ``GLOBAL`` is handed to ``equalto`` for the 'i', 'm3' and 'D3' entries
    (presumably so their values track 'i', 'm2' and 'D2' on that object --
    confirm against the ``equalto`` implementation).
    """
    inherited = Props.proto
    local_entries = (
        PD('i', '', integer(), equalto('i', GLOBAL)),
        PD('m3', '', integer(), equalto('m2', GLOBAL)),
        PD('D3', '', integer(), equalto('D2', GLOBAL)),
        PD('j', '', integerOrNone(), 2),
        PD('k', '', integerOrNone(), 2),
        PD('l', '', integerOrNone(), 2),
    )
    return inherited + local_entries
def makeProto(self, GLOBAL):
    """Return ``Props.proto`` extended with this level's ``PD`` entries.

    'i' is given a ``LambdaVal`` that reads ``GLOBAL.m + GLOBAL.D``; 'm2' and
    'D2' are given ``equalto('m', GLOBAL)`` / ``equalto('D', GLOBAL)``
    (presumably mirroring those values -- confirm against ``equalto``).
    """
    inherited = Props.proto
    local_entries = (
        PD('i', '', integer(), LambdaVal(lambda _, __: GLOBAL.m + GLOBAL.D)),
        PD('m2', '', integer(), equalto('m', GLOBAL)),
        PD('D2', '', integer(), equalto('D', GLOBAL)),
        PD('j', '', integerOrNone(), None),
        PD('k', '', integerOrNone(), 1),
    )
    return inherited + local_entries
class Im2LatexModelParams(dlc.HyperParams):
    """Hyper-parameter set for the Im2Latex model.

    ``proto`` extends ``GlobalParams.proto`` with the ``PD`` descriptors
    listed below. Each ``PD`` is given a name, a description string, a
    validator (or tuple of allowed values) and optionally a default, which may
    be a ``LambdaVal`` or ``equalto`` reference (exact semantics live in the
    ``dlc`` module -- confirm there).

    After construction, ``_trickledown`` fills in the sub-parameter objects
    (``output_first_layer``, ``output_layers``, ``init_model_hidden``,
    ``init_model_final_layers``) that depend on values set on this object.
    """

    proto = GlobalParams.proto + (
        ### Training Parameters ####
        PD(
            'assert_whole_batch',
            '(boolean): Disallow batch size that is not integral factor '
            'of the bin-size',
            boolean,
        ),
        PD(
            'squash_input_seq',
            '(boolean): Remove whitespace from target sequences',
            boolean,
        ),
        PD('input_queue_capacity', 'Capacity of input queue.', integer(1),
           LambdaVal(lambda _, d: d.num_towers * 1)),
        PD(
            'DecodingSlack',
            "Since we ignore blanks/spaces in loss and accuracy measurement, the network is free "
            "to insert blanks into the decoded/predicted sequence. Therefore the predicted sequence "
            "can be arbitrarily long. However, we need to limit the max decoded sequence length. We "
            "do so by determining the extra slack to give to the network - the more slack we give it "
            "presumably that much easier the learning will be. This parameter includes that slack. In "
            "other words, MaxDecodeLen = MaxSeqLen + DecodingSlack",
            integer(0),
            20),
        PD('MaxDecodeLen', "See the description for MaxSeqLen and DecodingSlack", integer(151),
           LambdaVal(lambda _, p: p.MaxSeqLen + p.DecodingSlack)),
        PD(
            'SFactor',
            'Applicable to Scanning LSTM only: Multiplier to derive MaxS from MaxSeqLen',
            decimal(1.0),
            LambdaVal(lambda _, p: 1.5 if p.build_scanning_RNN else None)),
        PD(
            'MaxS',
            'Applicable to Scanning LSTM only: Max value of S for the given data-set',
            integer(1),
            LambdaVal(lambda _, p: int(p.MaxSeqLen * p.SFactor) if p.build_scanning_RNN else None)),
        PD(
            'no_ctc_merge_repeated',
            "(boolean): Negated value of ctc_merge_repeated beamsearch_length_penatly for ctc operations",
            boolean,
            True),
        PD(
            'ctc_beam_width',
            'Beam Width to use for ctc_beamsearch_decoder, which is different from the seq2seq.BeamSearchDecoder',
            integer(1)),
        PD(
            'seq2seq_beam_width',
            'Beam Width to use for seq2seq.BeamSearchDecoder, which is different from the ctc_beamsearch_decoder',
            integer(1)),
        PD(
            'beamsearch_length_penalty',
            'length_penalty_weight used by beamsearch decoder. Same as alpha value of length-penalty described in https://arxiv.org/pdf/1609.08144.pdf'
            'In the paper they used a value of alpha in the range [0.6,0.7]. A value of 0 turns length-penalty off.',
            decimal(0., 1.),
            # 0.6
        ),
        PD('swap_memory', 'swap_memory option to tf.scan', boolean, False),
        PD(
            'tf_session_allow_growth',
            'tf ConfigProto.gpu_option_allow_growth. Setting this will allow the gpu memory to be allocated incrementally instead of all at once.',
            boolean,
            # False
        ),
        PD(
            'adam_alpha',
            '(float or None): alpha value (step, learning_rate) of adam optimizer.',
            instanceof(float),
            # 0.0001
            # default in tf.train.AdamOptimizer is 0.001
        ),
        PD(
            'adam_beta1',
            'beta1 value of adam-optimizer. If undefined here, the default in tf.train.AdamOptimizer is is 0.9.',
            decimal(0., 1.),
        ),
        PD(
            'adam_beta2',
            'beta2 value of adam-optimizer. If undefined here, the default in tf.train.AdamOptimizer is is 0.999.',
            decimal(0., 1.),
            # 0.9
        ),
        PD('optimizer', 'tensorflow optimizer function (e.g. AdamOptimizer).', ('adam', ), 'adam'),
        PD(
            'no_towers',
            'Should be always set to False. Indicates code-switch to build without towers which will not work',
            (False, ),
            False),
        PD('num_gpus', 'Number of GPUs employed in parallel', integer(1)),
        PD(
            'towers_per_gpu',
            """
            Number of towers per GPU running concurrently. Multiple towers per gpu are
            needed in order to circumvent OOM errors.""",
            integer(1)),
        PD(
            'num_towers',
            # The description used to repeat the towers_per_gpu text verbatim;
            # the default below shows this is the total across all GPUs.
            """
            Total number of towers running concurrently across all GPUs
            (= num_gpus * towers_per_gpu). Multiple towers per gpu are
            needed in order to circumvent OOM errors.""",
            integer(1),
            LambdaVal(lambda _, p: p.num_gpus * p.towers_per_gpu)),
        PD('data_reader_B', 'batch_size for the data_reader', integer(1),
           LambdaVal(lambda _, d: d.B * d.num_towers)),
        ### Embedding Layer ###
        PD(
            'embeddings_initializer',
            'Initializer for embedding weights',
            iscallable(),
            ## tf.contrib.layers.xavier_initializer()
            equalto('weights_initializer')),
        PD('embeddings_regularizer', 'L1 / L2 norm regularization', iscallableOrNone(),
           equalto('weights_regularizer')),
        ### ConvNet Params ###
        PD(
            'CONVNET',
            'ConvStackParams for the convent',
            instanceofOrNone(ConvStackParams),
            ## Value is set dynamically inside make_hyper
        ),
        PD('image_frame_width',
           'Width of an extra padding frame around the (possibly already padded) image. This extra padding is used '
           'in order to ensure that there is enough whites-space around the edges of the image, so as to enable VALID padding '
           'in the first conv-net layer without losing any information. The effect of doing this is to simulate SAME padding '
           'but using custom padding values (background color in this case) instead of zeroes (which is what SAME padding would do). '
           'This value should be equal to (kernel_size)//2 using kernel_size of the first convolution layer.',
           integer(),
           LambdaVal(lambda _, p: 0 if (p.build_image_context != 2) else (p.CONVNET.layers[0].kernel_shape[0]) // 2)
           ## Dynamically set to = (kernel_size-1)/2 given kernel_size of first conv-net layer
           ),
        PD('image_shape',
           'Shape of input images. Should be a python sequence.'
           '= image_shape_unpadded + image_frame_width around it',
           issequenceof(int),
           LambdaVal(lambda _, p: pad_image_shape(p.image_shape_unframed, p.image_frame_width))
           ## = get_image_shape(raw_data_folder, num_channels, image_frame_width)
           ),
        ### Decoder CALSTM Params ###
        PD(
            'CALSTM_STACK',
            'sequence of CALSTMParams, one for each CALSTM layer in the stack. The paper '
            "has code for more than one layer, but mentions that it is not well-tested. I take that to mean "
            "that the published results are based on one layer alone.",
            issequenceof(CALSTMParams)),
        ### Output MLP
        PD(
            'output_reuse_embeddings',
            '(boolean): Output layer in the paper has a special first layer which considers embedding weights as part of the first-layer weight matrix.'
            'Setting this value to True (default) will follow the paper"s logic. Otherwise'
            "a straight MLP will be used wherein all inputs (including Ey(t-1)) are first concatenated and fed into an MLP."
            "Including the softmax layer, the paper uses a minimum of 2 layers.",
            boolean,
            # True
        ),
        PD(
            'outputMLP_skip_connections',
            '(boolean): Applicable only when output_reuse_embeddings==False. Setting this value to False will cause'
            'image context (z_t) and sequence input (Ex_t) to not be fed into the output MLP. If True (Default), the'
            'output MLP receives a concatenation of Ex_t, h_t and z_t as input. If set to False, only h_t is input.',
            boolean,
            True),
        PD('output_first_layer',
           "Some params of first layer of output MLP if output_reuse_embeddings==True",
           instanceof(Properties)
           ## Value set dynamically inside self._trickledown() iff output_reuse_embeddings==True
           ),
        PD(
            'output_layers',
            "(MLPParams): Parameters for the output MLP. The last layer outputs the logits and therefore "
            "must have num_units = K. If output_reuse_embeddings==True, an additional initial layer is created "
            "with num_units = m and activtion tanh. Therefore the min number of layers is 2 in that case. "
            "Note: In the paper all layers have num_units=m except the last(softmax) layer.",
            instanceof(MLPParams),
            ## Value set dynamically inside self._trickledown()
        ),
        ### Initializer MLP ###
        PD(
            'build_init_model',
            """
            Boolean parameter specifying whether or not to build the LSTM init_state model. If set to False zero-state
            will be used for init-state, otherwise a init-state model will be created based on other init_model_* params.
            """,
            boolean),
        PD(
            'init_model_input_transform',
            """
            Transform to apply to the image-context input to the init model. Only applies if build_init_model == True.
            'mean' implies take a mean across the 'L' image-locations and produce an input of size (batchsize, D).
            'full' implies take in all the 'L' features and produce an input tensor of shape (batchsize, L*D).
            Note that with this option the # of parameters in the first layer will go up by a factor of L i.e.
            around 100x.
            """,
            ('mean', 'full')),
        PD(
            'init_model_hidden',
            'MLP stack for hidden layers of the init_state model. In addition to the stack specified here, an additional FC '
            "layer will be forked off at the top for each 'c' and 'h' state in the RNN Im2LatexDecoderRNN state."
            "Hence, this is a 'multi-headed' MLP because it has multiple top-layers."
            "By default their implementation has num_hidden_layers==0 (i.e. n_layers_init==1).",
            instanceof(MLPParams),
            ## Value set dynamically inside self._trickledown()
        ),
        PD(
            'init_model_final_layers',
            '',
            instanceof(FCLayerParams),
            ## Value set dynamically inside self._trickledown()
        ),
        ### Loss / Cost Layer ###decoder_lstm
        PD(
            'sum_logloss',
            'Whether to normalize log-loss per sample as in standard log perplexity '
            'calculation or whether to just sum up log-losses as in the paper. Defaults'
            'to True in conformance with the paper.',
            boolean,
            # True
        ),
        PD(
            'MeanSumAlphaEquals1',
            '(boolean): When calculating the alpha penalty, the paper uses the term: '
            'square{1 - sum_over_t{alpha_t_i}}). This assumes that the mean sum_over_t should be 1. '
            "However, that's not true, since the mean of sum_over_t term should be C/L. This "
            "variable if set to True, causes the term to change to square{C/L - sum_over_t{alpha_t_i}}). "
            "The default value is True in conformance with the paper.",
            boolean,
            # True
        ),
        PD(
            'pLambda',
            'Lambda value for alpha penalty, Setting this to zero turns off alpha_penalty.',
            (0.0, 0.0005, 0.005, 0.0001, 0.05),
            # LambdaVal(lambda _, p: 0.005 if p.build_scanning_RNN else 0.000)
        ),  # default in the show-and-tell paper is .00001?
        PD(
            'target_aae',
            """
            Target mean_norm_AAE value to shoot for. Varies with data-set. Value discovered by experimentation.
            """,
            (0., 51.42, 51.79),
            # LambdaVal(lambda _, p: None if (p.pLambda == 0) else 51.42)
        ),
        PD(
            'target_ase',
            """
            Target mean_norm_ASE value to shoot for. Varies with data-set. Value discovered by experimentation.
            """,
            (0.0, 5.27, 5.35, 10.0),
            # LambdaVal(lambda _, p: None if (p.pLambda == 0) else (10.0 if p.build_scanning_RNN else 5.27))
            LambdaVal(lambda _, p: None if (p.pLambda == 0) else 5.27)),
        PD(
            'k',
            'Number of top-scoring beams to consider for best-of-k metrics.',
            integer(1),
            # Value specified in run.py
        ))

    def __init__(self, initVals):
        """Initialise from ``initVals`` against ``self.proto``, then derive sub-params.

        The base initialiser is called with ``seal=False`` and
        ``_trickledown`` is run immediately afterwards to assign the
        dependent parameter objects.
        """
        dlc.HyperParams.__init__(self, self.proto, initVals, seal=False)
        self._trickledown()

    def _trickledown(self):
        """
        Trickle changes down to dependent parameters in sub-tree(s).
        (For same level dependencies use LambdaFunctions instead.)
        Call at the end of __init__ and end of update.

        Assigns ``output_layers`` always, ``output_first_layer`` only when
        ``output_reuse_embeddings`` is true, and the two ``init_model_*``
        entries only when ``build_init_model`` is true.

        Raises:
            AssertionError: if the configuration violates one of the
                structural constraints checked below.
        """
        ######## Output Model ########
        if self.output_reuse_embeddings:
            assert not self.build_scanning_RNN, 'Scanning RNN cannot reuse-embeddings because there are no embeddings'
            self.output_first_layer = FCLayerParams(self).updated({
                'num_units': self.m,
                'activation_fn': tf.nn.tanh,  # Shouldn't this be None?
                # dropout imported from outer scope
            }).freeze()

            self.output_layers = MLPParams(self).updated({
                # One layer with num_units = m is added if output_reuse_embeddings == True
                'op_name': 'yLogits_MLP',
                # dropout imported from outer scope
                'layers': (
                    ## paper has activation set to relu for all but the softmax layer
                    ## paper has all hidden layers with num_units = m.
                    # TODO: num_units should probably be self.K otherwise the model is a reverse pyramid
                    FCLayerParams(self).updated({
                        'num_units': 64,
                        'activation_fn': tf.nn.relu
                    }).freeze(),
                    ## Last layer must have num_units = K and activation_fn=None because it outputs logits.
                    FCLayerParams(self).updated({
                        'num_units': self.K,
                        'activation_fn': None,
                        'dropout': None
                    }).freeze(),
                )
            }).freeze()
        else:
            self.output_layers = MLPParams(self).updated({
                'op_name': 'yLogits_MLP',
                'layers': (
                    # paper has activation set to relu for all but the softmax layer
                    # paper has all hidden layers with num_units = m.
                    FCLayerParams(self).updated({
                        'num_units': 358,
                        'activation_fn': tf.nn.tanh
                    }).freeze(),
                    FCLayerParams(self).updated({
                        'num_units': 358,
                        'activation_fn': tf.nn.tanh
                    }).freeze(),
                    # Last layer must have num_units = K and activation_fn=None because it outputs logits.
                    FCLayerParams(self).updated({
                        'num_units': self.K,
                        'activation_fn': None,
                        'dropout': None
                    }).freeze(),
                )
            }).freeze()
            # NOTE(review): this check is scoped to the non-reuse branch, where
            # the hidden width is the hard-coded 358 -- confirm this nesting
            # against the original file layout.
            assert self.output_layers.layers[-2].num_units >= self.K

        # Identity comparison: None is a singleton.
        assert self.output_layers.layers[-1].activation_fn is None, 'The last layer must have linear activation because softmax is added later (since we need logits for efficient cross-entropy calculation)'
        if not self.output_reuse_embeddings:
            assert len(self.output_layers.layers) >= 2, "Need one hidden layer at least to match the paper's complexity."

        ######## Init Model ########
        if self.build_init_model:
            # Note: There are no hidden init layers by default in the Show&Tell paper
            self.init_model_hidden = MLPParams(self).updated({
                'layers': (
                    # Show&Tell paper sets hidden activations=relu
                    # The Show&Tell paper's source sets all hidden units to D
                    FCLayerParams(self).updated({
                        'num_units': min(self.D, 100),
                        'activation_fn': tf.nn.tanh
                    }).freeze(),
                )
            }).freeze()

            self.init_model_final_layers = FCLayerParams(self).updated({
                # Show&Tell paper sets final=tanh
                'activation_fn': tf.nn.tanh,
                'dropout': None
            }).freeze()

    def __copy__(self):
        """Return a new instance of the same class constructed from ``self`` (shallow copy)."""
        return self.__class__(self)

    def copy(self, override_vals=None):
        """Return a shallow copy of ``self`` with ``override_vals`` applied via ``updated``.

        Args:
            override_vals: mapping of values to override on the copy.
                ``None`` (the default) means no overrides; a fresh empty dict
                is used so that no dict object is shared between calls.
        """
        overrides = {} if override_vals is None else override_vals
        return self.__class__(self).updated(overrides)