def argument():
    arg = optimize.argument()

    arg.dropout_type = 'vanilla' # 'vanilla', 'broadcast', 'alpha'
    arg.ffd = 'transformer_ffd' # 'transformer_ffd' 'sru' 'sepconv'
    arg.loss = 'sparse_softmax_cross_entropy_with_logits'
    arg.pos = 'timing' # 'timing' 'emb' 'linear_stop' 'tanh_stop' 'exp_stop'

    arg.decoder_layers = 4 # the number of decoder layers
    arg.encoder_layers = 4 # the number of encoder layers
    arg.filter_size = 1024 # the filter size
    arg.head_size = 64 # the size of each head in the attention mechanisms
    arg.hidden_size = 256 # the hidden size
    arg.input_max_length = 10 # the maximum sequence length of the input, used by the 'emb' pos
    arg.input_vocab_size = 1000 # the vocab size of the input
    arg.label_smoothing = 1.0 # the label-smoothing hyperparameter
    arg.max_relative_position = 100 # the maximum relative position for relative attention
    arg.num_heads = 8 # the number of heads in the attention mechanisms
    arg.target_max_length = 10 # the maximum sequence length of the output, used by the 'emb' pos
    arg.target_vocab_size = 1000 # the vocab size of the targets
    arg.weight_decay_hyperparameter = 0.001 # the weight-decay hyperparameter

    arg.adaptive_mask = False # whether an adaptive mask is used
    arg.classification = False # whether the final output is a sequence or a single label
    arg.deparameterize = False # KEEP AS FALSE
    arg.dynamic_attention_span = False # KEEP AS FALSE
    arg.mask_loss = False # whether parts of the loss are masked
    arg.relative_attention = False # whether to use relative attention
    arg.unidirectional_decoder = True # whether the decoder is unidirectional
    arg.unidirectional_encoder = False # whether the encoder is unidirectional
    arg.use_decoder = True # whether to use the decoder
    arg.use_mos = False # whether to use a Mixture of Softmaxes (MoS)
    arg.use_relu = True # whether the activation functions are ReLU or GELU
    arg.weight_decay_regularization = False # whether to use weight decay

    return arg
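Since every setting lives on the returned object, individual fields can be overridden after calling argument(). The sketch below is illustrative only; the override values are assumptions, not recommended settings.

arg = argument()                 # build the default Transformer configuration above
arg.relative_attention = True    # illustrative override: switch to relative attention
arg.max_relative_position = 20   # clip relative offsets more tightly than the default 100
arg.encoder_layers = 6           # deepen the encoder while keeping the other defaults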
def argument():
    arg = optimize.argument()

    arg.dropout_type = 'vanilla' # 'vanilla', 'broadcast', 'alpha'
    arg.ffd = 'transformer_ffd' # 'transformer_ffd' 'sru' 'sepconv'
    arg.loss = 'sparse_softmax_cross_entropy_with_logits'
    arg.pos = 'timing' # 'timing' 'emb' 'linear_stop' 'tanh_stop' 'exp_stop'

    arg.act_epsilon = 0.001 # the epsilon hyperparameter used by the ACT halting mechanism
    arg.act_loss_weight = 0.01 # the weight of the auxiliary ACT loss relative to the total model loss
    arg.filter_size = 1024 # the filter size
    arg.head_size = 64 # the size of each head
    arg.hidden_size = 256 # the hidden size of the model
    arg.input_max_length = 10 # the maximum sequence length of the input
    arg.input_vocab_size = 1000 # the input vocab size
    arg.label_smoothing = 1.0 # the label-smoothing hyperparameter
    arg.max_encoder_steps = 8 # the maximum number of encoder steps (layers)
    arg.max_decoder_steps = 8 # the maximum number of decoder steps (layers)
    arg.max_relative_position = 100 # the maximum relative position for relative attention
    arg.num_heads = 8 # the number of heads in the self-attention mechanisms
    arg.target_max_length = 10 # the maximum sequence length of the target
    arg.target_vocab_size = 1000 # the target vocab size
    arg.weight_decay_hyperparameter = 0.001 # the weight-decay hyperparameter

    arg.classification = False # whether the final output is a sequence or a single label
    arg.deparameterize = False # KEEP AS FALSE
    arg.mask_loss = True # whether parts of the loss are masked
    arg.relative_attention = False # whether to use relative attention
    arg.unidirectional_decoder = True # whether the decoder is unidirectional
    arg.unidirectional_encoder = False # whether the encoder is unidirectional
    arg.use_act = False # whether the Universal Transformer uses an ACT mechanism
    arg.use_decoder = True # whether to use the decoder
    arg.use_mos = False # whether to use a Mixture of Softmaxes (MoS)
    arg.use_relu = True # whether the activation functions are ReLU or GELU
    arg.weight_decay_regularization = False # whether to use weight decay

    return arg
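Enabling ACT only requires flipping use_act and, optionally, retuning the ACT hyperparameters already defined above. A small illustrative sketch; the values are assumptions, not tuned settings.

arg = argument()              # build the default Universal Transformer configuration above
arg.use_act = True            # illustrative override: enable the ACT halting mechanism
arg.act_epsilon = 0.01        # try a different halting epsilon (illustrative value)
arg.act_loss_weight = 0.001   # down-weight the auxiliary ACT loss (illustrative value)
arg.max_encoder_steps = 12    # allow more encoder steps before halting is forced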
def argument():
    arg = optimize.argument()

    arg.dropout_type = 'vanilla'
    arg.embed_dim = 24
    arg.kernel_height = 1
    arg.kernel_width = 3
    arg.layer = 2
    arg.vocab_size = 20
    arg.width = 3

    return arg
def argument():
    arg = optimize.argument()

    arg.cell = 'gru' # the RNN cell type: either 'gru' or 'lstm'
    arg.loss = 'sparse_softmax_cross_entropy_with_logits'

    arg.hidden_dim = 256 # the hidden size of the model
    arg.label_smoothing = 1.0 # the label-smoothing hyperparameter
    arg.layers = 2 # the number of RNN layers
    arg.input_vocab_size = 1000 # the vocab size of the input sequence
    arg.target_vocab_size = 1000 # the vocab size of the target sequence
    arg.weight_decay_hyperparameter = 0.001 # the weight-decay hyperparameter

    arg.classification = True # whether the output is a sequence or a single label
    arg.mask_loss = True # whether to mask parts of the loss
    arg.unidirectional = True # whether the RNN is strictly unidirectional
    arg.weight_decay_regularization = False # whether weight decay is used

    arg.hidden_size = arg.hidden_dim
    return arg
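Because hidden_size is copied from hidden_dim on the last line of argument(), any override of hidden_dim should re-sync the two. An illustrative sketch; the values are assumptions.

arg = argument()                  # build the default RNN configuration above
arg.cell = 'lstm'                 # illustrative override: swap the GRU cells for LSTM cells
arg.hidden_dim = 512              # widen the model (illustrative value)
arg.hidden_size = arg.hidden_dim  # keep hidden_size in sync, mirroring argument() above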
def argument():
    arg = optimize.argument()

    arg.cell = 'gru' # the RNN cell type: either 'gru' or 'lstm'
    arg.dropout_type = 'vanilla' # there is no SELU activation function, so there is no reason to use alpha dropout
    arg.loss = 'sparse_softmax_cross_entropy_with_logits' # the loss function used
    arg.stop_feature = 'linear' # the stop feature used: 'linear' 'tanh' 'exp' 'none'

    arg.gamma = 0.1 # the gamma hyperparameter used by the stop feature
    arg.hidden_dim = 128 # the hidden size
    arg.input_vocab_size = 83 # the input vocab size
    arg.label_smoothing = 1.0 # the label-smoothing hyperparameter
    arg.layers = 2 # the number of RNN layers
    arg.target_vocab_size = 120 # the target vocab size
    arg.weight_decay_hyperparameter = 0.001 # the weight-decay hyperparameter

    arg.mask_loss = True # whether parts of the loss are masked
    arg.use_attention = True # whether the output RNN cells use an attention mechanism
    arg.weight_decay_regularization = False # whether weight decay is used

    return arg
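Switching stop features is likewise just a field override on the returned object. An illustrative sketch; the values are assumptions, not tuned settings.

arg = argument()            # build the default attention-RNN configuration above
arg.stop_feature = 'tanh'   # illustrative override: use the 'tanh' stop feature instead of 'linear'
arg.gamma = 0.05            # pair it with a smaller gamma (illustrative value)
arg.use_attention = False   # or drop the attention mechanism entirely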