def __init__(self, track_weight, ntoken, ninp, nhid, nout, nlevels, kernel_size=2, dilation=[1],
             dropout=0.0, dropouti=0.0, dropouth=0.0, dropoutl=0.0, emb_dropout=0.0, wdrop=0.0,
             temporalwdrop=True, tie_weights=True, repack=False, wnorm=True, aux=True,
             aux_frequency=20, n_experts=0, load=""):
    """
    A deep sequence model based on TrellisNet

    :param track_weight: The pretrained embedding weight matrix used to initialize the (frozen) encoder
    :param ntoken: The number of unique tokens
    :param ninp: The input dimension
    :param nhid: The hidden unit dimension (excluding the output dimension). In other words, if you
                 want to build a TrellisNet with hidden size 1000 and output size 400, you should set
                 nhid = 1000 - 400 = 600. (The reason we want to separate this is from Theorem 1.)
    :param nout: The output dimension
    :param nlevels: The number of TrellisNet layers
    :param kernel_size: Kernel size of the TrellisNet
    :param dilation: Dilation size of the TrellisNet
    :param dropout: Output (variational) dropout
    :param dropouti: Input (variational) dropout
    :param dropouth: Hidden-to-hidden (VD-based) dropout
    :param dropoutl: Mixture-of-Softmax dropout (only valid if MoS is used)
    :param emb_dropout: Embedding dropout
    :param wdrop: Weight dropout
    :param temporalwdrop: Whether we drop only the temporal parts of the weight (only valid if wdrop > 0)
    :param tie_weights: Whether to tie the encoder and decoder weights
    :param repack: Whether to use history repackaging for TrellisNet
    :param wnorm: Whether to apply weight normalization
    :param aux: Whether to use auxiliary loss (deep supervision)
    :param aux_frequency: The frequency of the auxiliary loss (only valid if aux == True)
    :param n_experts: The number of softmax "experts" (i.e., whether MoS is used)
    :param load: The path to the pickled weight file (the weights/biases should be in numpy format)
    """
    super(TrellisNetModel, self).__init__()
    self.emb_dropout = emb_dropout
    self.dropout = dropout      # Rate for dropping eventual output
    self.dropouti = dropouti    # Rate for dropping embedding output
    self.dropoutl = dropoutl
    self.var_drop = VariationalDropout()

    self.repack = repack
    self.nout = nout
    self.nhid = nhid
    self.ninp = ninp
    self.aux = aux
    self.n_experts = n_experts
    self.tie_weights = tie_weights
    self.wnorm = wnorm

    # 1) Set up encoder and decoder (embeddings)
    self.encoder = nn.Embedding.from_pretrained(track_weight, freeze=True)
    self.decoder = nn.Linear(nhid, nout)
    self.init_weights()
    if tie_weights:
        if nout != ninp and self.n_experts == 0:
            raise ValueError('When using the tied flag, nout must be equal to ninp (emsize)')
        self.decoder.weight = self.encoder.weight

    # 2) Set up TrellisNet
    tnet = TrellisNet
    self.tnet = tnet(ninp, nhid, nout=nout, nlevels=nlevels, kernel_size=kernel_size,
                     dropouth=dropouth, wnorm=wnorm, aux_frequency=aux_frequency, dilation=dilation)

    # 3) Set up MoS, if needed
    if n_experts > 0:
        print("Applied Mixture of Softmax")
        self.mixsoft = MixSoftmax(n_experts, ntoken, nlasthid=nout, ninp=ninp,
                                  decoder=self.decoder, dropoutl=dropoutl)

    # 4) Apply weight drop connect. If weight normalization is used, we apply the dropout to its
    #    "direction" instead of its "scale".
    reg_term = '_v' if wnorm else ''
    self.tnet = WeightDrop(self.tnet,
                           [['full_conv', 'weight1' + reg_term],
                            ['full_conv', 'weight2' + reg_term]],
                           dropout=wdrop,
                           temporal=temporalwdrop)

    self.network = nn.ModuleList([self.tnet])
    if n_experts > 0:
        self.network.append(self.mixsoft)

    # 5) Load model, if a path is specified
    if len(load) > 0:
        params_dict = torch.load(open(load, 'rb'))
        self.load_weights(params_dict)
        print("Model loaded successfully from {0}".format(load))
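
# --- Usage sketch (added for illustration; not part of the original source) ---
# A hedged example of constructing the pretrained-embedding variant defined above. It assumes this
# __init__ belongs to a TrellisNetModel class in the same module and that torch is already imported
# there; the embedding matrix and every hyperparameter value below are illustrative assumptions.
def _example_build_pretrained_model(emb_weights):
    """emb_weights: a hypothetical (ntoken, ninp) float tensor of pretrained embeddings."""
    ntoken, ninp = emb_weights.size(0), emb_weights.size(1)
    model = TrellisNetModel(track_weight=emb_weights, ntoken=ntoken, ninp=ninp,
                            nhid=600, nout=ninp,              # nout == ninp so tie_weights is valid
                            nlevels=30, kernel_size=2, dilation=[1],
                            wdrop=0.1, tie_weights=True,
                            load="")                          # empty path skips checkpoint loading
    return model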
class TrellisNetModel(nn.Module):
    def __init__(self, ntoken, ninp, nhid, nout, nlevels, kernel_size=2, dilation=[1],
                 dropout=0.0, dropouti=0.0, dropouth=0.0, dropoutl=0.0, emb_dropout=0.0,
                 wdrop=0.0, temporalwdrop=True, tie_weights=True, repack=False, wnorm=True,
                 aux=True, aux_frequency=20, n_experts=0):
        """
        A deep sequence model based on TrellisNet

        :param ntoken: The number of unique tokens
        :param ninp: The input dimension
        :param nhid: The hidden unit dimension (excluding the output dimension). In other words,
                     if you want to build a TrellisNet with hidden size 1000 and output size 400,
                     you should set nhid = 1000 - 400 = 600. (The reason we want to separate this
                     is from Theorem 1.)
        :param nout: The output dimension
        :param nlevels: The number of TrellisNet layers
        :param kernel_size: Kernel size of the TrellisNet
        :param dilation: Dilation size of the TrellisNet
        :param dropout: Output (variational) dropout
        :param dropouti: Input (variational) dropout
        :param dropouth: Hidden-to-hidden (VD-based) dropout
        :param dropoutl: Mixture-of-Softmax dropout (only valid if MoS is used)
        :param emb_dropout: Embedding dropout
        :param wdrop: Weight dropout
        :param temporalwdrop: Whether we drop only the temporal parts of the weight (only valid if wdrop > 0)
        :param tie_weights: Whether to tie the encoder and decoder weights
        :param repack: Whether to use history repackaging for TrellisNet
        :param wnorm: Whether to apply weight normalization
        :param aux: Whether to use auxiliary loss (deep supervision)
        :param aux_frequency: The frequency of the auxiliary loss (only valid if aux == True)
        :param n_experts: The number of softmax "experts" (i.e., whether MoS is used)
        """
        super(TrellisNetModel, self).__init__()
        self.emb_dropout = emb_dropout
        self.dropout = dropout      # Rate for dropping eventual output
        self.dropouti = dropouti    # Rate for dropping embedding output
        self.dropoutl = dropoutl
        self.var_drop = VariationalDropout()

        self.encoder = nn.Embedding(ntoken, ninp)
        self.repack = repack
        self.nout = nout
        self.nhid = nhid
        self.ninp = ninp
        self.aux = aux
        self.n_experts = n_experts

        network = TrellisNet
        self.network = network(ninp, nhid, nout=nout, nlevels=nlevels, kernel_size=kernel_size,
                               dropouth=dropouth, wnorm=wnorm, aux_frequency=aux_frequency,
                               dilation=dilation)

        # If weight normalization is used, we apply the weight dropout to its "direction", instead of "scale"
        reg_term = '_v' if wnorm else ''
        self.network = WeightDrop(self.network,
                                  [['full_conv', 'weight1' + reg_term],
                                   ['full_conv', 'weight2' + reg_term]],
                                  dropout=wdrop,
                                  temporal=temporalwdrop)

        self.decoder = nn.Linear(nhid, ntoken)
        self.network = nn.ModuleList([self.network])
        self.init_weights()

        if tie_weights:
            if nout != ninp and self.n_experts == 0:
                raise ValueError('When using the tied flag, nout must be equal to ninp (emsize)')
            self.decoder.weight = self.encoder.weight

        if n_experts > 0:
            print("Applied Mixture of Softmax")
            self.mixsoft = MixSoftmax(n_experts, ntoken, nlasthid=nout, ninp=ninp,
                                      decoder=self.decoder, dropoutl=dropoutl)
            self.network.append(self.mixsoft)

    def init_weights(self):
        initrange = 0.1
        self.encoder.weight.data.uniform_(-initrange, initrange)
        self.decoder.bias.data.fill_(0)
        self.decoder.weight.data.uniform_(-initrange, initrange)

    def forward(self, input, hidden, decode=True):
        """
        Execute the forward pass of the deep network

        :param input: The input sequence, with dimension (N, L)
        :param hidden: The initial hidden state (h, c)
        :param decode: Whether to use the decoder
        :return: The predicted sequence
        """
        emb = embedded_dropout(self.encoder, input, self.emb_dropout if self.training else 0)
        emb = self.var_drop(emb, self.dropouti)
        emb = emb.transpose(1, 2)

        trellisnet = self.network[0]
        raw_output, hidden, all_raw_outputs = trellisnet(emb, hidden)
        output = self.var_drop(raw_output, self.dropout)
        all_outputs = self.var_drop(all_raw_outputs, self.dropout, dim=4) if self.aux else None  # N x M x L x C

        decoded, all_decoded = None, None
        if self.n_experts > 0 and not decode:
            raise ValueError("Mixture of softmax involves decoding phase. Must set decode=True")

        if self.n_experts > 0:
            decoded = torch.log(self.mixsoft(output).add_(1e-8))
            all_decoded = torch.log(self.mixsoft(all_outputs).add_(1e-8)) if self.aux else None

        if decode:
            decoded = decoded if self.n_experts > 0 else self.decoder(output)
            if self.aux:
                all_decoded = all_decoded if self.n_experts > 0 else self.decoder(all_outputs)  # N x M x L x C
            return (raw_output, output, decoded), hidden, all_decoded

        return (raw_output, output, output), hidden, all_outputs

    def init_hidden(self, bsz):
        h_size = self.nhid + self.nout
        weight = next(self.parameters()).data
        return (Variable(weight.new(bsz, h_size, 1).zero_()),
                Variable(weight.new(bsz, h_size, 1).zero_()))
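
# --- Usage sketch (added for illustration; not part of the original source) ---
# A minimal, hypothetical example of running one forward pass of the word-level model above.
# The vocabulary size, model dimensions, batch size, and sequence length are all assumptions made
# for demonstration, and the sketch relies on this module's own imports (torch, etc.).
def _example_language_model_step():
    model = TrellisNetModel(ntoken=10000, ninp=400, nhid=600, nout=400,
                            nlevels=30, kernel_size=2, dilation=[1])
    model.eval()                                         # disable the various dropouts
    tokens = torch.randint(0, 10000, (16, 35))           # (N, L) batch of token indices
    hidden = model.init_hidden(16)                       # zero-initialized (h, c) state
    (raw_output, output, decoded), hidden, all_decoded = model(tokens, hidden, decode=True)
    return decoded                                       # decoder scores over the vocabulary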
def __init__(self, ninp, nhid, nout, nlevels, kernel_size=2, dilation=[1],
             dropout=0.0, dropouti=0.0, dropouth=0.0, wdrop=0.0, temporalwdrop=True,
             wnorm=True, aux=False, aux_frequency=1e4):
    """
    A sequence model using TrellisNet (on sequential MNIST & CIFAR-10). Note that this is
    different from the models in other tasks (e.g. word-level PTB) because: 1) there is no
    more embedding; 2) we only need one output at the end for classification of the pixel
    stream; and 3) the input and output features are very low-dimensional (e.g., 3 channels).

    :param ninp: The number of input channels of the pixels
    :param nhid: The number of hidden units in TrellisNet (excluding the output size)
    :param nout: The number of output channels (which should agree with the number of classes)
    :param nlevels: The number of TrellisNet layers
    :param kernel_size: Kernel size of the TrellisNet
    :param dilation: Dilation size of the TrellisNet
    :param dropout: Output dropout
    :param dropouti: Input dropout
    :param dropouth: Hidden-to-hidden (VD-based) dropout
    :param wdrop: Weight dropout
    :param temporalwdrop: Whether we drop only the temporal parts of the weight (only valid if wdrop > 0)
    :param wnorm: Whether to apply weight normalization
    :param aux: Whether to use auxiliary loss (deep supervision)
    :param aux_frequency: The frequency of the auxiliary loss (only valid if aux == True)
    """
    super(TrellisNetModel, self).__init__()
    self.nout = nout    # Should be the number of classes
    self.nhid = nhid
    self.dropout = dropout
    self.dropouti = dropouti
    self.aux = aux

    network = TrellisNet
    self.network = network(ninp, nhid, nout=nout, nlevels=nlevels, kernel_size=kernel_size,
                           dropouth=dropouth, wnorm=wnorm, aux_frequency=aux_frequency,
                           dilation=dilation)

    # If weight normalization is used, we apply the weight dropout to its "direction", instead of "scale"
    reg_term = '_v' if wnorm else ''
    self.network = WeightDrop(self.network,
                              [['full_conv', 'weight1' + reg_term],
                               ['full_conv', 'weight2' + reg_term]],
                              dropout=wdrop,
                              temporal=temporalwdrop)

    self.linear = nn.Linear(nout, nout)
    self.network = nn.ModuleList([self.network])
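
# --- Usage sketch (added for illustration; not part of the original source) ---
# An illustrative construction of the pixel-level classifier variant above for sequential MNIST:
# one input channel per pixel and ten output classes, as described in the docstring. The remaining
# hyperparameters are assumptions, and the call assumes this __init__ sits inside the
# TrellisNetModel class of its own module.
def _example_build_pixel_classifier():
    model = TrellisNetModel(ninp=1, nhid=120, nout=10, nlevels=8,
                            kernel_size=2, dilation=[1], dropout=0.1, wdrop=0.1)
    return model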