def __init__(self, input_dim=None, hidden_dim=200, output_dim=100,
             batch_size=1, p_dropout=0.2, num_layers=2):
    super(LSTMModel, self).__init__()
    self.hidden_dim = hidden_dim
    self.batch_size = batch_size
    self.bidirectional = True
    self.num_layers = num_layers
    self.bidir_mult = 2 if self.bidirectional else 1
    self.dimension_mult = self.num_layers * self.bidir_mult
    # The LSTM takes sequences of spectrograms/MFCCs as inputs, and outputs hidden states
    # with dimensionality hidden_dim.
    self.lstm = to_gpu(nn.LSTM(input_dim,
                               hidden_dim,
                               bidirectional=self.bidirectional,
                               num_layers=self.num_layers,
                               dropout=p_dropout))
    self.dropout_1 = nn.Dropout(p_dropout)
    # The linear layer that maps from hidden state space to tag space
    self.hidden2tag = to_gpu(nn.Linear(hidden_dim * self.bidir_mult, output_dim))
    self.reset_hidden()
def predict(self, obs, action=None, remember_step=True):
    try:
        # past actions come in as numpy
        obs = to_gpu(torch.from_numpy(obs))
    except TypeError:
        # None actions for the first step, or obs is already a tensor
        pass
    phi = self.network.phi_body(obs, remember_step=remember_step)
    phi_a = self.network.actor_body(phi, remember_step=remember_step)
    phi_v = self.network.critic_body(phi, remember_step=remember_step)
    logits = self.network.fc_action(phi_a)
    v = self.network.fc_critic(phi_v)
    if self.mask_gen is not None:
        if not remember_step:
            # mask_gen is stateful, so need to copy it if we don't want to remember the step
            tmp_mask_gen = copy.deepcopy(self.mask_gen)
            mask = tmp_mask_gen(obs)
        else:
            mask = self.mask_gen(obs)
        logits = logits - to_gpu(torch.Tensor(1e6 * (1 - mask)))
    dist = torch.distributions.Categorical(logits=logits)
    if action is None:
        action = dist.sample().to(torch.int64)
    log_prob = dist.log_prob(action).unsqueeze(-1)
    return action, log_prob, dist.entropy().unsqueeze(-1), v
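# Illustrative sketch (not part of the original code): predict() above knocks out invalid
# actions by subtracting a large constant from their logits before sampling. This standalone
# example, using only torch, shows that masking pattern in isolation.
import torch

example_logits = torch.randn(2, 5)                    # batch of 2, 5 possible actions
example_mask = torch.tensor([[1, 1, 0, 0, 1],
                             [0, 1, 1, 1, 0]], dtype=torch.float32)
masked_logits = example_logits - 1e6 * (1 - example_mask)  # masked actions get ~-1e6 logits
masked_dist = torch.distributions.Categorical(logits=masked_logits)
sampled = masked_dist.sample()                        # only unmasked actions can be drawn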
def __getitem__(self, idx):
    if idx < self.batches:
        x = to_gpu(torch.randn(self.x_shape))
        y = to_gpu(self.model(Variable(x)).data)
        return (x, y)
    else:
        raise StopIteration()
def __init__(self, model, output_dim, drop_rate=0.2):
    super().__init__()
    self.z_size = output_dim
    self.model = model
    # self.dropout = nn.Dropout(drop_rate)
    self.fc_mu = to_gpu(nn.Linear(self.model.output_shape[-1], output_dim))
    self.fc_var = to_gpu(nn.Linear(self.model.output_shape[-1], output_dim))
    self.fc_skew = to_gpu(nn.Linear(self.model.output_shape[-1], output_dim))
    self.output_shape = [None, output_dim]
def scoring_fun(x):
    if isinstance(x, (tuple, list)):
        x = {'actions': x[0], 'smiles': x[1]}
    out_x = to_gpu(x['actions'])
    end_of_slice = randint(3, out_x.size()[1])  # TODO: inject random slicing back
    out_x = out_x[:, 0:end_of_slice]
    smiles = x['smiles']
    scores = to_gpu(torch.from_numpy(property_scorer(smiles).astype(np.float32)))
    return out_x, scores
def to_pytorch(x):
    if 'ndarray' in str(type(x)):
        if 'bool' in str(type(x[0])):
            x = np.array([1.0 if xi else 0.0 for xi in x]).astype(np.float32)
            return to_gpu(torch.from_numpy(x)).to(torch.float32)
        elif 'float' in str(type(x[0])):
            return to_gpu(torch.from_numpy(x)).to(torch.float32)
        else:
            return to_gpu(torch.from_numpy(x))
    else:
        return x
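# Illustrative sketch (standalone, torch + numpy only): torch.from_numpy() keeps the numpy
# dtype, which is why to_pytorch() above special-cases bool and float inputs and casts the
# result to float32 where needed.
import numpy as np
import torch

bool_arr = np.array([True, False, True])
float_arr = np.array([0.5, 1.5], dtype=np.float64)

as_float = torch.from_numpy(bool_arr.astype(np.float32))   # tensor([1., 0., 1.])
as_f32 = torch.from_numpy(float_arr).to(torch.float32)     # cast down from float64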
def __init__(self,
             num_actions,
             max_seq_len,
             n_layers=6,
             n_head=6,  # 8
             d_k=16,  # 64
             d_v=16,  # 64
             d_model=128,  # 512
             d_inner_hid=256,  # 1024
             drop_rate=0.1,
             enc_output_size=76,
             batch_size=None):
    super().__init__()  # TODO: properly integrate this with the parent
    n_position = max_seq_len + 1  # Why the +1? Because of the dummy prev action for the first step
    self.max_seq_len = max_seq_len
    self.d_model = d_model
    self.enc_output_size = enc_output_size
    self.batch_size = batch_size
    self.position_enc = nn.Embedding(n_position, d_model, padding_idx=Constants.PAD)
    self.position_enc.weight.data = position_encoding_init(n_position, d_model)
    self.position_enc.weight.requires_grad = False  # will this suffice to make them not trainable?
    # TODO: do we want relu after embedding? Probably not; make consistent
    self.embedder = nn.Embedding(num_actions, d_model,
                                 padding_idx=num_actions - 1)  # assume the padding index is the max possible?
    self.dropout = nn.Dropout(drop_rate)
    self.layer_stack = nn.ModuleList([
        DecoderLayerStep(d_model, d_inner_hid, n_head, d_k, d_v, dropout=drop_rate)
        for _ in range(n_layers)
    ])
    # make sure the encoder output has the correct dimension
    if enc_output_size != self.d_model:
        self.enc_output_transform = to_gpu(nn.Linear(enc_output_size, self.d_model))
    else:
        self.enc_output_transform = lambda x: x
    self.dec_output_transform = to_gpu(nn.Linear(self.d_model, num_actions))
    self.all_actions = None
    self.output_shape = [None, self.max_seq_len, num_actions]
def init_hidden(self, batch_size=None):
    if batch_size is None:
        batch_size = self.batch_size
    # Before we've done anything, we don't have any hidden state.
    # Refer to the PyTorch documentation to see exactly why it has this dimensionality.
    # The axes semantics are (num_layers * num_directions, minibatch_size, hidden_dim).
    return (autograd.Variable(
                to_gpu(torch.zeros(self.dimension_mult, batch_size, self.hidden_dim))),
            autograd.Variable(
                to_gpu(torch.zeros(self.dimension_mult, batch_size, self.hidden_dim))))
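# Illustrative sketch (standalone): for a bidirectional, multi-layer LSTM the hidden and cell
# states have shape (num_layers * num_directions, batch, hidden_dim), which is what
# dimension_mult accounts for above. Sizes here are arbitrary.
import torch
import torch.nn as nn

lstm = nn.LSTM(input_size=13, hidden_size=200, num_layers=2, bidirectional=True)
x = torch.randn(15, 4, 13)           # seq_len x batch x input_dim
h0 = torch.zeros(2 * 2, 4, 200)      # num_layers * num_directions, batch, hidden_dim
c0 = torch.zeros(2 * 2, 4, 200)
out, (hn, cn) = lstm(x, (h0, c0))    # out: seq_len x batch x (2 * hidden_dim)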
def a2c_sequence(name='a2c_sequence', task=None, body=None):
    config = Config()
    config.num_workers = batch_size  # same thing as batch size
    config.task_fn = lambda: task
    config.optimizer_fn = lambda params: torch.optim.RMSprop(params, lr=0.0007)
    config.network_fn = lambda state_dim, action_dim: \
        to_gpu(CategoricalActorCriticNet(state_dim, action_dim, body, gpu=0, mask_gen=mask_gen))
    # config.policy_fn = SamplePolicy  # not used
    config.state_normalizer = lambda x: x
    config.reward_normalizer = lambda x: x
    config.discount = 0.99
    config.use_gae = False  # TODO: for now, MUST be False as our RNN network isn't compatible with it
    config.gae_tau = 0.97
    config.entropy_weight = 0.01
    config.rollout_length = 5
    config.gradient_clip = 0.5
    config.logger = logging.getLogger()  # get_logger(file_name='deep_rl_a2c', skip=True)
    config.logger.info('test')
    config.iteration_log_interval
    config.max_steps = 100000
    dash_name = 'DeepRL'
    visdom = Dashboard(dash_name)
    run_iterations(MyA2CAgent(config), visdom, invalid_value=invalid_value)
def forward(self, src_seq, src_pos=None):
    '''
    Embed the source sequence, with optional position specification
    :param src_seq: batch x num_steps long, or batch x num_steps x src_vocab one-hot or float
    :param src_pos: batch x num_steps long, or None
    :return:
    '''
    if isinstance(src_seq, tuple):
        predefined_emb = src_seq[1]
        src_seq = src_seq[0]
    src_seq, src_seq_for_masking = self.normalizer(src_seq)
    if src_seq.dtype == torch.int64:
        # indices of discrete actions
        enc_input = self.src_word_emb(src_seq)
        if self.include_predefined:
            enc_input = torch.cat([enc_input, predefined_emb], dim=2)
        enc_input = self.transform_to_d_model(enc_input)
    elif src_seq.dtype == torch.float32 and len(src_seq.size()) == 3:
        # the input is continuous
        enc_input = src_seq
        if self.include_predefined:
            enc_input = torch.cat([enc_input, predefined_emb], dim=2)
        enc_input = self.transform_to_d_model(enc_input)
    # Position encoding addition
    if self.encode_position:
        if src_pos is None:
            batch_size = src_seq.size()[0]
            seq_len = src_seq.size()[1]
            src_pos = to_gpu(torch.arange(seq_len).unsqueeze(0)
                             .expand(batch_size, seq_len).type(LongTensor))
        enc_input += self.position_enc(src_pos)
    return enc_input, src_seq_for_masking
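# Illustrative sketch (standalone): when src_pos isn't given, the forward() above builds
# default position indices 0..seq_len-1 and repeats them for every batch element before
# looking them up in the position-encoding embedding.
import torch

batch_size, seq_len = 3, 5
default_src_pos = torch.arange(seq_len).unsqueeze(0).expand(batch_size, seq_len)
# tensor([[0, 1, 2, 3, 4],
#         [0, 1, 2, 3, 4],
#         [0, 1, 2, 3, 4]])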
def forward(self, last_action=None, last_action_pos=None):
    '''
    One step of the RNN model
    :param enc_output: batch x z_size, so don't support sequences
    :param last_action: batch of ints, all equaling None for first step
    :param last_action_pos: ignored, used by the attention decoder, here just to get the signature right
    :return:
    '''
    if self.hidden is None:
        # first step after reset
        # need to do it here as batch size might be different for each sequence
        self.hidden = self.init_hidden(batch_size=self.batch_size)
        self.one_hot_action = to_gpu(torch.zeros(self.batch_size, self.output_feature_size))
    encoded = self.encode(self.enc_output, last_action)
    # copy the latent state to length of sequence, instead of sampling inputs
    embedded = F.relu(self.fc_input(self.batch_norm(encoded))) \
        .view(self.batch_size, 1, self.hidden_n) \
        .repeat(1, self.max_seq_length, 1)
    embedded = self.dropout_1(embedded)
    # run the GRU on it
    out_3, self.hidden = self.gru_1(embedded, self.hidden)
    # tmp has dim (batch_size*seq_len) x hidden_n, so we can apply the linear transform to it
    tmp = self.dropout_2(out_3.contiguous().view(-1, self.hidden_n))
    out = self.fc_out(tmp).view(self.batch_size, self.max_seq_length, self.output_feature_size)
    # just return the logits
    # self.hidden = None
    return out  # , hidden_1
def encode(self, x):
    '''
    :param x: a numpy array, batch x seq x feature
    :return:
    '''
    out, hidden = self.forward(to_gpu(Variable(FloatTensor(x))))
    return out.data.cpu().numpy()
def __init__(self, model):
    '''
    Wrapper for a continuous decoder that doesn't look at the last action chosen, e.g. a simple RNN
    :param model:
    '''
    super().__init__()
    self.model = to_gpu(model)
    self.model.eval()
def to_variable(x):
    if type(x) == tuple:
        return tuple([to_variable(xi) for xi in x])
    elif 'ndarray' in str(type(x)):
        return to_gpu(torch.from_numpy(x))
    elif 'Variable' not in str(type(x)):
        return Variable(x)
    else:
        return x
def decode(self, z):
    '''
    Converts a batch of latent space vectors into a batch of action ints
    :param z: batch x z_size
    :return: smiles: list(str) of len batch, actions: LongTensor batch_size x max_seq_len
    '''
    actions, logits = self.decoder(to_gpu(z))
    smiles = self.decode_from_actions(actions)
    return smiles, actions
def forward(self, last_action, *args, **kwargs):
    '''
    One step of the RNN model
    :param last_action: batch of ints, all equaling None for first step
    :return: batch x feature_len zeros
    '''
    self.register_step()
    if self.output_shape[0] is None:
        self.output_shape[0] = len(last_action)
    return self.dummy_fc(to_gpu(torch.zeros(*self.output_shape)))
def gen():
    iter1 = iter(self.main_loader)
    iter2 = iter(self.valid_ds)
    iter3 = iter(self.invalid_ds)
    while True:
        # make sure we iterate fully over the first dataset; the others will likely be shorter
        x1 = next(iter1).float()
        try:
            x2 = next(iter2).float()
        except StopIteration:
            iter2 = iter(self.valid_ds)
            x2 = next(iter2).float()
        try:
            x3 = next(iter3).float()
        except StopIteration:
            iter3 = iter(self.invalid_ds)
            x3 = next(iter3).float()
        x = to_gpu(torch.cat([x1, x2, x3], dim=0))
        y = to_gpu(torch.zeros([len(x), 1]))
        y[:(len(x1) + len(x2))] = 1
        yield x, y
def __init__(self, config):
    '''
    A new agent gets spawned for every new sequence, reusing the same network.
    So this is where we need to call init_encoder_output on the network, so it knows a new sequence has started.
    :param config: the DeepRL config object
    '''
    super().__init__(config)
    try:
        self.dummy_enc_output = to_gpu(torch.zeros(config.num_workers, 5))  # 5 just because :)
        self.network.network.phi_body.model.init_encoder_output(self.dummy_enc_output)
    except AttributeError:
        # the body model may not have an init_encoder_output method
        pass
def forward(self, last_action=None, last_action_pos=None, remember_step=True):
    '''
    One step of the RNN model
    :param enc_output: batch x z_size, so don't support sequences
    :param last_action: batch of ints, all equaling None for first step
    :param last_action_pos: ignored, used by the attention decoder, here just to get the signature right
    :return: batch x steps x feature_len
    '''
    # check we don't exceed max sequence length
    # TODO: use parent's method instead
    if self.n == self.max_seq_length:
        raise StopIteration()
    if remember_step:
        self.n += self.steps
    if self.enc_output is None:
        self.batch_size = len(last_action)
        self.enc_output = torch.zeros(self.batch_size, self.z_size, device=device)
    if self.hidden is None:
        # first step after reset
        # need to do it here as batch size might be different for each sequence
        self.hidden = self.init_hidden(batch_size=self.batch_size)
        self.one_hot_action = to_gpu(torch.zeros(self.batch_size, self.output_feature_size))
    encoded = self.encode(self.enc_output, last_action)
    # copy the latent state to length of sequence, instead of sampling inputs
    embedded = F.relu(self.fc_input(
        encoded
        # self.layer_norm(encoded)
        # self.batch_norm(encoded)  # we don't want to batch norm one-hot encoded actions!
    )) \
        .view(self.batch_size, 1, self.hidden_n) \
        .repeat(1, self.steps, 1)
    embedded = self.dropout_1(embedded)
    # run the GRU on it
    out_3, new_hidden = self.gru_1(embedded, self.hidden)
    if remember_step:
        self.hidden = new_hidden
    # don't need the linear mapping below as that'll be done by the relevant head
    # tmp has dim (batch_size*seq_len) x hidden_n, so we can apply the linear transform to it
    tmp = self.dropout_2(out_3.contiguous().view(-1, self.hidden_n))
    out = self.fc_out(tmp).view(self.batch_size, self.steps, self.output_feature_size)
    return out
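# Illustrative sketch (standalone): the decoder above runs its GRU one chunk at a time and
# carries self.hidden between calls. This is the bare pattern with a plain nn.GRU; sizes and
# the number of steps are arbitrary.
import torch
import torch.nn as nn

gru = nn.GRU(input_size=8, hidden_size=16, batch_first=True)
hidden = None                            # None lets the GRU start from zeros
for _ in range(3):                       # three decoding steps
    step_input = torch.randn(4, 1, 8)    # batch of 4, one timestep each
    out, hidden = gru(step_input, hidden)  # keep the returned hidden for the next step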
def __init__(self, encoder=None, decoder=None, sample_z=True, epsilon_std=0.01,
             z_size=None, return_mu_log_var=True):
    '''
    Initialize the autoencoder
    :param encoder: a model mapping batches of one-hot sequences (batch x seq x num_actions) to batches of logits
    :param decoder: a model mapping latent z (batch x z_size) to batches of one-hot sequences, and the corresponding logits
    :param sample_z: whether to sample z = N(mu, std) or just take z = mu
    :param epsilon_std: scaling factor for sampling, low values help convergence
        https://github.com/mkusner/grammarVAE/issues/7
    '''
    super(VariationalAutoEncoderHead, self).__init__()
    self.sample_z = sample_z
    self.encoder = to_gpu(encoder)
    self.decoder = to_gpu(decoder)
    self.epsilon_std = epsilon_std
    # TODO: should I be using the MultipleOutputHead instead?
    self.mu_var_layer = to_gpu(MeanVarianceSkewHead(self.encoder, z_size))
    self.output_shape = [None, z_size]
    self.return_mu_log_var = return_mu_log_var
def get_encoder(feature_len=12,
                max_seq_length=15,
                cnn_encoder_params={'kernel_sizes': (2, 3, 4),
                                    'filters': (2, 3, 4),
                                    'dense_size': 100},
                drop_rate=0.0,
                encoder_type='cnn',
                rnn_encoder_hidden_n=200):
    if encoder_type == 'rnn':
        rnn_model = SimpleRNN(hidden_n=rnn_encoder_hidden_n,
                              feature_len=feature_len,
                              drop_rate=drop_rate)
        encoder = to_gpu(AttentionAggregatingHead(rnn_model, drop_rate=drop_rate))
    elif encoder_type == 'cnn':
        encoder = to_gpu(SimpleCNNEncoder(params=cnn_encoder_params,
                                          max_seq_length=max_seq_length,
                                          feature_len=feature_len,
                                          drop_rate=drop_rate))
    elif encoder_type == 'attention':
        encoder = to_gpu(AttentionAggregatingHead(
            TransformerEncoder(feature_len,
                               max_seq_length,
                               dropout=drop_rate,
                               padding_idx=feature_len - 1),
            drop_rate=drop_rate))
    else:
        raise NotImplementedError()
    return encoder
def concat(x):
    if type(x) in (list, tuple):
        if type(x[0]) == torch.Tensor:
            return to_gpu(torch.cat(x, dim=0))
        elif type(x[0]) in (list, tuple):
            out = []
            for i in x:
                out += i
            return out
            # return [concat(elem) for elem in zip(*x)]
        elif type(x[0]) in (dict, OrderedDict):
            raise NotImplementedError("Can't handle loaders returning dicts yet")
        else:
            return x
    else:
        raise ValueError("Can only concatenate lists and tuples as yet")
def forward(self, input_seq, hidden=None):
    '''
    :param input_seq: batch_size x seq_len x feature_len
    :param hidden: hidden state from earlier
    :return: batch_size x hidden_n
    '''
    batch_size = input_seq.size()[0]
    if hidden is None:
        hidden = Variable(to_gpu(torch.zeros(self.dimension_mult, batch_size, self.hidden_n)),
                          requires_grad=False)  # self.init_hidden(batch_size)
    # run the GRU on it
    gru_out, hidden = self.gru_1(input_seq, hidden)
    out = self.normalize_output(self.dropout(F.relu(gru_out)))
    return out
def forward(self, last_action=None, last_action_pos=None, remember_step=True):
    '''
    One step of the RNN model
    :param enc_output: batch x z_size, so don't support sequences
    :param last_action: batch of ints, all equaling None for first step
    :param last_action_pos: ignored, used by the attention decoder, here just to get the signature right
    :return: batch x steps x feature_len
    '''
    # check we don't exceed max sequence length
    if self.n == self.max_seq_length:
        raise StopIteration()
    if remember_step:
        self.n += self.steps
    if self.one_hot_action is None:
        # first step after reset
        # need to do it here as batch size might be different for each sequence
        self.one_hot_action = to_gpu(torch.zeros(self.batch_size, self.output_feature_size))
    encoded = self.encode(self.enc_output, last_action)
    # copy the latent state to length of sequence, instead of sampling inputs
    embedded = F.relu(self.fc_input(
        # self.batch_norm(encoded)
        encoded
    )) \
        .view(self.batch_size, 1, self.hidden_n)  # \
    # .repeat(1, self.steps, 1)
    out = self.dropout_1(embedded)
    # run the GRUs on it
    for dec_layer in self.layer_stack:
        out = dec_layer(out, remember_step)
    # tmp has dim (batch_size*seq_len) x hidden_n, so we can apply the linear transform to it
    # tmp = self.dropout_2(out.contiguous().view(-1, self.hidden_n))
    tmp = out.contiguous().view(-1, self.hidden_n)
    out = self.fc_out(tmp).view(self.batch_size, 1, self.output_feature_size)
    return out
def forward(self, inputs):
    '''
    Calculates targets and values for simple deep Q-learning
    :param z: latent variable input for the decoder, batch_size x z_size
    :param actions: history of actions, batch_size x max_len of ints
    :param sample_ind: index we want to sample each sequence at, batch_size
    :param values: value of the objective function where the sequence is completed, None otherwise: batch_size
    :return: values: value of the action at sample_ind predicted by the decoder; targets: one-step Bellman equation target
    '''
    actions = inputs
    batch_size = len(actions)
    orig_policy = self.decoder.policy
    self.decoder.policy = PolicyFromTarget(actions)
    # TODO: replace this ugly ugly hack!
    true_z_size = 56  # self.decoder.z_size
    _, logits = self.decoder.forward(to_gpu(torch.zeros(batch_size, true_z_size)))
    self.decoder.policy = orig_policy
    # TODO: a better head, for now values and policy are too entangled
    value_est = logits  # F.tanh(logits)
    return actions, logits, value_est
def __init__(self, stepper: Stepper, policy: SimplePolicy, task=None, mask_gen=None, batch_size=None):
    '''
    A simple discrete decoder, alternating between getting logits from the model and actions from the policy
    :param stepper:
    :param policy: chooses an action from the logits; can be max, random sample, or choice from a
        pre-determined target sequence. Only depends on the current logits + history, so it can't handle
        multi-step strategies like beam search
    :param mask_gen: takes in a one-hot encoding of the previous action (for now that's all we care about)
    :param task: environment that returns rewards and whether the episode is finished
    '''
    super().__init__()
    self.stepper = to_gpu(stepper)
    self.policy = policy
    self.task = task
    self.bypass_actions = False  # legacy
    # self.mask_gen = mask_gen
    self.output_shape = [None, None, self.stepper.output_shape[-1]]
    self.batch_size = batch_size
def forward(self, model_out, target_x):
    """Computes the batch-normalized variational error."""
    model_out_x, mu, log_var = model_out
    batch_size = target_x.size()[0]
    seq_len = target_x.size()[1]
    z_size = mu.size()[1]
    model_out_x = F.softmax(model_out_x, dim=2)  # following mkusner/grammarVAE
    BCE = seq_len * self.bce_loss(model_out_x, target_x)
    # this normalizer is for when we're not sampling, so we only have mus, not sigmas
    avg_mu = torch.sum(mu, dim=0) / batch_size
    var = torch.mm(mu.t(), mu) / batch_size
    var_err = var - Variable(to_gpu(torch.eye(z_size)))
    var_err = torch.tanh(var_err) * var_err  # so it's ~ x^2 asymptotically, not x^4
    mom_err = (avg_mu * avg_mu).sum() / z_size + var_err.sum() / (z_size * z_size)
    if self.sample_z:
        # see Appendix B from VAE paper:
        # Kingma and Welling. Auto-Encoding Variational Bayes. ICLR, 2014
        # https://arxiv.org/abs/1312.6114
        # 0.5 * sum(1 + log(sigma^2) - mu^2 - sigma^2)
        KLD_element = (1 + log_var - mu * mu - log_var.exp())
        KLD = -0.5 * torch.mean(KLD_element)
        KLD_ = KLD.data.item()
        my_loss = BCE + self.reg_weight * KLD
    else:
        my_loss = BCE + self.reg_weight * mom_err
        KLD_ = 0
    if not self.training:
        # ignore the regularizers when computing the validation loss
        my_loss = BCE
    self.metrics = OrderedDict([('BCE', BCE.data.item()),
                                ('KLD', KLD_),
                                ('ME', mom_err.data.item())])
    return my_loss
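# Illustrative sketch (standalone): the analytic KL divergence between N(mu, sigma^2) and N(0, 1)
# used above, -0.5 * (1 + log(sigma^2) - mu^2 - sigma^2), computed from mu and log_var with a
# mean over batch and latent dimensions, as in the loss above. Shapes are arbitrary.
import torch

mu = torch.randn(32, 56)        # batch x z_size
log_var = torch.randn(32, 56)
kld = -0.5 * torch.mean(1 + log_var - mu * mu - log_var.exp())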
def my_gen():
    for _ in range(1000):
        yield to_gpu(torch.zeros(BATCH_SIZE, settings['z_size']))
def get_decoder(molecules=True,
                grammar=True,
                z_size=200,
                decoder_hidden_n=200,
                feature_len=12,  # TODO: remove this
                max_seq_length=15,
                drop_rate=0.0,
                decoder_type='step',
                task=None,
                node_policy=None,
                rule_policy=None,
                reward_fun=lambda x: -1 * np.ones(len(x)),
                batch_size=None,
                priors=True):
    codec = get_codec(molecules, grammar, max_seq_length)
    if decoder_type == 'old':
        stepper = ResettingRNNDecoder(z_size=z_size,
                                      hidden_n=decoder_hidden_n,
                                      feature_len=codec.feature_len(),
                                      max_seq_length=max_seq_length,
                                      steps=max_seq_length,
                                      drop_rate=drop_rate)
        stepper = OneStepDecoderContinuous(stepper)
    elif 'graph' in decoder_type and decoder_type not in ['attn_graph', 'rnn_graph']:
        return get_node_decoder(grammar,
                                max_seq_length,
                                drop_rate,
                                decoder_type,
                                rule_policy,
                                reward_fun,
                                batch_size,
                                priors)
    elif decoder_type in ['attn_graph', 'rnn_graph']:  # deprecated
        assert 'hypergraph' in grammar, "Only the hypergraph grammar can be used with attn_graph decoder type"
        if 'attn' in decoder_type:
            encoder = GraphEncoder(grammar=codec.grammar,
                                   d_model=512,
                                   drop_rate=drop_rate,
                                   model_type='transformer')
        elif 'rnn' in decoder_type:
            encoder = GraphEncoder(grammar=codec.grammar,
                                   d_model=512,
                                   drop_rate=drop_rate,
                                   model_type='rnn')
        model = MultipleOutputHead(model=encoder,
                                   output_spec={'node': 1,  # to be used to select the next node to expand
                                                'action': codec.feature_len()},  # to select the action for the chosen node
                                   drop_rate=drop_rate)
        # don't support using this model in VAE-style models yet
        model.init_encoder_output = lambda x: None
        mask_gen = HypergraphMaskGenerator(max_len=max_seq_length, grammar=codec.grammar)
        mask_gen.priors = priors  # bias=codec.grammar.get_log_frequencies())
        if node_policy is None:
            node_policy = SoftmaxRandomSamplePolicy()
        if rule_policy is None:
            rule_policy = SoftmaxRandomSamplePolicy()
        if 'node' in decoder_type:
            stepper = GraphDecoderWithNodeSelection(model,
                                                    node_policy=node_policy,
                                                    rule_policy=rule_policy)
            env = GraphEnvironment(mask_gen,
                                   reward_fun=reward_fun,
                                   batch_size=batch_size)
            decoder = DecoderWithEnvironmentNew(stepper, env)
        else:
            stepper = GraphDecoder(model=model, mask_gen=mask_gen)
            decoder = to_gpu(SimpleDiscreteDecoderWithEnv(stepper,
                                                          rule_policy,
                                                          task=task,
                                                          batch_size=batch_size))
        return decoder, stepper
    else:
        if decoder_type == 'step':
            stepper = SimpleRNNDecoder(z_size=z_size,
                                       hidden_n=decoder_hidden_n,
                                       feature_len=codec.feature_len(),
                                       max_seq_length=max_seq_length,
                                       drop_rate=drop_rate,
                                       use_last_action=False)
        elif decoder_type == 'action':
            stepper = SimpleRNNDecoder(z_size=z_size,  # + feature_len,
                                       hidden_n=decoder_hidden_n,
                                       feature_len=codec.feature_len(),
                                       max_seq_length=max_seq_length,
                                       drop_rate=drop_rate,
                                       use_last_action=True)
        elif decoder_type == 'action_resnet':
            stepper = ResNetRNNDecoder(z_size=z_size,  # + feature_len,
                                       hidden_n=decoder_hidden_n,
                                       feature_len=codec.feature_len(),
                                       max_seq_length=max_seq_length,
                                       drop_rate=drop_rate,
                                       use_last_action=True)
        elif decoder_type == 'attention':
            stepper = SelfAttentionDecoderStep(num_actions=codec.feature_len(),
                                               max_seq_len=max_seq_length,
                                               drop_rate=drop_rate,
                                               enc_output_size=z_size)
        elif decoder_type == 'random':
            stepper = RandomDecoder(feature_len=codec.feature_len(),
                                    max_seq_length=max_seq_length)
        else:
            raise NotImplementedError('Unknown decoder type: ' + str(decoder_type))

        if grammar is not False and '_graph' not in decoder_type:
            # add a masking layer
            mask_gen = get_codec(molecules, grammar, max_seq_length).mask_gen
            stepper = MaskingHead(stepper, mask_gen)

        policy = SoftmaxRandomSamplePolicy()  # bias=codec.grammar.get_log_frequencies())
        decoder = to_gpu(SimpleDiscreteDecoderWithEnv(stepper,
                                                      policy,
                                                      task=task,
                                                      batch_size=batch_size))  # , bypass_actions=True))
        return decoder, stepper
def init_hidden(self, batch_size):
    # NOTE: assumes only 1 layer, no bi-direction
    h1 = Variable(to_gpu(torch.zeros(1, batch_size, self.hidden_n)),
                  requires_grad=False)
    return h1