class Recurrent(Layer):
    def __init__(self, units, length, stateful=False, *args, **kwargs):
        super(Recurrent, self).__init__(*args, **kwargs)
        self.units = units
        self.length = length
        self.output_dim = [length, units]
        self.stateful = stateful
        self.states = None
        # Build immediately if the base Layer already knows the input dimension.
        if self.input_dim is not None:
            self.build(self.input_dim)

    @property
    def params(self):
        return list(self.layer.parameters())

    def build(self, input_dim):
        self.input_dim = input_dim
        self.layer = TorchRecurrent(self.input_dim, self.units, self.length)

    def clear_states(self):
        self.states = None

    def forward(self, X):
        X = super(Recurrent, self).forward(X)
        # In stateful mode, reuse the hidden state left over from the previous call;
        # otherwise let the underlying layer start from its default initial state.
        if self.stateful and self.states is not None:
            outputs, self.states = self.layer.forward(X, self.states)
        else:
            outputs, self.states = self.layer.forward(X)
        return outputs
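The stateful branch in forward above follows the usual PyTorch pattern of threading a hidden state through successive calls of a recurrent module. A minimal, self-contained sketch of that pattern with a plain torch.nn.RNN (the sizes, the loop, and the detach() call are illustrative assumptions, not part of the wrapper above):

import torch
from torch.nn import RNN

rnn = RNN(input_size=8, hidden_size=16, batch_first=True)
states = None          # plays the role of self.states
stateful = True

for _ in range(3):
    x = torch.randn(4, 10, 8)                # (batch, time, features)
    if stateful and states is not None:
        outputs, states = rnn(x, states)     # reuse the previous hidden state
    else:
        outputs, states = rnn(x)             # start from a zero hidden state
    states = states.detach()                 # keep gradients from flowing across batches
print(outputs.shape)                         # torch.Size([4, 10, 16])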
class IndRNN(Module):
    def __init__(self, hidden_size, *args, **kwargs):
        super().__init__()
        self.module = RNN(hidden_size=hidden_size, *args, **kwargs, nonlinearity='relu')

        # Terrible temporary solution to an issue regarding compacting weights re: CUDNN RNN
        # I'm not sure what is going on here, this is what weight_drop does so I stick to it
        self.module.flatten_parameters = self.widget_demagnetizer_y2k_edition

        # We need to register it in this module to make it work with weight dropout
        w_hh = FloatTensor(hidden_size).type_as(getattr(self.module, 'weight_hh_l0').data)
        w_hh.uniform_(-1, 1)
        getattr(self.module, 'bias_ih_l0').data.fill_(0)
        getattr(self.module, 'bias_hh_l0').data.fill_(0)
        self.register_parameter(name='weight_hh_l0', param=Parameter(w_hh))
        del self.module._parameters['weight_hh_l0']

    def widget_demagnetizer_y2k_edition(*args, **kwargs):
        # We need to replace flatten_parameters with a nothing function
        # It must be a function rather than a lambda as otherwise pickling explodes
        # We can't write boring code though, so ... WIDGET DEMAGNETIZER Y2K EDITION!
        # (╯°□°)╯︵ ┻━┻
        return

    def _setweights(self):
        w_hh = getattr(self, 'weight_hh_l0')
        w_hh = diag(w_hh)
        setattr(self.module, 'weight_hh_l0', w_hh)

    def forward(self, *args):
        self._setweights()
        return self.module.forward(*args)
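The IndRNN-specific part of the class above is _setweights: the registered recurrent weight is a plain vector, and diag expands it into a diagonal hidden-to-hidden matrix before every forward pass, so each hidden unit recurs only on its own previous activation. A small self-contained sketch of that equivalence (the names and sizes here are illustrative assumptions, not taken from the code above):

import torch

hidden_size = 4
w_hh = torch.empty(hidden_size).uniform_(-1, 1)   # per-unit recurrent weight vector
W_hh = torch.diag(w_hh)                           # the matrix _setweights hands to the RNN

h_prev = torch.randn(hidden_size)
x_proj = torch.randn(hidden_size)                 # stands in for W_ih @ x_t + biases
h_next = torch.relu(x_proj + W_hh @ h_prev)       # matrix form used by nn.RNN
assert torch.allclose(h_next, torch.relu(x_proj + w_hh * h_prev))   # element-wise IndRNN form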
def train_torch(self, X, y_true, batch_size, learning_rate, num_epochs, print_many, verbose):
    self.batch_size = batch_size
    # Epochs at which to report progress (roughly every 1% of training) when print_many is False.
    progresses = {int(num_epochs // (100 / i)): i for i in range(1, 101, 1)}
    t0 = counter()
    durations = []

    device = torch.device('cuda:0')
    rnn = RNN(input_size=self.input_dim, hidden_size=self.hidden_dim, num_layers=1,
              nonlinearity='tanh', bias=True, batch_first=False).to(device)
    fc = FCLayer(self.hidden_dim, self.output_size, bias=True).to(device)
    params = [rnn.parameters(), fc.params()]
    optimizer = SGD(chain(*params), lr=learning_rate)

    for epoch in range(num_epochs):
        epoch_loss = 0
        for i in range(self.max_iters):
            # Slice out the batch and reorder it from (batch, time, features)
            # to (time, batch, features), since the RNN was built with batch_first=False.
            x_batch = X[i * self.batch_size:(i + 1) * self.batch_size]
            x_batch = np.array([x_batch[:, step, :] for step in range(self.time_steps)])
            y_true_batch = y_true[i * self.batch_size:(i + 1) * self.batch_size]
            batch_size_local = x_batch.shape[1]

            # convert to pytorch tensors
            y_true_batch = y_true_batch.astype(np.int64)
            y_true_batch = torch.tensor(y_true_batch, requires_grad=False).to(device)
            x_batch = x_batch.astype(np.float32)
            x_batch = torch.tensor(x_batch, requires_grad=True).to(device)

            # forward pass
            h_stack, h_last = rnn.forward(x_batch, hx=None)
            fc_out = fc.forward(h_last)
            log_y_pred = F.log_softmax(input=fc_out, dim=2)
            log_y_pred = log_y_pred.view(batch_size_local, self.output_size)
            loss = F.nll_loss(input=log_y_pred, target=y_true_batch, reduction='mean')

            # update gradients
            optimizer.zero_grad()
            loss.backward()
            epoch_loss += loss.item()
            optimizer.step()

        durations.append(counter() - t0)
        t0 = counter()
        if (print_many and epoch % 100 == 0) or (not print_many and epoch in progresses):
            print(f"after epoch: {epoch}, epoch_loss: {round(epoch_loss / self.max_iters, 3)}")

    # Compute the average epoch time unconditionally so the function always returns a value,
    # and only print it when verbose output was requested.
    avg_epoch_time = sum(durations) / len(durations)
    if verbose > 0:
        print("average epoch time:", round(avg_epoch_time, 3))
    return avg_epoch_time
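The list comprehension that rebuilds x_batch in the loop above is effectively a transpose from batch-major to time-major layout, which nn.RNN expects when batch_first=False. A short sketch verifying that equivalence (the shapes are illustrative assumptions, not values from the training code):

import numpy as np

batch, time_steps, features = 32, 28, 28
x_batch = np.random.rand(batch, time_steps, features).astype(np.float32)

# same reordering as the comprehension in train_torch
x_time_major = np.array([x_batch[:, step, :] for step in range(time_steps)])
assert x_time_major.shape == (time_steps, batch, features)
# equivalent to (and cheaper than) a single transpose
assert np.array_equal(x_time_major, x_batch.transpose(1, 0, 2))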