# Assumed module-level context for this profiling function: `SRU` and `NaiveSRU`
# are the local Chainer implementations and `gpu_device` is the target GPU id.
import numpy as np
import torch
from chainer import cuda, links


def _profile():
    seq_length = 50
    batchsize = 48
    feature_dimension = 128
    data_cpu = np.random.normal(
        0, 1, size=(batchsize, feature_dimension, seq_length)).astype(np.float32)
    data_gpu = cuda.to_gpu(data_cpu, gpu_device)

    # CPU
    layer = SRU(feature_dimension, feature_dimension)
    for _ in range(100):
        h_cpu, c_cpu = layer(data_cpu)
        layer.reset_state()

    # GPU (define-by-run)
    layer = NaiveSRU(feature_dimension, feature_dimension)
    layer.to_gpu(gpu_device)
    for _ in range(100):
        h, c = layer(data_gpu)
        layer.reset_state()

    # GPU (CUDA Kernel)
    layer = SRU(feature_dimension, feature_dimension)
    layer.to_gpu(gpu_device)
    for _ in range(100):
        h_gpu, c_gpu = layer(data_gpu)
        layer.reset_state()

    # GPU (PyTorch)
    with torch.cuda.device(gpu_device):
        from cuda_functional import SRU as PyTorchSRU
        data_gpu_torch = torch.FloatTensor(seq_length, batchsize, feature_dimension).cuda()
        rnn = PyTorchSRU(128, 128,
                         num_layers=1,
                         dropout=0.0,
                         rnn_dropout=0.0,
                         use_tanh=0,
                         bidirectional=False)
        rnn.cuda()
        for _ in range(100):
            output, hidden = rnn(torch.autograd.Variable(data_gpu_torch))

    # LSTM (Chainer)
    layer = links.LSTM(feature_dimension, feature_dimension)
    layer.to_gpu(gpu_device)
    for _ in range(100):
        for t in range(seq_length):
            h = layer(data_gpu[..., t])
        layer.reset_state()

    print(h_cpu)
    print(h_gpu)
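# Not part of the original script: a minimal wall-clock timing sketch for the
# Chainer SRU layers above, assuming `cupy` is available and `gpu_device` is the
# same GPU id. GPU kernels launch asynchronously, so the device is synchronized
# before reading the clock; the helper name `_time_layer` is hypothetical.
import time
import cupy


def _time_layer(layer, data, n_iter=100):
    cupy.cuda.Device(gpu_device).synchronize()
    start = time.time()
    for _ in range(n_iter):
        h, c = layer(data)
        layer.reset_state()
    cupy.cuda.Device(gpu_device).synchronize()
    return time.time() - start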
class RNNPredictor(nn.Module):  # class header reconstructed; nn.Module base assumed

    def __init__(self, rnn_type, linear_size, rnn_inp_size, rnn_hid_size,
                 dec_out_size, nlayers, dropout=0.5, tie_weights=False,
                 res_connection=False):
        super(RNNPredictor, self).__init__()
        self.enc_input_size = linear_size
        self.drop = nn.Dropout(dropout)
        # Linear "encoder" projecting the raw features to the RNN input size.
        self.encoder = nn.Linear(linear_size, rnn_inp_size)
        if rnn_type in ['LSTM', 'GRU']:
            self.rnn = getattr(nn, rnn_type)(rnn_inp_size, rnn_hid_size, nlayers,
                                             dropout=dropout)
        elif rnn_type == 'SRU':
            from cuda_functional import SRU, SRUCell
            self.rnn = SRU(input_size=rnn_inp_size,
                           hidden_size=rnn_hid_size,
                           num_layers=nlayers,
                           dropout=dropout,
                           use_tanh=False,
                           use_selu=True,
                           layer_norm=True)
        else:
            try:
                nonlinearity = {'RNN_TANH': 'tanh', 'RNN_RELU': 'relu'}[rnn_type]
            except KeyError:
                raise ValueError("""An invalid option for `--model` was supplied,
                                 options are ['LSTM', 'GRU', 'SRU', 'RNN_TANH' or 'RNN_RELU']""")
            self.rnn = nn.RNN(rnn_inp_size, rnn_hid_size, nlayers,
                              nonlinearity=nonlinearity, dropout=dropout)
        # Linear "decoder" mapping the RNN hidden state to the output size.
        self.decoder = nn.Linear(rnn_hid_size, dec_out_size)

        if tie_weights:
            if rnn_hid_size != rnn_inp_size:
                raise ValueError('When using the tied flag, nhid must be equal to emsize')
            self.decoder.weight = self.encoder.weight
        self.res_connection = res_connection
        self.init_weights()

        self.rnn_type = rnn_type
        self.rnn_hid_size = rnn_hid_size
        self.nlayers = nlayers
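# Hypothetical usage sketch for the constructor above; the sizes are illustrative
# and not taken from the original training script, and the rest of the class
# (init_weights, forward) is assumed to be defined as in the original source.
# With rnn_type='SRU' the model requires the cuda_functional (sru) package.
model = RNNPredictor(rnn_type='SRU', linear_size=51, rnn_inp_size=64,
                     rnn_hid_size=128, dec_out_size=51, nlayers=2,
                     dropout=0.3, tie_weights=False, res_connection=True)
model.cuda()  # cuda_functional's SRU kernels expect CUDA tensors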
class Decoder(nn.Module):  # class header reconstructed; nn.Module base assumed

    def __init__(self, args, de_vocab):
        super(Decoder, self).__init__()
        self.avg_num = args.average_size
        self.de_vocab = de_vocab
        self.args = args
        self.output_size = len(de_vocab)
        self.decoder_vocab_dense = nn.Linear(args.hidden_size, self.output_size)
        self.infer_vocab_dense = nn.Linear(args.hidden_size, self.output_size)
        self.input_drop = nn.Dropout(p=args.input_drop_out)
        self.output_drop = nn.Dropout(p=args.output_drop_out)
        self.decoder_embedding = nn.Embedding(
            len(de_vocab), args.embedding_size, padding_idx=de_vocab.stoi['<pad>'])
        if args.rnn_cell == 'GRU':
            self.decoder_rnn = nn.GRU(input_size=args.embedding_size,
                                      hidden_size=args.hidden_size,
                                      num_layers=args.layer_size,
                                      bias=True,
                                      dropout=args.drop_out)
            self.infer_rnn = nn.GRU(input_size=args.hidden_size,
                                    hidden_size=args.hidden_size,
                                    num_layers=args.layer_size,
                                    bias=True,
                                    dropout=args.drop_out)
        elif args.rnn_cell == 'LSTM':
            # Note: unlike the GRU branch, this branch stores the RNN as `self.rnn`.
            self.rnn = nn.LSTM(input_size=args.embedding_size,
                               hidden_size=args.hidden_size,
                               num_layers=args.layer_size,
                               bias=True,
                               dropout=args.drop_out)
        elif args.rnn_cell == 'SRU':
            # SRU hyperparameters are hard-coded here rather than taken from `args`.
            self.rnn = SRU(
                input_size=args.embedding_size,
                hidden_size=args.hidden_size,
                num_layers=6,          # number of stacking RNN layers
                dropout=0.2,           # dropout applied between RNN layers
                rnn_dropout=0.2,       # variational dropout applied on linear transformation
                use_tanh=1,            # use tanh?
                use_relu=0,            # use ReLU?
                use_selu=0,            # use SeLU?
                bidirectional=False,   # bidirectional RNN?
                weight_norm=False,     # apply weight normalization on parameters
                layer_norm=False,      # apply layer normalization on the output of each layer
                highway_bias=0)        # initial bias of highway gate (<= 0)
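# A minimal sketch (not from the original code) of one teacher-forced decoding
# step with the GRU branch above, which stores the RNN as `self.decoder_rnn`.
# It assumes args.rnn_cell == 'GRU' and that `args` and `de_vocab` come from the
# original setup; the token indices and batch size below are dummies.
import torch

decoder = Decoder(args, de_vocab)
tokens = torch.zeros(1, 8, dtype=torch.long)                     # (steps=1, batch=8) dummy ids
emb = decoder.input_drop(decoder.decoder_embedding(tokens))      # (1, 8, embedding_size)
out, hidden = decoder.decoder_rnn(emb)                           # (1, 8, hidden_size)
logits = decoder.decoder_vocab_dense(decoder.output_drop(out))   # (1, 8, vocab_size)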
class Encoder(nn.Module):  # class header reconstructed; nn.Module base assumed

    def __init__(self, args, en_vocab):
        super(Encoder, self).__init__()
        encoder_rnn_cell = 'SRU'  # hard-coded: the GRU/LSTM branches below are never taken
        # The bidirectional SRU below outputs 2 * hidden_size features per step,
        # which this layer projects back down to hidden_size.
        self.dense_hidden = nn.Linear(2 * args.hidden_size, args.hidden_size)
        self.input_drop = nn.Dropout(p=args.input_drop_out)
        self.output_drop = nn.Dropout(p=args.output_drop_out)
        self.encoder_embedding = nn.Embedding(
            len(en_vocab), args.embedding_size, padding_idx=en_vocab.stoi['<pad>'])
        self.encoder_embedding.weight.data.copy_(en_vocab.vectors)
        if encoder_rnn_cell == 'GRU':
            self.rnn = nn.GRU(input_size=args.embedding_size,
                              hidden_size=args.hidden_size,
                              num_layers=args.layer_size,
                              bias=True,
                              dropout=args.drop_out,
                              bidirectional=args.bidirectional)
        elif encoder_rnn_cell == 'LSTM':
            self.rnn = nn.LSTM(input_size=args.embedding_size,
                               hidden_size=args.hidden_size,
                               num_layers=args.layer_size,
                               bias=True,
                               dropout=args.drop_out,
                               bidirectional=args.bidirectional)
        elif encoder_rnn_cell == 'SRU':
            self.rnn = SRU(
                input_size=args.embedding_size,
                hidden_size=args.hidden_size,
                num_layers=args.layer_size,   # number of stacking RNN layers
                dropout=args.drop_out,        # dropout applied between RNN layers
                rnn_dropout=0.2,              # variational dropout applied on linear transformation
                use_tanh=1,                   # use tanh?
                use_relu=0,                   # use ReLU?
                use_selu=0,                   # use SeLU?
                bidirectional=True,           # bidirectional RNN?
                weight_norm=False,            # apply weight normalization on parameters
                layer_norm=False,             # apply layer normalization on the output of each layer
                highway_bias=0)               # initial bias of highway gate (<= 0)
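# Illustrative sketch (not from the original repo) of the shape contract the
# encoder relies on: a bidirectional SRU concatenates forward and backward
# states, so each timestep carries 2 * hidden_size features, which a
# Linear(2 * hidden_size, hidden_size) projects back down. Sizes are arbitrary;
# cuda_functional's SRU kernels expect CUDA tensors.
import torch
import torch.nn as nn
from cuda_functional import SRU

hidden_size = 256
rnn = SRU(input_size=300, hidden_size=hidden_size, num_layers=2,
          bidirectional=True).cuda()
proj = nn.Linear(2 * hidden_size, hidden_size).cuda()

x = torch.randn(40, 8, 300).cuda()   # (seq_len, batch, embedding_size)
out, state = rnn(x)                  # out: (seq_len, batch, 2 * hidden_size)
out = proj(out)                      # (seq_len, batch, hidden_size)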
class SRURNN(nn.Module):  # class header reconstructed; nn.Module base assumed

    def __init__(self, in_dim=118, out_dim=118, num_hidden=2, hidden_dim=256,
                 bidirectional=False, dropout=0, last_sigmoid=False, use_relu=0,
                 rnn_dropout=0.0):
        super(SRURNN, self).__init__()
        from cuda_functional import SRU
        self.num_direction = 2 if bidirectional else 1
        # Despite the attribute name, `self.gru` holds an SRU stack.
        self.gru = SRU(in_dim, hidden_dim, num_hidden,
                       bidirectional=bidirectional,
                       dropout=dropout,
                       use_relu=use_relu,
                       rnn_dropout=rnn_dropout)
        # Project per-timestep SRU features (doubled if bidirectional) to the output size.
        self.hidden2out = nn.Linear(hidden_dim * self.num_direction, out_dim)
        self.sigmoid = nn.Sigmoid()
        self.last_sigmoid = last_sigmoid
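# A plausible forward pass for SRURNN (the original forward method is not part
# of this snippet, so this is a sketch only): run the SRU over a
# (seq_len, batch, in_dim) sequence, project every timestep, and optionally
# apply the sigmoid.
def forward(self, x):                  # x: (seq_len, batch, in_dim)
    rnn_out, _ = self.gru(x)           # (seq_len, batch, hidden_dim * num_direction)
    out = self.hidden2out(rnn_out)     # (seq_len, batch, out_dim)
    return self.sigmoid(out) if self.last_sigmoid else out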
# Tail of the (not shown) timing helper: synchronize CUDA, then report elapsed time.
torch.cuda.synchronize()
print("{:.4} sec\n".format(time.time() - start))

N = 100
input_size, hidden_size = 512, 1024
num_layers = 2
batch_size = 128 * 4
length = 64

# `volatile=True` is the pre-0.4 PyTorch way of disabling autograd for inference.
x = Variable(torch.randn(length, batch_size, input_size).float(), volatile=True)
x = x.cuda()
print("")

# single gpu
rnn = Model(SRU(input_size, hidden_size, num_layers))
rnn.cuda()
rnn(x)  # warm-up pass before timing
print("\nSingle gpu:")
run(x, rnn, N)

# multiple gpu
# dim=1 scatters along the batch axis, since SRU inputs are (length, batch, dim).
rnn_2 = Model(SRU(input_size, hidden_size, num_layers))
rnn_2 = nn.DataParallel(rnn_2, dim=1)
rnn_2.cuda()
rnn_2(x)  # warm-up pass before timing
print("\nMulti gpu:")
run(x, rnn_2, N)
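# `Model` and `run` are referenced above but not shown in this snippet. Below is
# a minimal sketch consistent with how they are used: a thin nn.Module wrapper
# that returns only the SRU output (so DataParallel can gather it), and a
# wall-clock timer that synchronizes CUDA around N forward passes. The bodies
# are assumptions, not the original definitions.
import time
import torch
import torch.nn as nn


class Model(nn.Module):
    def __init__(self, rnn):
        super(Model, self).__init__()
        self.rnn = rnn

    def forward(self, x):
        out, state = self.rnn(x)
        return out


def run(x, rnn, N=100):
    torch.cuda.synchronize()
    start = time.time()
    for _ in range(N):
        rnn(x)
    torch.cuda.synchronize()
    print("{:.4} sec\n".format(time.time() - start))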