def SRUModel(embed_mat, MAX_LEN, num_cls, sru_sz=128):
    ip = Input(shape=(MAX_LEN, ))
    embed = Embedding(embed_mat.shape[0], embed_mat.shape[1],
                      weights=[embed_mat], input_length=MAX_LEN,
                      trainable=False)
    prev_input = embed(ip)
    hidden_states = []
    depth = 2
    if depth > 1:
        for i in range(depth - 1):
            h, h_final, c_final = SRU(sru_sz, dropout=0.0, recurrent_dropout=0.0,
                                      return_sequences=True, return_state=True,
                                      unroll=True)(prev_input)
            prev_input = h
            hidden_states.append(c_final)
    outputs = SRU(sru_sz, dropout=0.0, recurrent_dropout=0.0, unroll=True)(prev_input)
    outputs = Dense(num_cls, activation='softmax')(outputs)
    model = Model(ip, outputs)
    model.summary()

    # try using different optimizers and different optimizer configs
    model.compile(loss='categorical_crossentropy',
                  optimizer='adam',
                  metrics=['accuracy'])
    return model
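# Hypothetical usage sketch for SRUModel above (not from the original source): it assumes
# the Keras-style SRU layer and the other Keras symbols are already imported at module
# level, and uses a random matrix as a stand-in for a real pretrained embedding matrix.
import numpy as np

vocab_size, emb_dim, max_len, num_classes = 10000, 300, 100, 5
embed_mat = np.random.uniform(-0.05, 0.05, size=(vocab_size, emb_dim)).astype('float32')
model = SRUModel(embed_mat, max_len, num_classes, sru_sz=128)
# model.fit(x_train, y_train, batch_size=64, epochs=5) would then train on
# integer-encoded, padded sequences of shape (num_samples, max_len).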
def __init__(self, trg_vocab_size, with_ln=False, max_out=True):
    super(Decoder, self).__init__()
    self.max_out = max_out
    self.attention = Attention(wargs.dec_hid_size, wargs.align_size)
    self.trg_lookup_table = nn.Embedding(trg_vocab_size, wargs.trg_wemb_size,
                                         padding_idx=PAD)
    self.tanh = nn.Tanh()
    if wargs.dec_rnn_type == 'gru':
        self.gru1 = GRU(wargs.trg_wemb_size, wargs.dec_hid_size, with_ln=with_ln)
        self.gru2 = GRU(wargs.enc_hid_size, wargs.dec_hid_size, with_ln=with_ln)
    elif wargs.dec_rnn_type == 'sru':
        self.gru1 = SRU(input_size=wargs.trg_wemb_size,
                        hidden_size=wargs.dec_hid_size,
                        num_layers=wargs.dec_layer_cnt,
                        dropout=0.,
                        bidirectional=False)
        self.gru2 = SRU(input_size=2 * wargs.enc_hid_size,
                        hidden_size=wargs.dec_hid_size,
                        num_layers=wargs.dec_layer_cnt,
                        dropout=0.,
                        bidirectional=False)
    out_size = 2 * wargs.out_size if max_out else wargs.out_size
    self.ls = nn.Linear(wargs.dec_hid_size, out_size)
    self.ly = nn.Linear(wargs.trg_wemb_size, out_size)
    self.lc = nn.Linear(2 * wargs.enc_hid_size, out_size)
    self.classifier = Classifier(
        wargs.out_size, trg_vocab_size,
        self.trg_lookup_table if wargs.proj_share_weight is True else None)
def eval_imdb():
    max_features = 20000
    maxlen = 80  # cut texts after this number of words (among top max_features most common words)
    batch_size = 128
    depth = 1

    print('Loading data...')
    (x_train, y_train), (x_test, y_test) = imdb.load_data(num_words=max_features)
    print(len(x_train), 'train sequences')
    print(len(x_test), 'test sequences')

    print('Pad sequences (samples x time)')
    x_train = sequence.pad_sequences(x_train, maxlen=maxlen)
    x_test = sequence.pad_sequences(x_test, maxlen=maxlen)
    print('x_train shape:', x_train.shape)
    print('x_test shape:', x_test.shape)

    print('Build model...')
    ip = Input(shape=(maxlen, ))
    embed = Embedding(max_features, 128)(ip)
    prev_input = embed
    hidden_states = []
    if depth > 1:
        for i in range(depth - 1):
            h, h_final, c_final = SRU(128, dropout=0.0, recurrent_dropout=0.0,
                                      return_sequences=True, return_state=True,
                                      unroll=True)(prev_input)
            prev_input = h
            hidden_states.append(c_final)
    outputs = SRU(128, dropout=0.0, recurrent_dropout=0.0, unroll=True)(prev_input)
    outputs = Dense(1, activation='sigmoid')(outputs)
    model = Model(ip, outputs)
    model.summary()

    # try using different optimizers and different optimizer configs
    model.compile(loss='binary_crossentropy',
                  optimizer='adam',
                  metrics=['accuracy'])

    print('Train...')
    model.fit(x_train, y_train,
              batch_size=batch_size,
              epochs=100,
              validation_data=(x_test, y_test))
    score, acc = model.evaluate(x_test, y_test, batch_size=batch_size)
    print('Test score:', score)
    print('Test accuracy:', acc)
def __init__(self, dict_size, emb_size=1000, hid_size=1000,
             vis_size=2688, num_filters=10, mixed_size=1000,
             hid_mixed_size=1005, lang_layers=3, mixed_layers=3,
             backend='dpn92', mix_we=True, lstm=False,
             pretrained=True, extra=True, high_res=False):
    super().__init__()
    self.high_res = high_res
    self.vis_size = vis_size
    self.num_filters = num_filters
    if backend == 'dpn92':
        self.base = create_model(backend, 1, pretrained=pretrained, extra=extra)
    else:
        self.base = create_model(backend, 1, pretrained=pretrained)
    self.emb = nn.Embedding(dict_size, emb_size)
    self.lang_model = SRU(emb_size, hid_size, num_layers=lang_layers)
    if lstm:
        self.lang_model = nn.LSTM(emb_size, hid_size, num_layers=lang_layers)
    self.mix_we = mix_we
    lineal_in = hid_size + emb_size * int(mix_we)
    self.adaptative_filter = nn.Linear(
        in_features=lineal_in, out_features=(num_filters * (vis_size + 8)))
    self.comb_conv = nn.Conv2d(
        in_channels=(8 + emb_size + hid_size + vis_size + num_filters),
        out_channels=mixed_size, kernel_size=1, padding=0)
    self.mrnn = SRU(mixed_size, hid_mixed_size, num_layers=mixed_layers)
    if lstm:
        self.mrnn = nn.LSTM(mixed_size, hid_mixed_size, num_layers=mixed_layers)
    if not self.high_res:
        self.output_collapse = nn.Conv2d(in_channels=hid_mixed_size,
                                         out_channels=1, kernel_size=1)
def _profile():
    seq_length = 50
    batchsize = 48
    feature_dimension = 128
    data_cpu = np.random.normal(0, 1, size=(batchsize, feature_dimension,
                                            seq_length)).astype(np.float32)
    data_gpu = cuda.to_gpu(data_cpu, gpu_device)

    # CPU
    layer = SRU(feature_dimension, feature_dimension)
    for _ in range(100):
        h_cpu, c_cpu = layer(data_cpu)
        layer.reset_state()

    # GPU (define-by-run)
    layer = NaiveSRU(feature_dimension, feature_dimension)
    layer.to_gpu(gpu_device)
    for _ in range(100):
        h, c = layer(data_gpu)
        layer.reset_state()

    # GPU (CUDA Kernel)
    layer = SRU(feature_dimension, feature_dimension)
    layer.to_gpu(gpu_device)
    for _ in range(100):
        h_gpu, c_gpu = layer(data_gpu)
        layer.reset_state()

    # GPU (PyTorch)
    with torch.cuda.device(gpu_device):
        from cuda_functional import SRU as PyTorchSRU
        data_gpu_torch = torch.FloatTensor(seq_length, batchsize, feature_dimension).cuda()
        rnn = PyTorchSRU(128, 128,
                         num_layers=1,
                         dropout=0.0,
                         rnn_dropout=0.0,
                         use_tanh=0,
                         bidirectional=False)
        rnn.cuda()
        for _ in range(100):
            output, hidden = rnn(torch.autograd.Variable(data_gpu_torch))

    # LSTM (Chainer)
    layer = links.LSTM(feature_dimension, feature_dimension)
    layer.to_gpu(gpu_device)
    for _ in range(100):
        for t in range(seq_length):
            h = layer(data_gpu[..., t])
        layer.reset_state()

    print(h_cpu)
    print(h_gpu)
def __init__(self, config: Dict):
    """
    :param config: A dictionary containing the model and training configuration.
    """
    super(Retrieval, self).__init__()
    # self.usecuda = config['cuda']
    self.config = config
    self.device = torch.device('cuda') if config['cuda'] else torch.device('cpu')
    if config['bert']:
        self.context_bert = AutoModel.from_pretrained(config['bert_type'])
        self.response_bert = AutoModel.from_pretrained(config['bert_type'])
        bertsize = self.context_bert.config.hidden_size
        self.output_size = bertsize
    else:
        self.embedding_size = config['embedding_size']
        self.hidden_size = config['hidden_size']
        self.bidirectional = config['bidirectional']
        self.num_layers = config['num_layers']
        self.dropout = config['dropout']
        self.rnn_dropout = config['rnn_dropout']
        self.output_size = self.hidden_size * (1 + self.bidirectional)
        self.dropout_layer = nn.Dropout(self.dropout)
        self.context_rnn = SRU(input_size=self.embedding_size,
                               hidden_size=self.hidden_size,
                               num_layers=self.num_layers,
                               dropout=self.dropout,
                               rnn_dropout=self.rnn_dropout,
                               bidirectional=self.bidirectional,
                               use_tanh=False,
                               layer_norm=False,
                               rescale=False)
        self.response_rnn = SRU(input_size=self.embedding_size,
                                hidden_size=self.hidden_size,
                                num_layers=self.num_layers,
                                dropout=self.dropout,
                                rnn_dropout=self.rnn_dropout,
                                bidirectional=self.bidirectional,
                                use_tanh=False,
                                layer_norm=False,
                                rescale=False)
    if config['use_attention'] == 'True':
        self.context_attention = SelfAttentiveLayer(self.output_size, config)
        self.response_attention = SelfAttentiveLayer(self.output_size, config)
    if config['use_bilinear'] == 'True':
        self.map = nn.Sequential(
            nn.Linear(self.output_size, self.output_size), nn.Tanh())
def benchmark_chainer_sru(batchsize, seq_length, feature_dimension, repeat=50):
    layer = SRU(feature_dimension)
    x_data = np.random.normal(0, 1, size=(batchsize, feature_dimension,
                                          seq_length)).astype(np.float32)
    x_data = cuda.to_gpu(x_data)
    layer.to_gpu()

    with chainer.no_backprop_mode(), chainer.using_config("train", False):
        # forward
        start_time = time.time()
        for i in range(repeat):
            output, cell, last_cell = layer(x_data, None)
        forward_time_mean = (time.time() - start_time) / repeat

    with chainer.using_config("train", True):
        # backward
        start_time = time.time()
        for i in range(repeat):
            output, cell, last_cell = layer(x_data, None)
            layer.cleargrads()
            functions.sum(output).backward()
        backward_time_mean = (time.time() - start_time) / repeat

    return forward_time_mean, backward_time_mean
def __init__(self, nb_layer, dim_in, dim_out, dropout=0.25):
    super(SruEmb, self).__init__()
    self.dim_out = dim_out
    self.rnn = SRU(dim_in, dim_out,
                   num_layers=nb_layer,
                   dropout=dropout,
                   rnn_dropout=dropout,
                   use_tanh=True,
                   has_skip_term=True,
                   v1=True,
                   rescale=False)
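# A minimal forward-pass sketch for the SruEmb module above (assumed shapes, not from the
# original source): the sru library consumes time-major input of shape
# (seq_len, batch, input_size) and returns per-step outputs plus the final cell states.
import torch

emb = SruEmb(nb_layer=2, dim_in=300, dim_out=1024)
x = torch.randn(35, 16, 300)   # (seq_len, batch, dim_in)
output, cell = emb.rnn(x)      # output: (35, 16, 1024), cell: (2, 16, 1024)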
def __init__(self, in_dim=1024, hidden_dim=512, n_tags=11, num_layers=2, cell='gru'):
    super(RnnDecoder, self).__init__()
    if cell == 'gru':
        self.rnn = nn.GRU(input_size=in_dim, hidden_size=hidden_dim,
                          num_layers=num_layers, dropout=0.5, bidirectional=True)
    elif cell == 'lstm':
        self.rnn = nn.LSTM(input_size=in_dim, hidden_size=hidden_dim,
                           num_layers=num_layers, dropout=0.5, bidirectional=True)
    elif cell == 'sru':
        from sru import SRU
        self.rnn = SRU(input_size=in_dim, hidden_size=hidden_dim,
                       num_layers=num_layers, dropout=0.5, bidirectional=True)
    self.out = nn.Sequential(nn.ReLU(), nn.Dropout(),
                             nn.Linear(hidden_dim * 2, n_tags))
def __init__(self, context_len=21, in_dim=1024, out_dim=1024, num_layers=2, cell='gru'):
    super(RnnEncoder, self).__init__()
    self.hidden_dim = out_dim // 2
    if cell == 'gru':
        self.rnn = nn.GRU(input_size=in_dim, hidden_size=self.hidden_dim,
                          num_layers=num_layers, dropout=0.5, bidirectional=True)
    elif cell == 'lstm':
        self.rnn = nn.LSTM(input_size=in_dim, hidden_size=self.hidden_dim,
                           num_layers=num_layers, dropout=0.5, bidirectional=True)
    elif cell == 'sru':
        from sru import SRU
        self.rnn = SRU(input_size=in_dim, hidden_size=self.hidden_dim,
                       num_layers=num_layers, dropout=0.5, bidirectional=True)
def __init__(self, img_dim=512, num_segments=12, hidden_size=1024, num_class=51):
    super(Recurrent_model, self).__init__()
    self.img_dim = img_dim
    self.num_segments = num_segments
    self.num_class = num_class
    self.rnn = SRU(img_dim, hidden_size,
                   num_layers=3,
                   dropout=0.5,
                   bidirectional=False,
                   layer_norm=False,
                   highway_bias=0,
                   rescale=True)
    # self.rnn = nn.LSTM(img_dim, hidden_size,
    #                    num_layers=3,
    #                    dropout=0.5,
    #                    bidirectional=False)
    # self.rnn = nn.GRU(img_dim, hidden_size,
    #                   num_layers=3,
    #                   dropout=0.5,
    #                   bidirectional=False)
    self.dropout = nn.Dropout()
    self.fc = nn.Linear(hidden_size, self.num_class)
def __init__(self, pretrained_path, vocab, K=620, d=2400, num_stack=4):
    super(TextEncoder, self).__init__()
    self.vocab = vocab
    self.embedding = self.create_emb_layer(
        self.load_dicts(pretrained_path),
        self.load_emb_params(pretrained_path),
        self.vocab, K)
    self.input_size = K
    self.hidden_size = d
    self.num_layers = num_stack
    # self.sru = nn.GRU(self.input_size, self.hidden_size,
    #                   num_layers=self.num_layers,
    #                   dropout=0.25)
    self.sru = SRU(
        self.input_size,
        self.hidden_size,
        num_layers=self.num_layers,  # number of stacking RNN layers
        rnn_dropout=0.25,            # variational dropout applied on linear transformation
        use_tanh=1,                  # use tanh?
        use_relu=0,                  # use ReLU?
        use_selu=0,                  # use SeLU?
        bidirectional=False,         # bidirectional RNN?
        weight_norm=False,           # apply weight normalization on parameters
        layer_norm=False,            # apply layer normalization on the output of each layer
        highway_bias=0               # initial bias of highway gate (<= 0)
    )
def __init__(self, src_vocab_size, input_size, output_size,
             bidirectional=False, with_ln=False, prefix='Encoder', **kwargs):
    super(Encoder, self).__init__()
    self.output_size = output_size
    f = lambda name: str_cat(prefix, name)  # returns 'Encoder_' + parameter name
    self.src_lookup_table = nn.Embedding(src_vocab_size, wargs.src_wemb_size,
                                         padding_idx=PAD)
    if wargs.enc_rnn_type == 'gru':
        self.forw_gru = GRU(input_size, output_size, with_ln=with_ln, prefix=f('Forw'))
        self.back_gru = GRU(output_size, output_size, with_ln=with_ln, prefix=f('Back'))
    elif wargs.enc_rnn_type == 'sru':
        self.rnn = SRU(input_size=input_size,
                       hidden_size=output_size,
                       num_layers=wargs.enc_layer_cnt,
                       dropout=wargs.drop_rate,
                       bidirectional=bidirectional)
def __init__(self, config: Dict):
    """
    :param config: A dictionary containing the model and training configuration.
    """
    super(FAQRetrieval, self).__init__()
    self.device = torch.device('cuda') if config['cuda'] else torch.device('cpu')
    self.embedding_size = config['embedding_size']
    self.hidden_size = config['hidden_size']
    self.bidirectional = config['bidirectional']
    self.num_layers = config['num_layers']
    self.dropout = config['dropout']
    self.rnn_dropout = config['rnn_dropout']
    self.output_size = self.hidden_size * (1 + self.bidirectional)
    self.dropout_layer = nn.Dropout(self.dropout)
    self.rnn = SRU(input_size=self.embedding_size,
                   hidden_size=self.hidden_size,
                   num_layers=self.num_layers,
                   dropout=self.dropout,
                   rnn_dropout=self.rnn_dropout,
                   bidirectional=self.bidirectional,
                   use_tanh=False,
                   layer_norm=False,
                   rescale=False)
    self.config = config
    if self.config['use_attention']:
        self.attention = SelfAttentiveLayer(self.output_size, config)
    self.candi_mat = None
def __init__(self, embed_size, hidden_size, output_size, n_layers=1,
             padding_index=3, dropout=0.0, embedding_dropout=0.0):
    super(Decoder, self).__init__()
    self.embed_size = embed_size
    self.hidden_size = hidden_size
    self.output_size = output_size
    self.n_layers = n_layers
    self.dropout = dropout
    self.embedding_dropout = embedding_dropout
    self.embed = nn.Embedding(output_size, embed_size, padding_idx=padding_index)
    self.attention = Attention(hidden_size)
    self.gru = SRU(hidden_size + embed_size, hidden_size,
                   num_layers=n_layers,
                   layer_norm=True,
                   dropout=dropout)
    # self.out = nn.Linear(hidden_size*2, output_size)
    self.out = nn.Linear(hidden_size, output_size)
def __init__(self, char_embedder=None, sru=True,
             char_embedder_params=None, model_params=None):
    super().__init__()
    if char_embedder is None:
        if char_embedder_params is None:
            char_embedder_params = default_char_embedder_params
        self.char_embedder = CharEmbedder(**char_embedder_params)
    else:
        self.char_embedder = char_embedder
    self.use_gpu = False
    self.sru = sru
    if self.sru:
        if model_params is None:
            model_params = default_sru_model_params
        self._language_model = SRU(input_size=model_params['output_dim'],
                                   hidden_size=model_params['output_dim'],
                                   use_tanh=True,
                                   num_layers=model_params['n_layers'])
    else:
        if model_params is None:
            model_params = default_lstm_model_params
        self._language_model = nn.LSTM(input_size=model_params['output_dim'],
                                       hidden_size=model_params['output_dim'],
                                       num_layers=model_params['n_layers'],
                                       batch_first=True)
    self.model_params = model_params
def __init__(self, args, dim_in):
    super(SruEmb, self).__init__()
    self.dim_out_rnn = args.dimemb
    self.dim_out = args.dimemb
    self.rnn = SRU(dim_in, self.dim_out_rnn,
                   num_layers=args.sru,
                   dropout=0.25,
                   rnn_dropout=0.25,
                   use_tanh=True)
    # self.rnn = nn.LSTM(dim_in, self.dim_out, num_layers=nb_layer,
    #                    dropout=dropout, bidirectional=True)
    self.attn_hop = args.attn_hop
    self.attn_hidden = args.attn_hidden
    self.ws1 = nn.Linear(self.dim_out, self.attn_hidden, bias=False)
    self.ws2 = nn.Linear(self.attn_hidden, self.attn_hop, bias=False)
    self.tanh = nn.Tanh()
    self.softmax = nn.Softmax(dim=2)
    # Number 1 is p = 0.25
    self.drop = nn.Dropout(p=0.5)
    self.fc = nn.Linear(self.dim_out * self.attn_hop, self.dim_out, bias=True)
def test_gru_compatible_state_return():
    N = 5
    max_len = 7
    V = 32
    K = 8
    K_out = 11
    num_layers = 3
    bidirectional = True
    print('N', N, 'max_len', max_len, 'num_layers', num_layers,
          'bidirectional', bidirectional, 'K', K, 'K_out', K_out)
    torch.manual_seed(123)
    np.random.seed(123)
    lengths = torch.from_numpy(np.random.choice(max_len, N)) + 1
    tensors = [torch.from_numpy(np.random.choice(V, l, replace=True))
               for l in lengths.tolist()]
    embedder = nn.Embedding(V, K)
    tensors = nn.utils.rnn.pad_sequence(tensors)
    embedded = embedder(tensors)

    sru = SRU(K, K_out, nn_rnn_compatible_return=True,
              bidirectional=bidirectional, num_layers=num_layers)
    out, state = sru(embedded)
    print('out.size()', out.size())
    print('state.size()', state.size())

    gru = nn.GRU(K, K_out, bidirectional=bidirectional, num_layers=num_layers)
    gru_out, gru_state = gru(embedded)
    print('gru_state.size()', gru_state.size())
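# Shape note for the test above (an assumption based on the flag name, not stated in the
# original source): with nn_rnn_compatible_return=True the SRU final state is expected to
# be reshaped to (num_layers * num_directions, batch, hidden_size), matching nn.GRU's
# convention, which is what the state.size() and gru_state.size() printouts compare.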
def __init__(self, input_size, embed_size, hidden_size, word2vec, n_layers=1,
             padding_index=3, dropout=0.0, embedding_dropout=0.0):
    super(Encoder, self).__init__()
    self.input_size = input_size
    self.hidden_size = hidden_size
    self.embed_size = embed_size
    self.wv = word2vec.wv  # Word2Vec key-vector lookup
    self.gru = SRU(embed_size, hidden_size,
                   num_layers=n_layers,
                   bidirectional=True,
                   layer_norm=True,
                   dropout=dropout)
    self.linear = nn.Linear(hidden_size * 2, hidden_size)
    self.dropout = dropout
    self.embedding_dropout = embedding_dropout
    self.embedding = nn.Embedding(input_size, embed_size,
                                  padding_idx=padding_index)  # embedding layer
def benchmark_sru_cpu():
    print('-' * 60)
    print('SRU CPU benchmark:')
    rnn = SRU(input_size=input_size,
              hidden_size=hidden_size,
              bidirectional=(n_directions == 2),
              num_layers=1)
    input = torch.randn(seq_len, batch_size, input_size)
    h0 = torch.randn(n_layers, batch_size, hidden_size * n_directions)
    print('input.shape', input.shape)
    print('h0.shape', h0.shape)
    with torch.no_grad():
        rnn.eval()
        output, hn = rnn(input, h0)
        print('output.shape', output.shape)
        print('hn.shape', hn.shape)
    n_iter = 1000
    start = time.time()
    with torch.no_grad():
        rnn.eval()
        for i in range(n_iter):
            rnn.forward(input)
    print('Time:', round((time.time() - start), 2), 'sec')
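# benchmark_sru_cpu() above reads module-level configuration that the original excerpt
# does not show; the values below are assumed placeholders chosen so the tensor shapes
# line up (num_layers=1, unidirectional), making the function runnable as-is.
import time
import torch
from sru import SRU

seq_len = 100
batch_size = 32
input_size = 256
hidden_size = 256
n_layers = 1
n_directions = 1

if __name__ == '__main__':
    benchmark_sru_cpu()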
def _build_rnn(self, name, input, units, layers):
    if name == 'lstm':
        return torch.nn.LSTM(input, units, layers, batch_first=True)
    if name == 'gru':
        return torch.nn.GRU(input, units, layers, batch_first=True)
    if name == 'sru':
        from sru import SRU
        return SRU(input, units, layers, dropout=0, layer_norm=False)
def __init__(self, num_layers=4, hidden_size=512):
    """
    Bidirectional recurrent model (SRU) that finds minimums and maximums of time series.

    :param num_layers: number of recurrent layers
    :param hidden_size: size of the hidden recurrent layers
    """
    super().__init__()
    self.rnn = SRU(1, hidden_size, num_layers, bidirectional=True)
    self.classifier = nn.Linear(2 * hidden_size, 3)
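# A minimal shape sketch for the extremum tagger above (assumed shapes and values, not
# from the original source): a univariate series enters time-major with a trailing
# feature dimension of 1, the bidirectional SRU doubles the hidden size, and the linear
# head emits three logits per time step.
import torch
import torch.nn as nn
from sru import SRU

hidden_size = 512
rnn = SRU(1, hidden_size, num_layers=4, bidirectional=True)
classifier = nn.Linear(2 * hidden_size, 3)

series = torch.randn(200, 8, 1)   # (seq_len, batch, 1)
states, _ = rnn(series)           # (200, 8, 2 * hidden_size)
logits = classifier(states)       # (200, 8, 3) per-step class scores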
def __init__(self,
             input_size: int,
             hidden_size: int,
             n_layers: int = 1,
             rnn_type: str = 'lstm',
             dropout: float = 0,
             layer_norm: bool = False,
             highway_bias: float = 0,
             rescale: bool = True,
             enforce_sorted: bool = False,
             attention: Optional[nn.Module] = None,
             activation: Optional[nn.Module] = None,
             **kwargs) -> None:
    """Initializes the RNNDecoder object."""
    super().__init__()
    self.rnn_type = rnn_type
    self.input_size = input_size
    self.hidden_size = hidden_size
    self.enforce_sorted = enforce_sorted
    if rnn_type in ['lstm', 'gru']:
        if kwargs:
            logger.warning(f"The following '{kwargs}' will be ignored " +
                           "as they are only considered when using 'sru' as " +
                           "'rnn_type'")
        rnn_fn = nn.LSTM if rnn_type == 'lstm' else nn.GRU
        self.rnn = rnn_fn(input_size=input_size,
                          hidden_size=hidden_size,
                          num_layers=n_layers,
                          dropout=dropout)
    elif rnn_type == 'sru':
        from sru import SRU
        try:
            self.rnn = SRU(input_size,
                           hidden_size,
                           num_layers=n_layers,
                           dropout=dropout,
                           layer_norm=layer_norm,
                           rescale=rescale,
                           highway_bias=highway_bias,
                           **kwargs)
        except TypeError:
            raise ValueError(f"Unknown kwargs passed to SRU: {kwargs}")
    else:
        raise ValueError(
            f"Unknown rnn type: {rnn_type}, use one of: gru, sru, lstm")
    self.attention = attention
    if self.attention is not None:
        self.linear = torch.nn.Linear(in_features=hidden_size * 2,
                                      out_features=hidden_size)
    self.activation = activation
def __init__(self, input_size, gru_hidden_size, dropout):
    super(FeatureGeneratorSRU, self).__init__()
    self.dropout = dropout
    self.training = False
    self.rnn = SRU(input_size, gru_hidden_size,
                   num_layers=4,
                   use_tanh=0,
                   use_relu=0,
                   use_selu=1,
                   weight_norm=True,
                   dropout=0.2,
                   bidirectional=True)
    self.norm = nn.LayerNorm(256)
def __init__(self, imgH, nc, nclass, nh, width=48, n_rnn=1, isSRU=True,
             leakyRelu=False, with_se=False, with_mean_max_pooling=False):
    super(CRNN, self).__init__()
    assert width % 2 == 0
    channel = lambda i: (2 ** i) * width
    self.cnn = nn.Sequential(
        nn.BatchNorm2d(3, affine=False),
        conv_bn_relu(3, 32, 3, 2, 1),
        nn.MaxPool2d(2, 2, 0, ceil_mode=True),
        res_stageCRNN(32, channel(2), 4, with_se),
        res_stageCRNN(channel(2), channel(3), 8, with_se),
        # res_stageCRNN(channel(3), channel(4), 4, False),
        conv_bn_relu(channel(3), channel(4), 2, 1, 0),
    )
    for name, m in self.named_modules():
        if isinstance(m, nn.Conv2d):
            nn.init.kaiming_normal_(m.weight, mode='fan_out')
            if m.bias is not None:
                nn.init.constant_(m.bias, 0)
        elif isinstance(m, nn.BatchNorm2d):
            nn.init.constant_(m.running_mean, 0)
            nn.init.constant_(m.running_var, 1)
            if m.weight is not None:
                nn.init.constant_(m.weight, 1)
            if m.bias is not None:
                nn.init.constant_(m.bias, 0)
    if not isSRU:
        self.rnn = nn.LSTM(channel(4), nh, n_rnn, bidirectional=True)
    else:
        self.rnn = SRU(
            channel(4), nh,
            num_layers=n_rnn,    # number of stacking RNN layers
            dropout=0.0,         # dropout applied between RNN layers
            rnn_dropout=0.0,     # variational dropout applied on linear transformation
            use_tanh=1,          # use tanh?
            use_relu=0,          # use ReLU?
            bidirectional=True   # bidirectional RNN?
        )
    self.embeddingCTC = nn.Linear(nh * 2, nclass)
    self.attention = Attention(nh * 2, nh, nclass, 256)
def __init__(self,
             input_dim=257,
             output_dim=257,
             hidden_layers=2,
             hidden_units=512,
             left_context=1,
             right_context=1,
             kernel_size=6,
             kernel_num=9,
             target_mode='MSA',
             dropout=0.2):
    super(SRUC, self).__init__()
    self.input_dim = input_dim
    self.output_dim = output_dim
    self.hidden_layers = hidden_layers
    self.hidden_units = hidden_units
    self.left_context = left_context
    self.right_context = right_context
    self.kernel_size = kernel_size
    self.kernel_num = kernel_num
    self.target_mode = target_mode

    self.input_layer = nn.Sequential(
        nn.Linear((left_context + 1 + right_context) * input_dim, hidden_units),
        nn.Tanh())

    self.rnn_layer = SRU(input_size=hidden_units,
                         hidden_size=hidden_units,
                         num_layers=self.hidden_layers,
                         dropout=dropout,
                         rescale=True,
                         bidirectional=False,
                         layer_norm=False)

    self.conv2d_layer = nn.Sequential(
        # nn.Conv2d(in_channels=1, out_channels=kernel_num,
        #           kernel_size=(kernel_size, kernel_size), stride=[1, 1],
        #           padding=(5, 5), dilation=(2, 2)),
        modules.Conv2d(in_channels=1, out_channels=kernel_num,
                       kernel_size=(kernel_size, kernel_size)),
        nn.Tanh(),
        nn.MaxPool2d(3, stride=1, padding=(1, 1)))

    self.output_layer = nn.Sequential(
        nn.Linear(hidden_units * kernel_num,
                  (left_context + 1 + right_context) * self.output_dim),
        nn.Sigmoid())
def __init__(self, nIn, nHidden, nOut, isSRU, nLayer):
    super(BidirectionalLSTM_Embed, self).__init__()
    if not isSRU:
        self.rnn = nn.LSTM(nIn, nHidden, nLayer, bidirectional=True)
    else:
        self.rnn = SRU(
            nIn, nHidden,
            num_layers=nLayer,   # number of stacking RNN layers
            dropout=0.0,         # dropout applied between RNN layers
            rnn_dropout=0.0,     # variational dropout applied on linear transformation
            use_tanh=1,          # use tanh?
            use_relu=0,          # use ReLU?
            bidirectional=True   # bidirectional RNN?
        )
def test_packed():
    N = 5
    max_len = 7
    V = 32
    K = 8
    K_out = 11
    print('N', N, 'max_len', max_len, 'K', K, 'K_out', K_out)
    torch.manual_seed(123)
    np.random.seed(123)
    lengths = torch.from_numpy(np.random.choice(max_len, N)) + 1
    tensors = [torch.from_numpy(np.random.choice(V, l, replace=True))
               for l in lengths.tolist()]
    embedder = nn.Embedding(V, K)
    tensors = nn.utils.rnn.pad_sequence(tensors)
    print('tensors.size()', tensors.size())
    embedded = embedder(tensors)
    print('embedded.size()', embedded.size())
    packed = nn.utils.rnn.pack_padded_sequence(embedded, lengths, batch_first=False,
                                               enforce_sorted=False)
    print(isinstance(packed, nn.utils.rnn.PackedSequence))
    sru = SRU(K, K_out)
    out1, state = sru(packed)
    out1, lengths1 = nn.utils.rnn.pad_packed_sequence(out1)
    print('out1.size()', out1.size())
    assert (lengths != lengths1).sum().item() == 0
    print('out1.sum()', out1.sum().item())

    # change a position that is masked out (padding); the output should not change
    tensors[6, 1] = 3
    embedded = embedder(tensors)
    packed = nn.utils.rnn.pack_padded_sequence(embedded, lengths, batch_first=False,
                                               enforce_sorted=False)
    out2, state = sru(packed)
    out2, lengths2 = nn.utils.rnn.pad_packed_sequence(out2)
    assert (lengths != lengths2).sum().item() == 0
    print('out2.sum()', out2.sum().item())
    assert out2.sum().item() == out1.sum().item()

    # change a position that is not masked out; the output should change
    tensors[1, 1] = 3
    embedded = embedder(tensors)
    packed = nn.utils.rnn.pack_padded_sequence(embedded, lengths, batch_first=False,
                                               enforce_sorted=False)
    out3, state = sru(packed)
    out3, lengths3 = nn.utils.rnn.pad_packed_sequence(out3)
    assert (lengths != lengths3).sum().item() == 0
    print('out3.sum()', out3.sum().item())
    assert out3.sum().item() != out1.sum().item()
def prepare(self, p):
    # input has length 20, batch size 32 and dimension 128
    self.x = Variable(torch.rand(20, 32, 128).cuda())
    input_size, hidden_size = 128, 128
    self.rnn = SRU(
        input_size, hidden_size,
        num_layers=2,         # number of stacking RNN layers
        dropout=0.00001,      # dropout applied between RNN layers
        rnn_dropout=0.0001,   # variational dropout applied on linear transformation
        use_tanh=1,           # use tanh?
        use_relu=0,           # use ReLU?
        bidirectional=False,  # bidirectional RNN?
        use_kernel=p.use_kernel,
    )
    self.rnn.cuda()
def run_sru(cpu=0, gpu=0, jit=False, use_kernel=False, backward=False,
            warmup=10, benchmark=20):
    assert not (jit and use_kernel)
    benchmark_init(0, 0, True)

    # input has length 20, batch size 32 and dimension 128
    x = Variable(torch.rand(20, 32, 128).cuda())
    input_size, hidden_size = 128, 128
    rnn = SRU(
        input_size, hidden_size,
        num_layers=2,         # number of stacking RNN layers
        dropout=0.00001,      # dropout applied between RNN layers
        rnn_dropout=0.0001,   # variational dropout applied on linear transformation
        use_tanh=1,           # use tanh?
        use_relu=0,           # use ReLU?
        bidirectional=False,  # bidirectional RNN?
        use_kernel=use_kernel,
        jit=jit,
    )
    rnn.cuda()

    kernel_tag = '_kernel' if use_kernel else ''
    backward_tag = '_training' if backward else '_forward'
    jit_tag = '_jit' if jit else ''
    name = 'sru{}{}{}'.format(backward_tag, kernel_tag, jit_tag)
    iter_timer = Bench(cuda=True, name=name, warmup_iters=warmup)

    for _ in range(warmup + benchmark):
        gc.collect()
        with iter_timer:
            output, hidden = rnn(x)  # forward pass
            if backward:
                output.sum().backward()
    # output is (length, batch size, hidden size * number of directions)
    # hidden is (layers, batch size, hidden size * number of directions)
    return iter_timer