def build_hybrid_model(emb_size=EMBEDDING_SIZE):
    numerical_input = Input(shape=(numerical_timestep, attribute_num))
    textual_input = Input(shape=(DATE_INTERVAL_NEWS, MAX_NEWS_NUM, emb_size))

    # textual branch: per-day news attention, then a BiGRU over days
    x1 = textual_input
    x1 = TimeDistributed(Masking(mask_value=0.))(x1)
    x1 = TimeDistributed(AttentionLayer())(x1)
    # x1 = TimeDistributed(Dropout(0.2, seed=35))(x1)
    x1 = TimeDistributed(Dense(100, activation='relu'))(x1)
    x1 = Bidirectional(GRU(50, return_sequences=True))(x1)
    x1 = AttentionLayer()(x1)
    # x1 = Dropout(0.2, seed=71)(x1)
    x1 = Dense(10, activation='relu')(x1)

    # numerical branch: stacked GRUs over the price/attribute time series
    x2 = numerical_input
    x2 = GRU(100, return_sequences=True)(x2)
    x2 = Dropout(0.2, seed=2)(x2)
    x2 = GRU(100)(x2)
    x2 = Dropout(0.2, seed=7)(x2)
    x2 = Dense(10, activation='relu')(x2)

    x = concatenate([x1, x2])
    x = Dense(2, activation='softmax')(x)

    model = Model(inputs=[textual_input, numerical_input], outputs=x)
    model.compile(loss='categorical_crossentropy', optimizer='adam', metrics=['accuracy'])
    return model
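A minimal smoke test for build_hybrid_model, assuming the module-level constants referenced above (EMBEDDING_SIZE, DATE_INTERVAL_NEWS, MAX_NEWS_NUM, numerical_timestep, attribute_num) are defined; the dummy arrays and the batch size of 8 are placeholders, not part of the original code.

# Hypothetical usage sketch; shapes follow the Input definitions in build_hybrid_model.
import numpy as np

model = build_hybrid_model()
dummy_news = np.zeros((8, DATE_INTERVAL_NEWS, MAX_NEWS_NUM, EMBEDDING_SIZE), dtype='float32')
dummy_prices = np.zeros((8, numerical_timestep, attribute_num), dtype='float32')
dummy_labels = np.eye(2)[np.random.randint(0, 2, size=8)]  # one-hot targets for the 2-way softmax
# Model(inputs=[textual_input, numerical_input], ...), so the news tensor comes first.
model.fit([dummy_news, dummy_prices], dummy_labels, epochs=1, batch_size=8)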
def __init__(self,
             vocab_size=None,
             word_embed_dim=200,
             sent_len=None,
             doc_len=None,
             n_classes=None,
             gru_dim=50,
             pretrained_word_vectors=None,
             batch_size=64,
             verbose=2):
    # Constants
    self.batch_size = batch_size
    self.optimizer = 'adam'
    self.metrics = ['accuracy']

    # Parameters
    ## Word Embedding
    if pretrained_word_vectors is not None:
        if not isinstance(pretrained_word_vectors, list):
            pretrained_word_vectors = [pretrained_word_vectors]
        vocab_size = pretrained_word_vectors[0].shape[0]
        word_embed_dim = pretrained_word_vectors[0].shape[1]
    self.vocab_size = vocab_size
    self.word_embed_dim = word_embed_dim
    self.sent_len = sent_len
    self.verbose = verbose

    ## Word-Level BiGRU
    self.gru_dim = gru_dim
    self.sentence_encoder_input_shape = (self.sent_len,)
    self.doc_len = doc_len
    self.han_input_shape = (self.doc_len, self.sent_len,)

    ## Output Layer
    if not isinstance(n_classes, int) or n_classes < 1:
        raise ValueError("`n_classes` must be a positive integer.")
    if n_classes == 1:
        self.output_activation = 'sigmoid'
        self.loss = 'binary_crossentropy'
    else:
        self.output_activation = 'softmax'
        self.loss = 'categorical_crossentropy'

    self.word_embedding_layer = Embedding(self.vocab_size,
                                          self.word_embed_dim,
                                          input_length=self.sent_len,
                                          mask_zero=True,
                                          name='word_embeddings',
                                          weights=pretrained_word_vectors)
    self.word_bi_gru_layer = Bidirectional(GRU(self.gru_dim, return_sequences=True),
                                           name='word_bi_gru')
    self.word_attention_layer = AttentionLayer(name='word_attention')
    self.sentence_bi_gru_layer = Bidirectional(GRU(self.gru_dim, return_sequences=True),
                                               name='sentence_bi_gru')
    self.sentence_attention_layer = AttentionLayer(name='sentence_attention')
    self.sentence_weighted_average_layer = WeightedAverage(name='document_embedding')
    self.output_layer = Dense(n_classes, activation=self.output_activation,
                              name='document_output')
    self.td_word_embedding_layer = TimeDistributedWithMasking(self.word_embedding_layer,
                                                              name='td_word_embeddings',
                                                              weights=pretrained_word_vectors)
    self.td_word_bi_gru_layer = TimeDistributedWithMasking(self.word_bi_gru_layer,
                                                           name='td_word_bi_gru')
    self.td_word_attention_layer = TimeDistributedWithMasking(self.word_attention_layer,
                                                              name='td_word_attention')
    self.td_word_weighted_average_layer = WeightedAverage(name='sentence_vectors')

    # Models
    self._td_word_attention = None
    self._sentence_attention = None
    self.han = None
def get_discriminator(config):
    df_dim = config['df_dim']
    img = Input(shape=(config['img_size'], config['img_size'], 3),
                batch_size=config['batch_size'], name='image')
    condition_label = Input(shape=(), batch_size=config['batch_size'],
                            dtype=tf.int32, name='condition_label')
    x = img

    # to handle different sizes of images.
    power = np.log2(config['img_size'] / 4).astype('int')  # 64->4; 128->5
    for p in range(power):
        x = Block(x, df_dim * 2**p)
        if config['use_attention'] and int(x.shape[1]) in config['attn_dim_G']:
            x = AttentionLayer()(x)

    if config['use_label']:
        x = tf.reduce_sum(x, axis=[1, 2])
        outputs = layers.Dense(1)(x)
        # embedding = layers.Embedding(config['num_classes'], df_dim * 2 ** (power - 1))
        # label_feature = SpectralNormalization(embedding)(condition_label)
        label_feature = layers.Embedding(config['num_classes'],
                                         df_dim * 2**(power - 1))(condition_label)
        outputs += tf.reduce_sum(x * label_feature, axis=1, keepdims=True)
        return Model(inputs=[img, condition_label], outputs=outputs)
    else:
        outputs = layers.Conv2D(1, 4, 1, padding='same')(x)
        return Model(inputs=[img, condition_label], outputs=outputs)
def get_generator(config):
    gf_dim = config['gf_dim']
    z = Input(shape=(config['z_dim'],), batch_size=config['batch_size'], name='noisy')
    condition_label = Input(shape=(), batch_size=config['batch_size'],
                            dtype=tf.int32, name='condition_label')

    if config['use_label']:
        one_hot_label = tf.one_hot(condition_label, depth=config['num_classes'])
        # concatenate the noise vector with the label (z, not the undefined x)
        x = layers.Concatenate()([z, one_hot_label])
    else:
        x = z

    x = SpectralNormalization(layers.Dense(4 * 4 * gf_dim * 16))(x)
    x = tf.reshape(x, [-1, 4, 4, gf_dim * 16])

    # to handle different sizes of images.
    power = np.log2(config['img_size'] / 4).astype('int')  # 64->4; 128->5
    for p in reversed(range(power)):
        x = Block(x, gf_dim * (2**p))
        if config['use_attention'] and int(x.shape[1]) in config['attn_dim_G']:
            x = AttentionLayer()(x)

    outputs = layers.Conv2D(3, 4, 1, padding='same', use_bias=False, activation='tanh')(x)
    return Model(inputs=[z, condition_label], outputs=outputs)
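A hypothetical config dict for get_generator / get_discriminator above; it contains only the keys those functions actually read, and every value is a placeholder, not taken from the original project.

# Sketch of a config consumed by get_generator / get_discriminator (illustrative values).
config = {
    'z_dim': 128,
    'batch_size': 16,
    'img_size': 64,        # power = log2(64 / 4) = 4 up/down-sampling blocks
    'gf_dim': 64,
    'df_dim': 64,
    'num_classes': 10,
    'use_label': True,
    'use_attention': True,
    'attn_dim_G': [32],    # spatial sizes at which AttentionLayer is inserted
}
generator = get_generator(config)
discriminator = get_discriminator(config)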
def __init__(self,
             vocab_size=300,
             emb_dim=300,
             maxlen=10,
             n_aspects=10,
             pretrained_embeddings=None,
             aspect_matrix=None):
    super().__init__()
    self.vocab_size = vocab_size
    self.emb_dim = emb_dim
    self.maxlen = maxlen
    self.n_aspects = n_aspects
    self.aspect_matrix = torch.from_numpy(
        aspect_matrix,
    ).to(TORCH_DEVICE).requires_grad_(requires_grad=True)
    self.embedding = nn.Embedding.from_pretrained(
        pretrained_embeddings,
        freeze=True,
        padding_idx=0).to(TORCH_DEVICE).requires_grad_(requires_grad=True)  # (vocab_size, emb_dim)
    self.average_emb = AverageEmbedding()  # (maxlen, emb_dim)
    self.attention = AttentionLayer(emb_dim)
    self.weighted_emb = WeightedEmbeddings()
    self.linear = nn.Linear(emb_dim, n_aspects)
    self.weighted_aspects = WeightedAspects(self.aspect_matrix)
def test_RNNDecoderLayer(self):
    rnn_cell_output_dim = 3
    rnn_cell = GRU(output_dim=rnn_cell_output_dim, return_sequences=True)
    attention_context_dim = 2
    attention = AttentionLayer(attention_context_dim=attention_context_dim)
    embedding_dim = 4
    embedding_vac_size = 5
    embedding = Embedding(input_dim=embedding_vac_size,
                          output_dim=embedding_dim,
                          weights=[
                              np.array([[0, 0, 0, 0],
                                        [1, 2, 3, 4],
                                        [5, 6, 7, 8],
                                        [9, 1, 3, 4],
                                        [8, 7, 4, 2]])
                          ])
    layer = RNNDecoderLayer(rnn_cell, attention, embedding)

    # test config: should use custom objects for custom layers
    custom_objects = {AttentionLayer.__name__: AttentionLayer}
    self.assertEqual(
        layer.get_config(),
        RNNDecoderLayer.from_config(layer.get_config(), custom_objects).get_config(),
        "config")

    x = Input((None,), dtype='int32')
    context = Input((None, embedding_dim))
    outputs = layer([x, context])
    self.assertEqual(outputs._keras_shape, (None, None, rnn_cell_output_dim), "_keras_shape")

    f = K.function(inputs=[x, context], outputs=[outputs])
    x_val = [[1, 1, 3, 4], [1, 2, 4, 0]]
    context_val = [[[0.1, 0.2, 0.3, 0.4], [0.3, 0.5, 0.7, 0.2]],
                   [[0.2, 0.1, 0.5, 0.6], [0.4, 0.3, 0.8, 0.1]]]
    output_val = f([x_val, context_val])[0]
    self.assertEqual(output_val.shape, (2, 4, rnn_cell_output_dim), "output_val")
def get_res_discriminator(config):
    df_dim = config['df_dim']
    img = Input(shape=(config['img_size'], config['img_size'], 3), name='image')
    power = np.log2(config['img_size'] / 4).astype('int')
    condition_label = Input(shape=(), dtype=tf.int32, name='condition_label')

    x = Optimized_Block(img, df_dim * 1)  # 64x64
    for p in range(1, power):
        x = Res_Block(x, df_dim * 2**p)  # 32x32
        if config['use_attention'] and int(x.shape[1]) in config['attn_dim_G']:
            x = AttentionLayer()(x)
    x = Res_Block(x, df_dim * 2**power, downsample=False)  # 4x4

    if config['use_label']:
        x = layers.ReLU()(x)
        x = tf.reduce_sum(x, axis=[1, 2])
        outputs = SpectralNormalization(layers.Dense(1))(x)
        # embedding = layers.Embedding(config['num_classes'], df_dim * 16)
        # label_feature = SpectralNormalization(embedding)(condition_label)
        label_feature = layers.Embedding(config['num_classes'], df_dim * 16)(condition_label)
        outputs += tf.reduce_sum(x * label_feature, axis=1, keepdims=True)
        return Model(inputs=[img, condition_label], outputs=outputs)
    else:
        outputs = layers.Conv2D(1, 4, 1, padding='same')(x)
        # outputs = SpectralNormalization(conv)(x)
        return Model(inputs=[img, condition_label], outputs=outputs)
def get_res_generator(config):
    gf_dim = config['gf_dim']
    z = Input(shape=(config['z_dim'],), name='noisy')
    condition_label = Input(shape=(), dtype=tf.int32, name='condition_label')

    # to handle different sizes of images (computed before it is used below).
    power = np.log2(config['img_size'] / 4).astype('int')

    if config['use_label']:
        one_hot_label = tf.one_hot(condition_label, depth=config['num_classes'])
        x = layers.Concatenate()([z, one_hot_label])
    else:
        x = z

    x = SpectralNormalization(layers.Dense(4 * 4 * gf_dim * 2**(power - 1)))(x)
    x = tf.reshape(x, [-1, 4, 4, gf_dim * 2**(power - 1)])

    for p in reversed(range(power)):
        x = Res_Block(x, gf_dim * 2**p)
        if config['use_attention'] and int(x.shape[1]) in config['attn_dim_G']:
            x = AttentionLayer()(x)

    # x = layers.BatchNormalization()(x)
    # x = layers.ReLU()(x)
    outputs = layers.Conv2D(3, 1, 1, padding='same', activation='tanh')(x)
    return Model(inputs=[z, condition_label], outputs=outputs)
def compute_states(self, inputs, lengths):
    bi_states, _ = self.run_rnn(inputs, lengths)
    fw_out, bw_out = bi_states
    rnn_outputs = tf.concat(2, [fw_out, bw_out])  # [batch_size, num_steps, 2*size]
    atn_layer = AttentionLayer(in_dim=2 * self.hidden_size,
                               dim=self.config.atn_hidden_size,
                               num_steps=self.num_steps,
                               name="Attention_Layer")
    hidden_vector = self.hidden_vector = atn_layer.get_output(fan_in=rnn_outputs,
                                                              name="hidden_vector")
    return hidden_vector
def build_textual_model(emb_size=EMBEDDING_SIZE):
    news_input = Input(shape=(DATE_INTERVAL_NEWS, MAX_NEWS_NUM, emb_size))

    x = news_input
    x = TimeDistributed(Masking(mask_value=0.))(x)
    x = TimeDistributed(AttentionLayer())(x)
    # x = TimeDistributed(Dropout(0.2, seed=35))(x)
    x = TimeDistributed(Dense(100, activation='relu'))(x)
    x = Bidirectional(GRU(50, return_sequences=True))(x)
    x = AttentionLayer()(x)
    # x = Dropout(0.2, seed=71)(x)
    x = Dense(10, activation='relu')(x)
    x = Dense(2, activation='softmax')(x)

    model = Model(inputs=news_input, outputs=x)
    model.compile(loss='categorical_crossentropy', optimizer='adam', metrics=['accuracy'])
    return model
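A short inference sketch for build_textual_model, assuming the same module-level constants as above; the batch of zeros is a placeholder input.

# Hypothetical prediction call; the input shape mirrors the news_input definition.
import numpy as np

model = build_textual_model()
news_batch = np.zeros((4, DATE_INTERVAL_NEWS, MAX_NEWS_NUM, EMBEDDING_SIZE), dtype='float32')
probs = model.predict(news_batch)  # shape (4, 2): softmax over the two classes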
def __init__(self, num_feature, hidden_dim, num_class, class_hidden, adj, gcn_adj,
             input_dropout, dropout, weight_dropout=0):
    super().__init__()
    self.num_feature = num_feature
    self.hidden_dim = hidden_dim
    self.num_class = num_class
    self.adj = adj
    self.gcn_adj = gcn_adj
    self.m1 = AttentionLayer(num_feature, hidden_dim, num_class, class_hidden,
                             input_dropout, weight_dropout=weight_dropout)
    self.m2 = AttentionLayer(hidden_dim, num_class, class_hidden, class_hidden,
                             input_dropout, weight_dropout=weight_dropout)
    self.g1 = GraphConvolution(num_feature, hidden_dim, weight_dropout=weight_dropout)
    self.g2 = GraphConvolution(hidden_dim, num_class, weight_dropout=weight_dropout)
    self.input_dropout = input_dropout
    self.dropout = dropout
def test_RNNDecoderLayerWithBeamSearch(self):
    rnn_cell_output_dim = 3
    rnn_cell = GRU(output_dim=rnn_cell_output_dim, return_sequences=True)
    attention_context_dim = 2
    attention = AttentionLayer(attention_context_dim=attention_context_dim)
    embedding_dim = 4
    embedding_vac_size = 5
    embedding = Embedding(input_dim=embedding_vac_size, output_dim=embedding_dim)
    classifier_output_layer = Dense(output_dim=embedding_vac_size, activation='softmax')

    hidden_unit_numbers = [2, 3, 4]
    hidden_unit_activation_functions = ['relu', 'relu', 'relu']
    hidden_layers = []
    for hidden_unit_number, hidden_unit_activation_function in zip(
            hidden_unit_numbers, hidden_unit_activation_functions):
        layer = Dense(hidden_unit_number, activation=hidden_unit_activation_function)
        hidden_layers.append(layer)
    mlp_classifier = MLPClassifierLayer(classifier_output_layer, hidden_layers)

    layer = RNNDecoderLayerWithBeamSearch(mlp_classifier=mlp_classifier,
                                          max_output_length=2,
                                          beam_size=3,
                                          rnn_cell=rnn_cell,
                                          attention=attention,
                                          embedding=embedding)

    # test config: should use custom objects for custom layers
    custom_objects = {
        AttentionLayer.__name__: AttentionLayer,
        MLPClassifierLayer.__name__: MLPClassifierLayer
    }
    self.assertEqual(
        layer.get_config(),
        RNNDecoderLayerWithBeamSearch.from_config(layer.get_config(),
                                                  custom_objects).get_config(),
        "config")

    initial_input = Input((1,), dtype='int32')
    context = Input((None, embedding_dim))
    outputs = layer([initial_input, context])

    f = K.function(inputs=[initial_input, context], outputs=outputs)
    initial_input_val = [[0], [0]]  # two samples
    context_val = [[[0.1, 0.2, 0.3, 0.4], [0.3, 0.5, 0.7, 0.2]],
                   [[0.2, 0.1, 0.5, 0.6], [0.4, 0.3, 0.8, 0.1]]]
    outputs_val = f([initial_input_val, context_val])
    self.assertEqual(outputs_val[0].shape,
                     (layer.max_output_length, 2, layer.beam_size), "output_label_id")
    self.assertEqual(outputs_val[1].shape,
                     (layer.max_output_length, 2, layer.beam_size), "prev_output_index")
    self.assertEqual(outputs_val[2].shape,
                     (layer.max_output_length, 2, layer.beam_size), "output_score")
def test_RNNDecoderLayerBase(self):
    rnn_cell_output_dim = 3
    rnn_cell = GRU(output_dim=rnn_cell_output_dim, return_sequences=True)
    attention_context_dim = 2
    attention = AttentionLayer(attention_context_dim=attention_context_dim)
    embedding_dim = 4
    embedding_vac_size = 5
    embedding = Embedding(input_dim=embedding_vac_size, output_dim=embedding_dim)
    layer = RNNDecoderLayerBase(rnn_cell, attention, embedding)

    # test config: should use custom objects for custom layers
    custom_objects = {AttentionLayer.__name__: AttentionLayer}
    self.assertEqual(
        layer.get_config(),
        RNNDecoderLayerBase.from_config(layer.get_config(), custom_objects).get_config(),
        "config")

    # test step: before calling step, build the layer first
    input_x_shape = (None, None)
    context_shape = (None, None, embedding_dim)
    layer.build(input_shapes=[input_x_shape, context_shape])

    x_step = K.placeholder((None, embedding_dim))
    context = K.placeholder((None, None, embedding_dim))
    state = K.placeholder((None, rnn_cell_output_dim))
    constants = rnn_cell.get_constants(K.expand_dims(x_step, 1))
    output, states = layer.step(x_step, [state] + constants, context)

    f = K.function(inputs=[x_step, context, state], outputs=[output, states[0]])
    x_step_val = [[1, 2, 3, 4], [5, 6, 7, 8]]
    context_val = [[[0.1, 0.2, 0.3, 0.4], [0.3, 0.5, 0.7, 0.2]],
                   [[0.2, 0.1, 0.5, 0.6], [0.4, 0.3, 0.8, 0.1]]]
    state_val = [[1, 2, 3], [0.1, 0.2, 0.3]]
    outputs_val = f([x_step_val, context_val, state_val])
    rnn_cell_output_val = outputs_val[0]
    self.assertEqual(rnn_cell_output_val.shape, (2, rnn_cell_output_dim),
                     "rnn_cell_output_val")
def test_AttentionLayer(self):
    attention_context_dim = 2
    init_W_a = np.array([[1, 2], [3, 4], [5, 6]])          # 3*2
    init_U_a = np.array([[1, 2], [3, 4], [5, 6], [7, 8]])  # 4*2
    init_v_a = np.array([0.1, 0.2])
    layer = AttentionLayer(attention_context_dim=attention_context_dim,
                           weights=[init_W_a, init_U_a, init_v_a])

    # test config
    self.assertEqual(
        layer.get_config(),
        AttentionLayer.from_config(layer.get_config()).get_config(),
        "config")

    s = Input((3,))       # current state tensor
    h = Input((None, 4))  # context
    self.assertEqual(layer([s, h])._keras_shape, (None, 4), "_keras_shape")

    tensors_to_debug = []
    output = AttentionLayer._calc(s, h,
                                  K.variable(init_W_a),
                                  K.variable(init_U_a),
                                  K.variable(init_v_a),
                                  tensors_to_debug=tensors_to_debug)

    # check with call to see detailed computation process
    f = K.function(inputs=[s, h], outputs=[output] + tensors_to_debug)
    s_val = [[1, 2, 3], [4, 5, 6]]
    h_val = [[[1, 2, 3, 4], [5, 6, 7, 8]],
             [[0.1, 0.2, 0.3, 0.4], [0.5, 0.6, 0.7, 0.8]]]
    output_val_ref = [[3, 4, 5, 6], [0.3, 0.4, 0.5, 0.6]]
    output_val_list = f([s_val, h_val])
    output_val = output_val_list[0]
    W_U_sum_val = output_val_list[3]
    W_U_sum_val_ref = [[[72., 88.], [136., 168.]],
                       [[54., 70.], [60.4, 78.]]]
    self.assertTrue(np.sum(np.abs(output_val - output_val_ref)) < 0.0001, 'output_val')
    self.assertTrue(np.sum(np.abs(W_U_sum_val - W_U_sum_val_ref)) < 0.0001, 'W_U_sum_val')
class LSTMEncoderDecoderAtt(nn.Module):
    def __init__(self, src_w2i, src_i2w, tgt_w2i, tgt_i2w, embedding_dim,
                 encoder_hidden_dim, decoder_hidden_dim, encoder_n_layers,
                 decoder_n_layers, encoder_drop_prob=0.5, decoder_drop_prob=0.5,
                 lr=0.01, teacher_forcing_ratio=0.5, gradient_clip=5,
                 model_store_path=None):
        super(LSTMEncoderDecoderAtt, self).__init__()
        self.encoder_hidden_dim = encoder_hidden_dim
        self.decoder_hidden_dim = decoder_hidden_dim
        self.decoder_n_layers = decoder_n_layers
        self.teacher_forcing_ratio = teacher_forcing_ratio
        self.gradient_clip = gradient_clip

        self.encoder = SimpleLSTMEncoderLayer(len(src_w2i), embedding_dim,
                                              encoder_hidden_dim, encoder_n_layers,
                                              encoder_drop_prob)
        self.decoder = SimpleLSTMDecoderLayer(len(tgt_w2i), embedding_dim,
                                              encoder_hidden_dim * 2, decoder_hidden_dim,
                                              decoder_n_layers, decoder_drop_prob)
        # *2 because the encoder is bidirectional and thus its hidden size is doubled
        self.attention = AttentionLayer(encoder_hidden_dim * 2, decoder_hidden_dim)
        self.optimizer = torch.optim.Adam(list(self.encoder.parameters()) +
                                          list(self.decoder.parameters()) +
                                          list(self.attention.parameters()), lr=lr)
        self.criterion = nn.CrossEntropyLoss(ignore_index=0)

        self.src_w2i = src_w2i
        self.src_i2w = src_i2w
        self.tgt_w2i = tgt_w2i
        self.tgt_i2w = tgt_i2w
        self.epoch = 0
        self.lr = lr
        self.src_vocab_size = len(src_w2i)
        self.tgt_vocab_size = len(tgt_w2i)
        print("Source vocab size: {}".format(self.src_vocab_size))
        print("Target vocab size: {}".format(self.tgt_vocab_size))

        self.train_on_gpu = torch.cuda.is_available()
        if self.train_on_gpu:
            print('Training on GPU.')
        else:
            print('No GPU available, training on CPU.')
        self.device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

        if model_store_path is None:
            self.model_store_path = os.path.dirname(os.path.realpath(__file__))
        else:
            self.model_store_path = model_store_path
        if not os.path.exists(model_store_path):
            os.makedirs(model_store_path)

        self.log_path = os.path.join(self.model_store_path, "log")
        self.log = Log(self.log_path, clear=True)

    def show_tensor(x, prediction=None, source=None):  # x is a numpy 2d matrix
        fig = plt.figure(figsize=(12, 6))
        sns.heatmap(x, cmap="rainbow")
        plt.tight_layout()
        return fig

    def train(self, train_loader, valid_loader, test_loader, batch_size, patience=10):
        current_patience = patience

        # move model to GPU, if available
        if self.train_on_gpu:
            self.encoder.cuda()
            self.decoder.cuda()
            self.attention.cuda()

        best_loss = 1000000.
        best_epoch = -1
        while current_patience > 0:
            current_patience -= 1
            train_loss = self._train_epoch(train_loader)
            self.save_checkpoint("last")
            eval_loss = self._eval(valid_loader)
            if eval_loss < best_loss:
                current_patience = patience
                best_loss = eval_loss
                best_epoch = self.epoch
                self.save_checkpoint("best")
            print("\nEpoch \033[93m{:d}\033[0m training loss \033[93m{:.6f}\033[0m, eval loss \033[93m{:.6f}\033[0m, best loss \033[93m{:.6f}\033[0m at epoch \033[93m{:d}\033[0m\n".format(
                self.epoch, train_loss, eval_loss, best_loss, best_epoch))

    def _train_epoch(self, train_loader):
        self.epoch += 1
        self.encoder.train()
        self.decoder.train()
        self.attention.train()
        total_loss = 0.
        pbar = ProgressBar()
        pbar.set(total_steps=len(train_loader))

        for counter, (x, y) in enumerate(train_loader):
            batch_size = x.size(0)
            max_seq_len_x = x.size(1)  # x is 64 x 399 (variable)
            max_seq_len_y = y.size(1)  # y is 64 x variable
            pbar.update(progress=counter,
                        text="Epoch {:d}, progress {}/{}, train average loss \033[93m{:.6f}\033[0m (mx/my = {}/{}) ... ".format(
                            self.epoch, counter, len(train_loader),
                            total_loss / (counter + 1), max_seq_len_x, max_seq_len_y))
            # if counter > 1:
            #     break
            if counter % 1000 == 0 and counter > 0:
                self.save_checkpoint("last")

            loss = 0
            # print(x.size())  # x is a 64 * 399 tensor (batch * max_seq_len_x)
            if self.train_on_gpu:
                x, y = x.cuda(), y.cuda()

            encoder_hidden = self.encoder.init_hidden(batch_size)
            decoder_hidden = self.decoder.init_hidden(batch_size)
            # print(decoder_hidden[0].size())

            # zero grads in optimizer
            self.optimizer.zero_grad()

            # encoder
            # x is batch_size x max_seq_len_x
            encoder_output, encoder_hidden = self.encoder(x, encoder_hidden)
            # encoder_output is batch_size x max_seq_len_x x encoder_hidden
            # print(encoder_output.size())

            # create first decoder output for the initial attention call, extracted from decoder_hidden
            decoder_output = decoder_hidden[0].view(self.decoder_n_layers, 1, batch_size,
                                                    self.decoder_hidden_dim)  # torch.Size([2, 1, 64, 512])
            # it should look like batch_size x 1 x decoder_hidden_size, so transform it
            decoder_output = decoder_output[-1].permute(1, 0, 2)
            # print(decoder_output.size())

            loss = 0
            for i in range(max_seq_len_y):
                # why is decoder_hidden initialized per epoch and not per batch??
                # print("\t Decoder step {}/{}".format(i, max_seq_len_y))

                # teacher forcing (or it is the first word, which is always start-of-sentence)
                if random.random() <= self.teacher_forcing_ratio or i == 0:
                    # the 1 in the middle is because the lstm expects (batch, seq_len, input_size)
                    decoder_input = torch.zeros(batch_size, 1, dtype=torch.long, device=self.device)
                    for j in range(batch_size):
                        decoder_input[j] = y[j][i]
                    # print(decoder_input.size())  # batch_size x 1
                else:
                    # feed own previous prediction extracted from word_softmax_projection
                    _, decoder_input = word_softmax_projection.max(1)  # no need for values, just indexes
                    decoder_input = decoder_input.unsqueeze(1)  # from batch_size to batch_size x 1
                    # print(decoder_input.size())  # batch_size x 1

                # remove me, for printing attention
                if counter == 1:
                    self.attention.should_print = False  # True
                    # print("\t Decoder step {}/{}".format(i, max_seq_len_y))
                else:
                    self.attention.should_print = False
                    self.attention.att_mat = []

                context = self.attention(encoder_output, decoder_output)
                # context is batch_size * encoder_hidden_dim
                decoder_output, decoder_hidden, word_softmax_projection = self.decoder.forward_step(
                    decoder_input, decoder_hidden, context)
                # first, reduce word_softmax_projection which is torch.Size([64, 1, 50004]) to 64 * 50004
                word_softmax_projection = word_softmax_projection.squeeze(1)  # eliminate dim 1

                # now, select target y
                # y looks like batch_size * max_seq_len_y : tensor([[ 2, 10890, 48108, ..., 0, 0, 0], ...
                target_y = y[:, i]  # select from y the ith column and shape as an array
                # target_y now looks like [ 10, 2323, 5739, 24, 9785 ... ] of size 64 (batch_size)
                # print(word_softmax_projection.size())
                # print(target_y.size())
                loss += self.criterion(word_softmax_projection, target_y)  # ignore index not set as we want 0 to count to error too

            # remove me, attention printing
            """if counter == 1:
                fig = plt.figure(figsize=(12, 10))
                sns.heatmap(self.attention.att_mat, cmap="gist_heat")
                plt.tight_layout()
                fig.savefig('img/__' + str(self.epoch) + '.png')
                plt.clf()
            """

            total_loss += loss.data.item() / batch_size
            loss.backward()  # calculate the loss and perform backprop

            # `clip_grad_norm` helps prevent the exploding gradient problem in RNNs / LSTMs.
            nn.utils.clip_grad_norm_(self.encoder.parameters(), self.gradient_clip)
            nn.utils.clip_grad_norm_(self.decoder.parameters(), self.gradient_clip)
            nn.utils.clip_grad_norm_(self.attention.parameters(), self.gradient_clip)
            self.optimizer.step()
            # end batch
        # end current epoch

        pbar.update(text="Epoch {:d}, train done, average loss \033[93m{:.6f}\033[0m".format(self.epoch, total_loss))
        self.log.var("Loss|Train loss|Validation loss", self.epoch, total_loss, y_index=0)
        self.log.draw()
        return total_loss

    def run(self, data_loader, batch_size, beam_size=3):
        # data is either a list of lists or a dataset_loader
        self.encoder.eval()
        self.decoder.eval()
        self.attention.eval()
        pbar = ProgressBar()
        pbar.set(total_steps=len(data_loader))
        total_loss = 0.
        with torch.no_grad():
            for counter, (x, y) in enumerate(data_loader):
                pbar.update(progress=counter,
                            text="Epoch {:d}, progress {}/{}, eval average loss \033[93m{:.6f}\033[0m ... ".format(
                                self.epoch, counter, len(data_loader), total_loss / (counter + 1)))
                if x.size(0) != batch_size:
                    print("\t Incomplete batch, skipping.")
                    continue
                if self.train_on_gpu:
                    x, y = x.cuda(), y.cuda()
                x = x[0:1, :]
                y = y[0:1, :]
                results, scores, loss = self._run_instance(x, y, beam_size)
        pbar.update(text="Epoch {:d}, eval done, average loss \033[93m{:.6f}\033[0m".format(
            self.epoch, total_loss / len(data_loader)))
        return total_loss / len(data_loader)

    def _run_instance(self, x, y, beam_size):
        from layers import Beam

        max_seq_len_x = x.size(1)
        max_seq_len_y = y.size(1)
        loss = 0

        # encoder
        encoder_hidden = self.encoder.init_hidden(batch_size=1)
        encoder_output, encoder_hidden = self.encoder(x, encoder_hidden)

        # decoder hidden init
        (d_hid, d_cell) = self.decoder.init_hidden(batch_size=beam_size)
        # split into hidden and cell states, and format into torch.Size([2, 1, 64, 512])
        # d_a = decoder_hidden[0].view(self.decoder_n_layers, 1, beam_size, self.decoder_hidden_dim)
        # d_b = decoder_hidden[1].view(self.decoder_n_layers, 1, beam_size, self.decoder_hidden_dim)

        # init decoders (beam_size)
        beams = []
        for i in range(beam_size):
            b = Beam()
            # print(d_hid.size())  # torch.Size([1, 3, 256]): 1 layer, 3 batch_size, 256 hidden
            b.current_decoder_hidden = (d_hid[:, i:i + 1, :], d_cell[:, i:i + 1, :])
            b.sequence = [3]  # set to BOS, which is 2; 3 is for the dummy loader
            beams.append(b)
            if i != 0:  # force that in the first step all results come from the first beam
                b.score = -10000

        # word_softmax_projection = torch.zeros(1, 5, dtype=torch.float, device=self.device)
        # word_softmax_projection[:, 2] = 1.
        # beginning of sentence value is 2, set it #XXX

        # prepare decoder for initial attention computation
        # decoder_output = decoder_hidden[0].view(self.decoder_n_layers, 1, beam_size, self.decoder_hidden_dim)  # torch.Size([2, 1, 64, 512])
        decoder_output = d_hid.view(self.decoder_n_layers, 1, beam_size,
                                    self.decoder_hidden_dim)  # torch.Size([2, 1, 64, 512])
        decoder_output = decoder_output[-1].permute(1, 0, 2)

        loss = 0
        total_loss = 0
        example_array = []

        for i in range(max_seq_len_y):
            print("\n\n\t Decoder step {}/{}".format(i, max_seq_len_y))

            # for the decoder we need decoder_input, decoder_hidden and context
            # start with decoder_input: it is a batch_size * 1 tensor containing 1 word index (the previous one)
            decoder_input_list = []
            for j in range(beam_size):
                decoder_input_list.append([beams[j].sequence[-1]])  # select last word for each beam
            decoder_input = torch.LongTensor(decoder_input_list, device=self.device)

            # compose decoder_hidden
            # final hidden should be a tuple of (torch.Size([1, 3, 256]), torch.Size([1, 3, 256])),
            # meaning layers, beam_size, hidden_size
            d_hid, d_cell = beams[0].current_decoder_hidden[0], beams[0].current_decoder_hidden[1]
            # print(d_hid.size())  # this should be [1, 1, 256]
            for j in range(1, beam_size):
                # now, vertically stack the others so we get to [1, beam_size, 256] incrementally
                d_hid = torch.cat((d_hid, beams[j].current_decoder_hidden[0]), dim=1)
                d_cell = torch.cat((d_cell, beams[j].current_decoder_hidden[1]), dim=1)
            # print(d_hid.size())
            decoder_hidden = (d_hid, d_cell)

            # calculate context for each
            context = self.attention(encoder_output, decoder_output)

            # _, decoder_input = word_softmax_projection.max(1)  # no need for values, just indexes
            # decoder_input = decoder_input.unsqueeze(1)
            decoder_output, decoder_hidden, word_softmax_projection = self.decoder.forward_step(
                decoder_input, decoder_hidden, context)
            word_softmax_projection = word_softmax_projection.squeeze(1)  # eliminate dim 1
            # print(word_softmax_projection.size())  # size beam_size x vocab_size

            # check for stopping condition
            stopped_count = 0
            beam_scores = []
            for j in range(beam_size):
                _, mi = word_softmax_projection[j].max(0)
                if mi == 0:  # PAD token, meaning this beam has finished
                    stopped_count += 1
                    beam_scores.append([-10000])  # ensure no score gets selected from this beam
                else:
                    beam_scores.append([beams[j].normalized_score()])
            if stopped_count == beam_size:
                print("Reached all beams predicted zero - early condition.")
                break

            # print(word_softmax_projection)
            word_softmax_projection = F.softmax(word_softmax_projection, dim=1)
            word_softmax_projection = torch.log(word_softmax_projection)  # logarithm of softmax scores
            beam_scores = torch.FloatTensor(beam_scores, device=self.device)  # size beam_size x 1
            word_softmax_projection = word_softmax_projection + beam_scores  # add logarithms

            # now, select top scoring values
            flattened_projection = word_softmax_projection.view(beam_size * self.vocab_size)
            max_scores, max_indices = torch.topk(flattened_projection, k=beam_size)
            max_scores = max_scores.cpu().numpy()
            max_indices = max_indices.cpu().numpy()

            # identify to which beam each one belongs, and recreate beams
            new_beams = []
            for (score, index) in zip(max_scores, max_indices):
                belongs_to_beam = int(index / self.vocab_size)
                vocab_index = index % self.vocab_size
                print("Score {}, index {}, belongs to beam {}, vocab_index {}".format(
                    score, index, belongs_to_beam, vocab_index))
                b = Beam()
                b.current_decoder_hidden = (
                    decoder_hidden[0][:, belongs_to_beam:belongs_to_beam + 1, :],
                    decoder_hidden[1][:, belongs_to_beam:belongs_to_beam + 1, :])
                b.sequence = beams[belongs_to_beam].sequence + [vocab_index]
                b.score = score
                new_beams.append(b)
            beams = new_beams

            print(y.cpu().numpy()[0])
            for b in beams:
                print(str(b.sequence) + " " + str(b.normalized_score()))

            # if print_example:
            #     _, mi = word_softmax_projection[0].max(0)
            #     example_array.append(mi.item())

            # target_y = y[:, i]  # select from y the ith column and shape as an array
            # loss += self.criterion(word_softmax_projection, target_y)
            # total_loss += loss.data.item()

        sequences = [b.sequence for b in beams]
        scores = [b.normalized_score for b in beams]
        return sequences, scores, total_loss

    def _run_batch_not_working(self, x, y, beam_size):
        batch_size = x.size(0)
        max_seq_len_x = x.size(1)
        max_seq_len_y = y.size(1)
        loss = 0

        encoder_hidden = self.encoder.init_hidden(batch_size)
        decoder_hidden = self.decoder.init_hidden(batch_size)
        encoder_output, encoder_hidden = self.encoder(x, encoder_hidden)

        word_softmax_projection = torch.zeros(batch_size, 5, dtype=torch.float, device=self.device)
        word_softmax_projection[:, 2] = 1.  # beginning of sentence value is 2, set it #XXX

        decoder_output = decoder_hidden[0].view(self.decoder_n_layers, 1, batch_size,
                                                self.decoder_hidden_dim)  # torch.Size([2, 1, 64, 512])
        decoder_output = decoder_output[-1].permute(1, 0, 2)

        loss = 0
        total_loss = 0
        print_example = True
        example_array = []

        for i in range(max_seq_len_y):
            # print("\t Decoder step {}/{}".format(i, max_seq_len_y))
            _, decoder_input = word_softmax_projection.max(1)  # no need for values, just indexes
            decoder_input = decoder_input.unsqueeze(1)
            context = self.attention(encoder_output, decoder_output)
            decoder_output, decoder_hidden, word_softmax_projection = self.decoder.forward_step(
                decoder_input, decoder_hidden, context)
            word_softmax_projection = word_softmax_projection.squeeze(1)  # eliminate dim 1
            if print_example:
                _, mi = word_softmax_projection[0].max(0)
                example_array.append(mi.item())
            target_y = y[:, i]  # select from y the ith column and shape as an array
            loss += self.criterion(word_softmax_projection, target_y)
            total_loss += loss.data.item()
        return [], total_loss

    def _eval(self, valid_loader):
        self.encoder.eval()
        self.decoder.eval()
        self.attention.eval()
        pbar = ProgressBar()
        pbar.set(total_steps=len(valid_loader))
        counter = 0
        total_loss = 0.
        with torch.no_grad():
            for counter, (x, y) in enumerate(valid_loader):
                # if counter > 5:
                #     break
                pbar.update(progress=counter,
                            text="Epoch {:d}, progress {}/{}, eval average loss \033[93m{:.6f}\033[0m ... ".format(
                                self.epoch, counter, len(valid_loader), total_loss / (counter + 1)))
                batch_size = x.size(0)
                max_seq_len_x = x.size(1)
                max_seq_len_y = y.size(1)
                loss = 0
                if self.train_on_gpu:
                    x, y = x.cuda(), y.cuda()

                encoder_hidden = self.encoder.init_hidden(batch_size)
                decoder_hidden = self.decoder.init_hidden(batch_size)
                encoder_output, encoder_hidden = self.encoder(x, encoder_hidden)

                word_softmax_projection = torch.zeros(batch_size, 5, dtype=torch.float, device=self.device)
                word_softmax_projection[:, 2] = 1.
                # beginning of sentence value is 2, set it #XXX
                decoder_output = decoder_hidden[0].view(self.decoder_n_layers, 1, batch_size,
                                                        self.decoder_hidden_dim)  # torch.Size([2, 1, 64, 512])
                decoder_output = decoder_output[-1].permute(1, 0, 2)
                loss = 0
                print_example = True
                example_array = []

                for i in range(max_seq_len_y):
                    # print("\t Decoder step {}/{}".format(i, max_seq_len_y))
                    _, decoder_input = word_softmax_projection.max(1)  # no need for values, just indexes
                    decoder_input = decoder_input.unsqueeze(1)
                    context = self.attention(encoder_output, decoder_output)
                    decoder_output, decoder_hidden, word_softmax_projection = self.decoder.forward_step(
                        decoder_input, decoder_hidden, context)
                    word_softmax_projection = word_softmax_projection.squeeze(1)  # eliminate dim 1
                    if print_example:
                        _, mi = word_softmax_projection[0].max(0)
                        example_array.append(mi.item())
                    target_y = y[:, i]  # select from y the ith column and shape as an array
                    loss += self.criterion(word_softmax_projection, target_y)

                total_loss += loss.data.item() / batch_size
                # print("\t\t\t Eval Loss: {}".format(loss.data.item()))

                if print_example:
                    print_example = False
                    print()
                    print("\n\n----- X:")
                    print(" ".join([self.src_i2w[str(wi.data.item())] for wi in x[0]]))
                    print("----- Y:")
                    print(" ".join([self.tgt_i2w[str(wi.data.item())] for wi in y[0]]))
                    print("----- OUR PREDICTION:")
                    print(" ".join([self.tgt_i2w[str(wi)] for wi in example_array]))
                    print()
                    print(" ".join([str(wi.data.item()) for wi in y[0]]))
                    print(" ".join([str(wi) for wi in example_array]))
                    print()

        self.log.var("Loss|Train loss|Validation loss", self.epoch, total_loss, y_index=1)
        self.log.draw()
        pbar.update(text="Epoch {:d}, eval done, average loss \033[93m{:.6f}\033[0m".format(
            self.epoch, total_loss / len(valid_loader)))
        return total_loss / len(valid_loader)

    def old_run(self, input, max_output_len=1000):
        # input is a list of lists of integers (variable len)
        self.encoder.eval()
        self.decoder.eval()
        self.attention.eval()

        batch_size = len(input)
        encoder_hidden = self.encoder.init_hidden(batch_size)
        decoder_hidden = self.decoder.init_hidden(batch_size)

        bordered_input = [[2] + inst + [3] for inst in input]  # put start and end of sentence markers for each instance
        max_len = max(len(inst) for inst in bordered_input)  # determines max size for all examples
        # input is now a max_len object padded with zeroes to the right (for all instances)
        input = np.array([inst + [0] * (max_len - len(inst)) for inst in bordered_input])

        with torch.no_grad():
            # numpy to tensor
            x = torch.LongTensor(input)
            if self.train_on_gpu:
                x = x.cuda()
            encoder_output, encoder_hidden = self.encoder(x, encoder_hidden)
            word_softmax_projection = torch.zeros(batch_size, 5, dtype=torch.float, device=self.device)
            word_softmax_projection[:, 2] = 1.
            # beginning of sentence value is 2, set it #XXX remember to put 2 instead of 3 for non-dummy
            decoder_output = decoder_hidden[0].view(self.decoder_n_layers, 1, batch_size, self.decoder_hidden_dim)
            decoder_output = decoder_output[-1].permute(1, 0, 2)

            output = [[] for _ in range(batch_size)]
            for i in range(max_output_len):
                _, decoder_input = word_softmax_projection.max(1)  # no need for values, just indexes
                decoder_input = decoder_input.unsqueeze(1)
                context = self.attention(encoder_output, decoder_output)
                decoder_output, decoder_hidden, word_softmax_projection = self.decoder.forward_step(
                    decoder_input, decoder_hidden, context)
                word_softmax_projection = word_softmax_projection.squeeze(1)  # eliminate dim 1

                zero_count = 0
                for j in range(batch_size):
                    _, mi = word_softmax_projection[j].max(0)
                    output[j].append(mi.cpu().item())
                    if mi.item() == 0:
                        zero_count += 1

                # check ending condition (all zeroes)
                if zero_count == batch_size:
                    break

        # transform back to numpy (and move back to CPU just in case it was on GPU)
        # output = output.numpy()

        # clean each array
        clean_output = []
        for instance in output:
            clean_instance = []
            for element in instance:
                if element > 3:
                    clean_instance.append(element)
            clean_output.append(clean_instance)

        return clean_output

    def load_checkpoint(self, filename):
        """if latest:  # filename is a folder
            import glob
            files = glob.glob(os.path.join(filename, "*.ckp"))
            if files == None:
                raise Exception("Load checkpoint failed with latest=True. Returned list of files in folder [{}] is None".format(filename))
            filename = sorted(files)[-1]
            print("Loading latest model {} ...".format(filename))
        """
        filename = os.path.join(self.model_store_path, "model." + filename + ".ckp")
        print("Loading model {} ...".format(filename))
        checkpoint = torch.load(filename)
        self.encoder.load_state_dict(checkpoint["encoder_state_dict"])
        self.decoder.load_state_dict(checkpoint["decoder_state_dict"])
        self.attention.load_state_dict(checkpoint["attention_state_dict"])
        # self.optimizer.load_state_dict(checkpoint["optimizer_state_dict"])
        self.src_w2i = checkpoint["src_w2i"]
        self.src_i2w = checkpoint["src_i2w"]
        self.tgt_w2i = checkpoint["tgt_w2i"]
        self.tgt_i2w = checkpoint["tgt_i2w"]
        self.teacher_forcing_ratio = checkpoint["teacher_forcing_ratio"]
        self.epoch = checkpoint["epoch"]
        self.gradient_clip = checkpoint["gradient_clip"]

        self.encoder.to(self.device)
        self.decoder.to(self.device)
        self.attention.to(self.device)
        # self.optimizer.to(self.device)  # careful to continue training on the same device!
        self.optimizer = torch.optim.Adam(list(self.encoder.parameters()) +
                                          list(self.decoder.parameters()) +
                                          list(self.attention.parameters()), lr=self.lr)
        self.optimizer.load_state_dict(checkpoint['optimizer_state_dict'])
        for state in self.optimizer.state.values():
            for k, v in state.items():
                if isinstance(v, torch.Tensor):
                    state[k] = v.cuda()

    def save_checkpoint(self, filename):
        filename = os.path.join(self.model_store_path, "model." + filename + ".ckp")
        checkpoint = {}
        checkpoint["encoder_state_dict"] = self.encoder.state_dict()
        checkpoint["decoder_state_dict"] = self.decoder.state_dict()
        checkpoint["attention_state_dict"] = self.attention.state_dict()
        checkpoint["optimizer_state_dict"] = self.optimizer.state_dict()
        checkpoint["src_w2i"] = self.src_w2i
        checkpoint["src_i2w"] = self.src_i2w
        checkpoint["tgt_w2i"] = self.tgt_w2i
        checkpoint["tgt_i2w"] = self.tgt_i2w
        checkpoint["teacher_forcing_ratio"] = self.teacher_forcing_ratio
        checkpoint["epoch"] = self.epoch
        checkpoint["gradient_clip"] = self.gradient_clip
        torch.save(checkpoint, filename)
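A hedged usage sketch for LSTMEncoderDecoderAtt above; the vocabularies (src_w2i, src_i2w, tgt_w2i, tgt_i2w) and the DataLoaders are assumed to come from the project's own preprocessing, and every dimension below is a placeholder.

# Hypothetical driver code; the vocabulary dicts and loaders are assumed to already exist.
model = LSTMEncoderDecoderAtt(src_w2i, src_i2w, tgt_w2i, tgt_i2w,
                              embedding_dim=300,
                              encoder_hidden_dim=256, decoder_hidden_dim=256,
                              encoder_n_layers=2, decoder_n_layers=2,
                              lr=0.01, model_store_path="./checkpoints")
# train() here is the class's own loop (it overrides nn.Module.train with a different signature)
model.train(train_loader, valid_loader, test_loader, batch_size=64, patience=10)
model.load_checkpoint("best")
avg_eval_loss = model.run(test_loader, batch_size=64, beam_size=3)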
layer0 = BidirectionalEncoderSigmoid(representationsize, rnnH)
layer0representations = layer0.apply(layer0_input, layer0_mask)
layer0outputsize = 2 * rnnH

if combinationMethod != "onlyAtt":
    # take last hidden state as sentence representation
    layer0output = layer0representations[ii, jj, :]
    layer0flattened = layer0output.flatten(2).reshape((batch_size_var, layer0outputsize))

if "internalOnH" in attentionMethod:
    layer1input = layer0representations.dimshuffle(1, 2, 0)
    layer1 = AttentionLayer(rng,
                            thisInput=layer1input,
                            batchsize=batch_size_var,
                            dim1=layer0outputsize,
                            dim2=contextsize,
                            method=attentionMethod,
                            k=kattention)
    layer1outputsize = 2 * rnnH
elif "internalOnW" in attentionMethod:
    layer1input = T.tanh(x2)
    layer1 = AttentionLayer(rng,
                            thisInput=layer1input,
                            batchsize=batch_size_var,
                            dim1=representationsize,
                            dim2=contextsize,
                            method=attentionMethod,
                            k=kattention)
    layer1outputsize = representationsize
elif "externalOnH" in attentionMethod:
ii = length2 - 1
jj = T.arange(batch_size_var)
y = y.reshape((batch_size_var, ))

layer0 = BidirectionalEncoderSigmoid(representationsize, rnnH)
layer0representations = layer0.apply(layer0_input, layer0_mask)
layer0outputsize = 2 * rnnH

if combinationMethod != "onlyAtt":
    # take last hidden state as sentence representation
    layer0output = layer0representations[ii, jj, :]
    layer0flattened = layer0output.flatten(2).reshape((batch_size_var, layer0outputsize))

if "internalOnH" in attentionMethod:
    layer1input = layer0representations.dimshuffle(1, 2, 0)
    layer1 = AttentionLayer(rng, thisInput=layer1input, batchsize=batch_size_var,
                            dim1=layer0outputsize, dim2=contextsize,
                            method=attentionMethod, k=kattention)
    layer1outputsize = 2 * rnnH
elif "internalOnW" in attentionMethod:
    layer1input = T.tanh(x2)
    layer1 = AttentionLayer(rng, thisInput=layer1input, batchsize=batch_size_var,
                            dim1=representationsize, dim2=contextsize,
                            method=attentionMethod, k=kattention)
    layer1outputsize = representationsize
else:
    print "ERROR: unknown attentionMethod - skipping attention"
    combinationMethod = "noAtt"

if "Kmax" in attentionMethod and "Sequence" in attentionMethod:
    layer1outputsize = layer1outputsize * kattention
layer1flattened = layer1.output.flatten(2).reshape((batch_size_var, layer1outputsize))

if combinationMethod == "onlyAtt":
layer0 = LeNetConvPoolLayer(rng,
                            W=convW,
                            b=convB,
                            input=layer0_input,
                            filter_shape=filter_shape,
                            poolsize=poolsize)
layer0flattened = layer0.output.flatten(2).reshape((batch_size_var, nkerns[0] * sizeAfterPooling))
layer0outputsize = nkerns[0] * sizeAfterPooling

if "internalOnH" in attentionMethod:
    layer1 = AttentionLayer(rng,
                            thisInput=layer0.conv_out_tanh,
                            batchsize=batch_size_var,
                            dim1=nkerns[0],
                            dim2=sizeAfterConv,
                            method=attentionMethod,
                            k=kattention)
    layer1outputsize = nkerns[0]
elif "internalOnW" in attentionMethod:
    layer1 = AttentionLayer(rng,
                            thisInput=x.reshape((batch_size_var, ishape[0], ishape[1])),
                            batchsize=batch_size_var,
                            dim1=ishape[0],
                            dim2=ishape[1],
                            method=attentionMethod,
                            k=kattention)
    layer1outputsize = ishape[0]
elif "externalOnH" in attentionMethod:
def __init__(self, enc_in, dec_in, c_out, out_len,
             factor=5, d_model=512, n_heads=8, e_layers=3, d_layers=2, d_ff=512,
             group_factors=None, group_operator='avg', group_step=1,
             dropout=0.0, attn='prob', embed='fixed', activation='gelu',
             has_minute=False, has_hour=True):
    super(HLInformer, self).__init__()
    self.pred_len = out_len
    self.attn = attn

    if group_factors is None:
        group_factors = [4, 1]
    else:
        group_factors = [*group_factors, 1]
    self.group_factors = group_factors

    # Grouping
    self.group_layers = nn.ModuleList(
        [GroupLayer(gf, group_operator, group_step) for gf in group_factors])

    # Encoding
    self.enc_embeddings = nn.ModuleList(
        [InformerDataEmbedding(enc_in, d_model, has_minute=has_minute, has_hour=has_hour)
         for _ in group_factors])
    self.dec_embeddings = nn.ModuleList(
        [InformerDataEmbedding(dec_in, d_model, has_minute=has_minute, has_hour=has_hour)
         for _ in group_factors])

    # Attention
    Attn = ProbAttention if attn == 'prob' else FullAttention

    # Encoder
    self.encoders = nn.ModuleList([Encoder(
        [
            EncoderLayer(
                AttentionLayer(Attn(False, factor, attention_dropout=dropout), d_model, n_heads),
                d_model,
                d_ff,
                dropout=dropout,
                activation=activation
            ) for l in range(e_layers)
        ],
        [
            ConvLayer(d_model) for l in range(e_layers - 1)
        ],
        norm_layer=torch.nn.LayerNorm(d_model)
    ) for _ in group_factors])

    # Decoder
    self.decoders = nn.ModuleList([Decoder(
        [
            DecoderLayer(
                AttentionLayer(FullAttention(True, factor, attention_dropout=dropout), d_model, n_heads),
                AttentionLayer(FullAttention(False, factor, attention_dropout=dropout), d_model, n_heads),
                d_model,
                d_ff,
                dropout=dropout,
                activation=activation,
            ) for l in range(d_layers)
        ],
        norm_layer=torch.nn.LayerNorm(d_model)
    ) for _ in group_factors])

    # self.end_conv1 = nn.Conv1d(in_channels=label_len + out_len, out_channels=out_len, kernel_size=1, bias=True)
    # self.end_conv2 = nn.Conv1d(in_channels=d_model, out_channels=c_out, kernel_size=1, bias=True)

    self.projections = nn.ModuleList(
        [nn.Linear(d_model * (i + 1), c_out, bias=True) for i, gf in enumerate(group_factors)])
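A hypothetical instantiation of HLInformer based only on the constructor signature above; all dimensions are placeholders, and group_factors=[4] is just an example (the constructor appends the final factor of 1 itself).

# Sketch: an HLInformer for a 7-variable series predicting 24 steps ahead (illustrative values).
model = HLInformer(enc_in=7, dec_in=7, c_out=7, out_len=24,
                   factor=5, d_model=512, n_heads=8, e_layers=3, d_layers=2, d_ff=512,
                   group_factors=[4], group_operator='avg',
                   dropout=0.05, attn='prob', activation='gelu')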