Example #1
def build_hybrid_model(emb_size=EMBEDDING_SIZE):
    numerical_input = Input(shape=(numerical_timestep, attribute_num))
    textual_input = Input(shape=(DATE_INTERVAL_NEWS, MAX_NEWS_NUM, emb_size))

    x1 = textual_input
    x1 = TimeDistributed(Masking(mask_value=0.))(x1)
    x1 = TimeDistributed(AttentionLayer())(x1)
    # X1 = TimeDistributed(Dropout(0.2, seed=35))(x1)
    x1 = TimeDistributed(Dense(100, activation='relu'))(x1)
    x1 = Bidirectional(GRU(50, return_sequences=True))(x1)
    x1 = AttentionLayer()(x1)
    # x1 = Dropout(0.2, seed=71)(x1)
    x1 = Dense(10, activation='relu')(x1)

    x2 = numerical_input
    x2 = GRU(100, return_sequences=True)(x2)
    x2 = Dropout(0.2, seed=2)(x2)
    x2 = GRU(100)(x2)
    x2 = Dropout(0.2, seed=7)(x2)
    x2 = Dense(10, activation='relu')(x2)

    x = concatenate([x1, x2])
    x = Dense(2, activation='softmax')(x)
    model = Model(inputs=[textual_input, numerical_input], outputs=x)
    model.compile(loss='categorical_crossentropy', optimizer='adam', metrics=['accuracy'])
    return model
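
The function above relies on Keras layer imports, a custom AttentionLayer, and several module-level constants that the snippet does not show. Below is a minimal sketch of that missing context, assuming a tf.keras setup; every concrete value is an illustrative assumption, not taken from the source.

# Hypothetical surrounding context for build_hybrid_model -- values are assumptions.
from tensorflow.keras.layers import (Input, Dense, Dropout, GRU, Bidirectional,
                                     Masking, TimeDistributed, concatenate)
from tensorflow.keras.models import Model
# AttentionLayer is a project-specific custom layer, assumed to be defined elsewhere.

EMBEDDING_SIZE = 300        # dimension of each news embedding (assumed)
DATE_INTERVAL_NEWS = 5      # days of news per sample (assumed)
MAX_NEWS_NUM = 20           # maximum news items per day (assumed)
numerical_timestep = 30     # length of the numerical input window (assumed)
attribute_num = 5           # numerical attributes per timestep (assumed)

model = build_hybrid_model()
model.summary()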
Example #2
    def __init__(self, vocab_size=None, word_embed_dim=200, sent_len=None,
                 doc_len=None, n_classes=None, gru_dim=50,
                 pretrained_word_vectors=None, batch_size=64, verbose=2):
        
        # Constants
        self.batch_size = batch_size
        self.optimizer = 'adam'
        self.metrics = ['accuracy']

        # Parameters
        ## Word Embedding
        if pretrained_word_vectors is not None:
            if not isinstance(pretrained_word_vectors, list):
                pretrained_word_vectors = [pretrained_word_vectors]
            vocab_size = pretrained_word_vectors[0].shape[0]
            word_embed_dim = pretrained_word_vectors[0].shape[1]
        self.vocab_size = vocab_size
        self.word_embed_dim = word_embed_dim
        self.sent_len = sent_len
        self.verbose = verbose
        ## Word-Level BiGRU
        self.gru_dim = gru_dim
        self.sentence_encoder_input_shape = (self.sent_len,)

        self.doc_len = doc_len
        self.han_input_shape = (self.doc_len, self.sent_len,)
        ## Output Layer
        if not isinstance(n_classes, int) or n_classes < 1:
            raise ValueError("`n_classes` must be a positive integer.")
        if n_classes == 1:
            self.output_activation = 'sigmoid'
            self.loss = 'binary_crossentropy'
        else:
            self.output_activation = 'softmax'
            self.loss = 'categorical_crossentropy'


        self.word_embedding_layer = Embedding(self.vocab_size, self.word_embed_dim,
                                              input_length=self.sent_len,
                                              mask_zero=True,
                                              name='word_embeddings',
                                              weights=pretrained_word_vectors)
        self.word_bi_gru_layer = Bidirectional(GRU(self.gru_dim, return_sequences=True), name='word_bi_gru')
        self.word_attention_layer = AttentionLayer(name='word_attention')
        self.sentence_bi_gru_layer = Bidirectional(GRU(self.gru_dim, return_sequences=True), name='sentence_bi_gru')
        self.sentence_attention_layer = AttentionLayer(name='sentence_attention')
        self.sentence_weighted_average_layer = WeightedAverage(name='document_embedding')
        self.output_layer = Dense(n_classes, activation=self.output_activation, name='document_output')
        self.td_word_embedding_layer = TimeDistributedWithMasking(self.word_embedding_layer, name='td_word_embeddings',
                                                                  weights=pretrained_word_vectors)
        self.td_word_bi_gru_layer = TimeDistributedWithMasking(self.word_bi_gru_layer, name='td_word_bi_gru')
        self.td_word_attention_layer = TimeDistributedWithMasking(self.word_attention_layer, name='td_word_attention')
        self.td_word_weighted_average_layer = WeightedAverage(name='sentence_vectors')

        # Models
        self._td_word_attention = None
        self._sentence_attention = None
        self.han = None
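Example #3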
def get_discriminator(config):
    df_dim = config['df_dim']
    img = Input(shape=(config['img_size'], config['img_size'], 3),
                batch_size=config['batch_size'],
                name='image')
    condition_label = Input(shape=(),
                            batch_size=config['batch_size'],
                            dtype=tf.int32,
                            name='condition_label')
    x = img

    # to handle different sizes of images.
    power = np.log2(config['img_size'] / 4).astype('int')  # 64->4; 128->5
    for p in range(power):
        x = Block(x, df_dim * 2**p)
        if config['use_attention'] and int(x.shape[1]) in config['attn_dim_G']:
            x = AttentionLayer()(x)

    if config['use_label']:
        x = tf.reduce_sum(x, axis=[1, 2])
        outputs = layers.Dense(1)(x)
        # embedding = layers.Embedding(config['num_classes'], df_dim * 2 ** (power-1))
        # label_feature = SpectralNormalization(embedding)(condition_label)
        label_feature = layers.Embedding(config['num_classes'], df_dim *
                                         2**(power - 1))(condition_label)
        outputs += tf.reduce_sum(x * label_feature, axis=1, keepdims=True)
        return Model(inputs=[img, condition_label], outputs=outputs)
    else:
        outputs = layers.Conv2D(1, 4, 1, padding='same')(x)
        return Model(inputs=[img, condition_label], outputs=outputs)
Example #4
def get_generator(config):
    gf_dim = config['gf_dim']
    z = Input(shape=(config['z_dim'], ),
              batch_size=config['batch_size'],
              name='noisy')
    condition_label = Input(shape=(),
                            batch_size=config['batch_size'],
                            dtype=tf.int32,
                            name='condition_label')

    if config['use_label']:
        one_hot_label = tf.one_hot(condition_label,
                                   depth=config['num_classes'])
        x = layers.Concatenate()([z, one_hot_label])
    else:
        x = z

    x = SpectralNormalization(layers.Dense(4 * 4 * gf_dim * 16))(x)
    x = tf.reshape(x, [-1, 4, 4, gf_dim * 16])

    # to handle different sizes of images.
    power = np.log2(config['img_size'] / 4).astype('int')  # 64->4; 128->5

    for p in reversed(range(power)):
        x = Block(x, gf_dim * (2**p))
        if config['use_attention'] and int(x.shape[1]) in config['attn_dim_G']:
            x = AttentionLayer()(x)

    outputs = layers.Conv2D(3,
                            4,
                            1,
                            padding='same',
                            use_bias=False,
                            activation='tanh')(x)
    return Model(inputs=[z, condition_label], outputs=outputs)
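
Both get_discriminator and get_generator above read their hyperparameters from a shared config dict. The sketch below lists only keys that the two functions actually reference; every value is an assumption chosen for illustration.

# Illustrative config -- keys come from the code above, values are assumptions.
config = {
    'img_size': 64,        # spatial size of the square input images
    'batch_size': 16,
    'z_dim': 128,          # length of the noise vector
    'gf_dim': 64,          # base channel width of the generator
    'df_dim': 64,          # base channel width of the discriminator
    'num_classes': 10,
    'use_label': True,     # conditional GAN with a projection-style discriminator
    'use_attention': True,
    'attn_dim_G': [32],    # feature-map sizes at which AttentionLayer is inserted
}

generator = get_generator(config)
discriminator = get_discriminator(config)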
Example #5
    def __init__(self,
                 vocab_size=300,
                 emb_dim=300,
                 maxlen=10,
                 n_aspects=10,
                 pretrained_embeddings=None,
                 aspect_matrix=None):
        super().__init__()
        self.vocab_size = vocab_size
        self.emb_dim = emb_dim
        self.maxlen = maxlen
        self.n_aspects = n_aspects
        self.aspect_matrix = torch.from_numpy(
            aspect_matrix).to(TORCH_DEVICE).requires_grad_(requires_grad=True)

        self.embedding = nn.Embedding.from_pretrained(
            pretrained_embeddings, freeze=True,
            padding_idx=0).to(TORCH_DEVICE).requires_grad_(requires_grad=True)
        #(voc_size, emb_dim)
        self.average_emb = AverageEmbedding()  #(maxlen, emb_dim)
        self.attention = AttentionLayer(emb_dim)
        self.weighted_emb = WeightedEmbeddings()
        self.linear = nn.Linear(emb_dim, n_aspects)
        self.weighted_aspects = WeightedAspects(self.aspect_matrix)
Example #6
    def test_RNNDecoderLayer(self):
        rnn_cell_output_dim = 3
        rnn_cell = GRU(output_dim=rnn_cell_output_dim, return_sequences=True)
        attention_context_dim = 2
        attention = AttentionLayer(attention_context_dim=attention_context_dim)

        embedding_dim = 4
        embedding_vac_size = 5
        embedding = Embedding(input_dim=embedding_vac_size,
                              output_dim=embedding_dim,
                              weights=[
                                  np.array([[0, 0, 0, 0], [1, 2, 3, 4],
                                            [5, 6, 7, 8], [9, 1, 3, 4],
                                            [8, 7, 4, 2]])
                              ])
        layer = RNNDecoderLayer(rnn_cell, attention, embedding)
        # test config: should use custom objects for custom layers
        custom_objects = {AttentionLayer.__name__: AttentionLayer}
        self.assertEqual(
            layer.get_config(),
            RNNDecoderLayer.from_config(layer.get_config(),
                                        custom_objects).get_config(), "config")

        x = Input((None, ), dtype='int32')
        context = Input((None, embedding_dim))
        outputs = layer([x, context])
        self.assertEqual(outputs._keras_shape,
                         (None, None, rnn_cell_output_dim), "_keras_shape")
        f = K.function(inputs=[x, context], outputs=[outputs])
        x_val = [[1, 1, 3, 4], [1, 2, 4, 0]]
        context_val = [[[0.1, 0.2, 0.3, 0.4], [0.3, 0.5, 0.7, 0.2]],
                       [[0.2, 0.1, 0.5, 0.6], [0.4, 0.3, 0.8, 0.1]]]
        output_val = f([x_val, context_val])[0]
        self.assertEqual(output_val.shape, (2, 4, rnn_cell_output_dim),
                         "output_val")
def get_res_discriminator(config):
    df_dim = config['df_dim']
    img = Input(shape=(config['img_size'], config['img_size'], 3),
                name='image')
    power = np.log2(config['img_size'] / 4).astype('int')
    condition_label = Input(shape=(), dtype=tf.int32, name='condition_label')

    x = Optimized_Block(img, df_dim * 1)  # 64x64
    for p in range(1, power):
        x = Res_Block(x, df_dim * 2**p)  # 32x32
        if config['use_attention'] and int(x.shape[1]) in config['attn_dim_G']:
            x = AttentionLayer()(x)

    x = Res_Block(x, df_dim * 2**power, downsample=False)  # 4x4

    if config['use_label']:
        x = layers.ReLU()(x)
        x = tf.reduce_sum(x, axis=[1, 2])
        outputs = SpectralNormalization(layers.Dense(1))(x)

        # embedding = layers.Embedding(config['num_classes'], df_dim * 16)
        # label_feature = SpectralNormalization(embedding)(condition_label)
        label_feature = layers.Embedding(config['num_classes'],
                                         df_dim * 16)(condition_label)

        outputs += tf.reduce_sum(x * label_feature, axis=1, keepdims=True)
        return Model(inputs=[img, condition_label], outputs=outputs)
    else:
        outputs = layers.Conv2D(1, 4, 1, padding='same')(x)
        # outputs = SpectralNormalization(conv)(x)
        return Model(inputs=[img, condition_label], outputs=outputs)
Example #8
def get_res_generator(config):
    gf_dim = config['gf_dim']
    z = Input(shape=(config['z_dim'], ), name='noisy')
    condition_label = Input(shape=(), dtype=tf.int32, name='condition_label')
    if config['use_label']:
        one_hot_label = tf.one_hot(condition_label, depth=config['num_classes'])
        x = layers.Concatenate()([z, one_hot_label])
    else:
        x = z

    # to handle different sizes of images; `power` must be known before the dense projection.
    power = np.log2(config['img_size'] / 4).astype('int')

    x = SpectralNormalization(layers.Dense(4 * 4 * gf_dim * 2**(power - 1)))(x)
    x = tf.reshape(x, [-1, 4, 4, gf_dim * 2**(power - 1)])

    for p in reversed(range(power)):
        x = Res_Block(x, gf_dim * 2**p)
        if config['use_attention'] and int(x.shape[1]) in config['attn_dim_G']:
            x = AttentionLayer()(x)

    # x = layers.BatchNormalization()(x)
    # x = layers.ReLU()(x)
    outputs = layers.Conv2D(3, 1, 1, padding='same', activation='tanh')(x)

    return Model(inputs=[z, condition_label], outputs=outputs)
Example #9
    def compute_states(self, inputs, lengths):

        bi_states, _ = self.run_rnn(inputs, lengths)

        fw_out, bw_out = bi_states
        rnn_outputs = tf.concat(
            [fw_out, bw_out], 2)  # [batch_size, num_steps, 2*size]

        atn_layer = AttentionLayer(in_dim=2 * self.hidden_size,
                                   dim=self.config.atn_hidden_size,
                                   num_steps=self.num_steps,
                                   name="Attention_Layer")

        hidden_vector = self.hidden_vector = atn_layer.get_output(
            fan_in=rnn_outputs, name="hidden_vector")

        return hidden_vector
Example #10
def build_textual_model(emb_size=EMBEDDING_SIZE):
    news_input = Input(shape=(DATE_INTERVAL_NEWS, MAX_NEWS_NUM, emb_size))

    x = news_input
    x = TimeDistributed(Masking(mask_value=0.))(x)
    x = TimeDistributed(AttentionLayer())(x)
    # x = TimeDistributed(Dropout(0.2, seed=35))(x)
    x = TimeDistributed(Dense(100, activation='relu'))(x)

    x = Bidirectional(GRU(50, return_sequences=True))(x)
    x = AttentionLayer()(x)
    # x = Dropout(0.2, seed=71)(x)
    x = Dense(10, activation='relu')(x)
    x = Dense(2, activation='softmax')(x)
    model = Model(inputs=news_input, outputs=x)
    model.compile(loss='categorical_crossentropy', optimizer='adam', metrics=['accuracy'])
    return model
Example #11
    def __init__(self, num_feature, hidden_dim, num_class, class_hidden,
                 adj, gcn_adj, input_dropout, dropout, weight_dropout=0):
        super().__init__()
        self.num_feature = num_feature
        self.hidden_dim = hidden_dim
        self.num_class = num_class
        self.adj = adj
        self.gcn_adj = gcn_adj

        self.m1 = AttentionLayer(num_feature, hidden_dim, num_class, class_hidden, input_dropout,
                                 weight_dropout=weight_dropout)
        self.m2 = AttentionLayer(hidden_dim, num_class, class_hidden, class_hidden, input_dropout,
                                 weight_dropout=weight_dropout)

        self.g1 = GraphConvolution(num_feature, hidden_dim, weight_dropout=weight_dropout)
        self.g2 = GraphConvolution(hidden_dim, num_class, weight_dropout=weight_dropout)

        self.input_dropout = input_dropout
        self.dropout = dropout
Example #12
 def __init__(self, src_w2i, src_i2w, tgt_w2i, tgt_i2w, embedding_dim, encoder_hidden_dim, decoder_hidden_dim, encoder_n_layers, decoder_n_layers, encoder_drop_prob=0.5, decoder_drop_prob=0.5, lr = 0.01, teacher_forcing_ratio=0.5, gradient_clip = 5, model_store_path = None):
     super(LSTMEncoderDecoderAtt, self).__init__()
     
     self.encoder_hidden_dim = encoder_hidden_dim
     self.decoder_hidden_dim = decoder_hidden_dim
     self.decoder_n_layers = decoder_n_layers
     self.teacher_forcing_ratio = teacher_forcing_ratio
     self.gradient_clip = gradient_clip
     
     self.encoder = SimpleLSTMEncoderLayer(len(src_w2i), embedding_dim, encoder_hidden_dim, encoder_n_layers, encoder_drop_prob)
     self.decoder = SimpleLSTMDecoderLayer(len(tgt_w2i), embedding_dim, encoder_hidden_dim*2, decoder_hidden_dim, decoder_n_layers, decoder_drop_prob)
     self.attention = AttentionLayer(encoder_hidden_dim*2, decoder_hidden_dim) # *2 because the encoder is bidirectional and thus its hidden size is doubled
     
     self.optimizer = torch.optim.Adam(list(self.encoder.parameters())+list(self.decoder.parameters())+list(self.attention.parameters()), lr=lr)        
     self.criterion = nn.CrossEntropyLoss(ignore_index=0)
     
     self.src_w2i = src_w2i        
     self.src_i2w = src_i2w
     self.tgt_w2i = tgt_w2i
     self.tgt_i2w = tgt_i2w
     self.epoch = 0
     self.lr = lr
     self.src_vocab_size = len(src_w2i)
     self.tgt_vocab_size = len(tgt_w2i)
     print("Source vocab size: {}".format(self.src_vocab_size))
     print("Target vocab size: {}".format(self.tgt_vocab_size))
     
     self.train_on_gpu=torch.cuda.is_available()        
     if(self.train_on_gpu):
         print('Training on GPU.')
     else:
         print('No GPU available, training on CPU.')
     self.device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
     
     if model_store_path is None:
         self.model_store_path = os.path.dirname(os.path.realpath(__file__))
     else:
         self.model_store_path = model_store_path
     if not os.path.exists(self.model_store_path):
         os.makedirs(self.model_store_path)
         
     self.log_path = os.path.join(self.model_store_path,"log")
     self.log = Log(self.log_path, clear=True)
Example #13
    def test_RNNDecoderLayerWithBeamSearch(self):
        rnn_cell_output_dim = 3
        rnn_cell = GRU(output_dim=rnn_cell_output_dim, return_sequences=True)
        attention_context_dim = 2
        attention = AttentionLayer(attention_context_dim=attention_context_dim)

        embedding_dim = 4
        embedding_vac_size = 5
        embedding = Embedding(input_dim=embedding_vac_size,
                              output_dim=embedding_dim)
        classifier_output_layer = Dense(output_dim=embedding_vac_size,
                                        activation='softmax')
        hidden_unit_numbers = [2, 3, 4]
        hidden_unit_activation_functions = ['relu', 'relu', 'relu']
        hidden_layers = []
        for hidden_unit_number, hidden_unit_activation_function in zip(
                hidden_unit_numbers, hidden_unit_activation_functions):
            layer = Dense(hidden_unit_number,
                          activation=hidden_unit_activation_function)
            hidden_layers.append(layer)

        mlp_classifier = MLPClassifierLayer(classifier_output_layer,
                                            hidden_layers)
        layer = RNNDecoderLayerWithBeamSearch(mlp_classifier=mlp_classifier,
                                              max_output_length=2,
                                              beam_size=3,
                                              rnn_cell=rnn_cell,
                                              attention=attention,
                                              embedding=embedding)
        # test config: should use custom objects for custom layers
        custom_objects = {
            AttentionLayer.__name__: AttentionLayer,
            MLPClassifierLayer.__name__: MLPClassifierLayer
        }
        self.assertEqual(
            layer.get_config(),
            RNNDecoderLayerWithBeamSearch.from_config(
                layer.get_config(), custom_objects).get_config(), "config")
        initial_input = Input((1, ), dtype='int32')
        context = Input((None, embedding_dim))
        outputs = layer([initial_input, context])
        f = K.function(inputs=[initial_input, context], outputs=outputs)
        initial_input_val = [[0], [0]]  # two samples
        context_val = [[[0.1, 0.2, 0.3, 0.4], [0.3, 0.5, 0.7, 0.2]],
                       [[0.2, 0.1, 0.5, 0.6], [0.4, 0.3, 0.8, 0.1]]]
        outputs_val = f([initial_input_val, context_val])
        self.assertEqual(outputs_val[0].shape,
                         (layer.max_output_length, 2, layer.beam_size),
                         "output_label_id")
        self.assertEqual(outputs_val[1].shape,
                         (layer.max_output_length, 2, layer.beam_size),
                         "prev_output_index")
        self.assertEqual(outputs_val[2].shape,
                         (layer.max_output_length, 2, layer.beam_size),
                         "output_score")
Example #14
    def test_RNNDecoderLayerBase(self):
        rnn_cell_output_dim = 3
        rnn_cell = GRU(output_dim=rnn_cell_output_dim, return_sequences=True)
        attention_context_dim = 2
        attention = AttentionLayer(attention_context_dim=attention_context_dim)

        embedding_dim = 4
        embedding_vac_size = 5
        embedding = Embedding(input_dim=embedding_vac_size,
                              output_dim=embedding_dim)
        layer = RNNDecoderLayerBase(rnn_cell, attention, embedding)
        # test config: should use custom objects for custom layers
        custom_objects = {AttentionLayer.__name__: AttentionLayer}
        self.assertEqual(
            layer.get_config(),
            RNNDecoderLayerBase.from_config(layer.get_config(),
                                            custom_objects).get_config(),
            "config")
        # test step: before calling step,build the layer first
        input_x_shape = (None, None)
        context_shape = (None, None, embedding_dim)
        layer.build(input_shapes=[input_x_shape, context_shape])

        x_step = K.placeholder((None, embedding_dim))
        context = K.placeholder((None, None, embedding_dim))
        state = K.placeholder((None, rnn_cell_output_dim))
        constants = rnn_cell.get_constants(K.expand_dims(x_step, 1))
        output, states = layer.step(x_step, [state] + constants, context)
        f = K.function(inputs=[x_step, context, state],
                       outputs=[output, states[0]])
        x_step_val = [[1, 2, 3, 4], [5, 6, 7, 8]]
        context_val = [[[0.1, 0.2, 0.3, 0.4], [0.3, 0.5, 0.7, 0.2]],
                       [[0.2, 0.1, 0.5, 0.6], [0.4, 0.3, 0.8, 0.1]]]
        state_val = [[1, 2, 3], [0.1, 0.2, 0.3]]
        outputs_val = f([x_step_val, context_val, state_val])
        rnn_cell_output_val = outputs_val[0]
        self.assertEqual(rnn_cell_output_val.shape, (2, rnn_cell_output_dim),
                         "rnn_cell_output_val")
Example #15
    def test_AttentionLayer(self):
        attention_context_dim = 2

        init_W_a = np.array([[1, 2], [3, 4], [5, 6]])  # 3*2
        init_U_a = np.array([[1, 2], [3, 4], [5, 6], [7, 8]])  # 4*2
        init_v_a = np.array([0.1, 0.2])

        layer = AttentionLayer(attention_context_dim=attention_context_dim,
                               weights=[init_W_a, init_U_a, init_v_a])
        # test config
        self.assertEqual(
            layer.get_config(),
            AttentionLayer.from_config(layer.get_config()).get_config(),
            "config")

        s = Input((3, ))  # current state tensor
        h = Input((None, 4))  # context
        self.assertEqual(layer([s, h])._keras_shape, (None, 4), "_keras_shape")

        tensors_to_debug = []
        output = AttentionLayer._calc(s,
                                      h,
                                      K.variable(init_W_a),
                                      K.variable(init_U_a),
                                      K.variable(init_v_a),
                                      tensors_to_debug=tensors_to_debug)

        # check with call to see detailed computation process
        f = K.function(inputs=[s, h], outputs=[output] + tensors_to_debug)
        s_val = [[1, 2, 3], [4, 5, 6]]
        h_val = [[[1, 2, 3, 4], [5, 6, 7, 8]],
                 [[0.1, 0.2, 0.3, 0.4], [0.5, 0.6, 0.7, 0.8]]]
        output_val_ref = [[3, 4, 5, 6], [0.3, 0.4, 0.5, 0.6]]
        output_val_list = f([s_val, h_val])
        output_val = output_val_list[0]
        W_U_sum_val = output_val_list[3]
        W_U_sum_val_ref = [[[72., 88.], [136., 168.]], [[54., 70.],
                                                        [60.4, 78.]]]
        self.assertTrue(
            np.sum(np.abs(output_val - output_val_ref)) < 0.0001, 'output_val')
        self.assertTrue(
            np.sum(np.abs(W_U_sum_val - W_U_sum_val_ref)) < 0.0001,
            'W_U_sum_val')
Example #16
class LSTMEncoderDecoderAtt(nn.Module):
    def __init__(self, src_w2i, src_i2w, tgt_w2i, tgt_i2w, embedding_dim, encoder_hidden_dim, decoder_hidden_dim, encoder_n_layers, decoder_n_layers, encoder_drop_prob=0.5, decoder_drop_prob=0.5, lr = 0.01, teacher_forcing_ratio=0.5, gradient_clip = 5, model_store_path = None):
        super(LSTMEncoderDecoderAtt, self).__init__()
        
        self.encoder_hidden_dim = encoder_hidden_dim
        self.decoder_hidden_dim = decoder_hidden_dim
        self.decoder_n_layers = decoder_n_layers
        self.teacher_forcing_ratio = teacher_forcing_ratio
        self.gradient_clip = gradient_clip
        
        self.encoder = SimpleLSTMEncoderLayer(len(src_w2i), embedding_dim, encoder_hidden_dim, encoder_n_layers, encoder_drop_prob)
        self.decoder = SimpleLSTMDecoderLayer(len(tgt_w2i), embedding_dim, encoder_hidden_dim*2, decoder_hidden_dim, decoder_n_layers, decoder_drop_prob)
        self.attention = AttentionLayer(encoder_hidden_dim*2, decoder_hidden_dim) # *2 because the encoder is bidirectional and thus its hidden size is doubled
        
        self.optimizer = torch.optim.Adam(list(self.encoder.parameters())+list(self.decoder.parameters())+list(self.attention.parameters()), lr=lr)        
        self.criterion = nn.CrossEntropyLoss(ignore_index=0)
        
        self.src_w2i = src_w2i        
        self.src_i2w = src_i2w
        self.tgt_w2i = tgt_w2i
        self.tgt_i2w = tgt_i2w
        self.epoch = 0
        self.lr = lr
        self.src_vocab_size = len(src_w2i)
        self.tgt_vocab_size = len(tgt_w2i)
        print("Source vocab size: {}".format(self.src_vocab_size))
        print("Target vocab size: {}".format(self.tgt_vocab_size))
        
        self.train_on_gpu=torch.cuda.is_available()        
        if(self.train_on_gpu):
            print('Training on GPU.')
        else:
            print('No GPU available, training on CPU.')
        self.device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
        
        if model_store_path is None:
            self.model_store_path = os.path.dirname(os.path.realpath(__file__))
        else:
            self.model_store_path = model_store_path
        if not os.path.exists(self.model_store_path):
            os.makedirs(self.model_store_path)
            
        self.log_path = os.path.join(self.model_store_path,"log")
        self.log = Log(self.log_path, clear=True)
       
        
    @staticmethod
    def show_tensor(x, prediction=None, source=None): # x is a numpy 2d matrix
        fig = plt.figure(figsize=(12, 6))
        sns.heatmap(x,cmap="rainbow")
        plt.tight_layout()        
        return fig            
            
    def train(self, train_loader, valid_loader, test_loader, batch_size, patience = 10):                           
        current_patience = patience
        
        # move model to GPU, if available
        if(self.train_on_gpu):
            self.encoder.cuda()
            self.decoder.cuda()
            self.attention.cuda()
        
        best_loss = 1000000.
        best_epoch = -1
        while current_patience > 0:                  
            current_patience -= 1
            train_loss = self._train_epoch(train_loader)            
            self.save_checkpoint("last")
            
            eval_loss = self._eval(valid_loader)
            if eval_loss < best_loss:
                current_patience = patience
                best_loss = eval_loss
                best_epoch = self.epoch
                self.save_checkpoint("best")
            
            print("\nEpoch \033[93m{:d}\033[0m training loss \033[93m{:.6f}\033[0m, eval loss \033[93m{:.6f}\033[0m, best loss \033[93m{:.6f}\033[0m at epoch \033[93m{:d}\033[0m\n".format(self.epoch, train_loss, eval_loss, best_loss, best_epoch))
            
            
    def _train_epoch(self, train_loader):                       
        self.epoch += 1
        self.encoder.train()
        self.decoder.train()
        self.attention.train()        
        
        total_loss = 0.
        pbar = ProgressBar()
        pbar.set(total_steps=len(train_loader)) 
        
        for counter, (x, y) in enumerate(train_loader):
            batch_size = x.size(0)
            max_seq_len_x = x.size(1) # x is 64 x 399 (variable length)
            max_seq_len_y = y.size(1) # y is 64 x variable length

            
            pbar.update(progress=counter, text="Epoch {:d}, progress {}/{}, train average loss \033[93m{:.6f}\033[0m (mx/my = {}/{}) ... ".format(self.epoch, counter, len(train_loader), total_loss/(counter+1), max_seq_len_x, max_seq_len_y))                         
                        
            #if counter > 1:               
            #    break                
            if counter % 1000 == 0 and counter > 0:
                self.save_checkpoint("last")
            
            
            loss = 0            
            # print(x.size()) # x is a 64 * 399 tensor (batch*max_seq_len_x)               

            if(self.train_on_gpu):
                x, y = x.cuda(), y.cuda()
            
            encoder_hidden = self.encoder.init_hidden(batch_size)
            decoder_hidden = self.decoder.init_hidden(batch_size)
            #print(decoder_hidden[0].size())
            
            # zero grads in optimizer
            self.optimizer.zero_grad()                
            
            # encoder
            # x is batch_size x max_seq_len_x            
            encoder_output, encoder_hidden = self.encoder(x, encoder_hidden)             
            # encoder_output is batch_size x max_seq_len_x x encoder_hidden
            #print(encoder_output.size())
            
            # create first decoder output for initial attention call, extract from decoder_hidden
            decoder_output = decoder_hidden[0].view(self.decoder_n_layers, 1, batch_size, self.decoder_hidden_dim) #torch.Size([2, 1, 64, 512])
            # it should look like batch_size x 1 x decoder_hidden_size, so tranform it
            decoder_output = decoder_output[-1].permute(1,0,2) 
            #print(decoder_output.size())
                
            loss = 0                 
            for i in range(max_seq_len_y): # why decoder_hidden is initialized in epoch and not in batch??
                #print("\t Decoder step {}/{}".format(i, max_seq_len_y))    
                
                # teacher forcing (or it is first word which always is start-of-sentence)
                if random.random()<=self.teacher_forcing_ratio or i==0:
                    decoder_input = torch.zeros(batch_size, 1, dtype = torch.long, device=self.device) # 1 in middle is because lstm expects (batch, seq_len, input_size): 
                    for j in range(batch_size):
                        decoder_input[j]=y[j][i]                
                    #print(decoder_input.size()) # batch_size x 1                            
                else: # feed own previous prediction extracted from word_softmax_projection
                    _, decoder_input = word_softmax_projection.max(1) # no need for values, just indexes 
                    decoder_input = decoder_input.unsqueeze(1) # from batch_size to batch_size x 1                    
                    #print(decoder_input.size()) # batch_size x 1                            

                # remove me, for printing attention
                if counter == 1:
                    self.attention.should_print = False#True
                    #print("\t Decoder step {}/{}".format(i, max_seq_len_y))    
                else:
                    self.attention.should_print = False
                    self.attention.att_mat = []
                context = self.attention(encoder_output, decoder_output)
                
                # context is batch_size * encoder_hidden_dim            
                decoder_output, decoder_hidden, word_softmax_projection = self.decoder.forward_step(decoder_input, decoder_hidden, context)
                # first, reduce word_softmax_projection which is torch.Size([64, 1, 50004]) to 64 * 50004
                word_softmax_projection = word_softmax_projection.squeeze(1) # eliminate dim 1
                
                # now, select target y
                # y looks like batch_size * max_seq_len_y : tensor([[    2, 10890, 48108,  ...,     0,     0,     0], ... ... ..
                target_y = y[:,i] # select from y the ith column and shape as an array 
                # target_y now looks like [ 10, 2323, 5739, 24, 9785 ... ] of size 64 (batch_size)
                #print(word_softmax_projection.size())
                #print(target_y.size())
                loss += self.criterion(word_softmax_projection, target_y) # ignore index not set as we want 0 to count to error too
            
            # remove me, attention printing
            """if counter == 1:
                fig = plt.figure(figsize=(12, 10))
                sns.heatmap(self.attention.att_mat,cmap="gist_heat")                
                plt.tight_layout()            
                fig.savefig('img/__'+str(self.epoch)+'.png')
                plt.clf()
            """    
            total_loss += loss.data.item()/batch_size
            loss.backward() # calculate the loss and perform backprop
            
            # `clip_grad_norm` helps prevent the exploding gradient problem in RNNs / LSTMs.
            nn.utils.clip_grad_norm_(self.encoder.parameters(), self.gradient_clip)
            nn.utils.clip_grad_norm_(self.decoder.parameters(), self.gradient_clip)
            nn.utils.clip_grad_norm_(self.attention.parameters(), self.gradient_clip)
            self.optimizer.step()
            # end batch
            
        # end current epoch
        pbar.update(text="Epoch {:d}, train done, average loss \033[93m{:.6f}\033[0m".format(self.epoch, total_loss)) 
        self.log.var("Loss|Train loss|Validation loss", self.epoch, total_loss, y_index=0)
        self.log.draw()
        
        return total_loss
    
    def run (self, data_loader, batch_size, beam_size=3): #data is either a list of lists or a dataset_loader
        self.encoder.eval()
        self.decoder.eval()
        self.attention.eval()            
        
        pbar = ProgressBar()
        pbar.set(total_steps=len(data_loader)) 
       
        total_loss = 0.
        with torch.no_grad():
            for counter, (x, y) in enumerate(data_loader):                
                pbar.update(progress=counter, text="Epoch {:d}, progress {}/{}, eval average loss \033[93m{:.6f}\033[0m ... ".format(self.epoch, counter, len(data_loader), total_loss/(counter+1)))  
                
                if x.size(0) != batch_size:
                    print("\t Incomplete batch, skipping.")
                    continue
                
                if(self.train_on_gpu):
                    x, y = x.cuda(), y.cuda()
                
                x = x[0:1,:]                
                y = y[0:1,:]
                results, scores, loss = self._run_instance(x, y, beam_size)
        
        pbar.update(text="Epoch {:d}, eval done, average loss \033[93m{:.6f}\033[0m".format(self.epoch, total_loss/len(data_loader)))     
        return total_loss/len(data_loader)
        
    def _run_instance (self, x, y, beam_size):        
        from layers import Beam
        max_seq_len_x = x.size(1)
        max_seq_len_y = y.size(1)
        loss = 0
        
        # encoder
        encoder_hidden = self.encoder.init_hidden(batch_size=1)
        encoder_output, encoder_hidden = self.encoder(x, encoder_hidden) 
        
        # decoder hidden init
        (d_hid, d_cell) = self.decoder.init_hidden(batch_size=beam_size)
        # split into hidden and cell states, and format into #torch.Size([2, 1, 64, 512])
        #d_a = decoder_hidden[0].view(self.decoder_n_layers, 1, beam_size, self.decoder_hidden_dim)
        #d_b = decoder_hidden[1].view(self.decoder_n_layers, 1, beam_size, self.decoder_hidden_dim)
                
        # init decoders (beam_size)
        beams = []
        for i in range(beam_size):        
            b = Beam()            
            #print( d_hid.size() ) # torch.Size([1, 3, 256]) 1 layer, 3 batch_size, 256 hidden
            b.current_decoder_hidden = (d_hid[:,i:i+1,:], d_cell[:,i:i+1,:])            
            b.sequence = [3] # set to BOS, which is 2, 3 is for dummy loader            
            beams.append(b)
            if i != 0: # force that in the first step all results come from the first beam
                b.score = -10000
            
                        
        #word_softmax_projection = torch.zeros(1, 5, dtype = torch.float, device=self.device)
        #word_softmax_projection[:,2] = 1. # beginning of sentence value is 2, set it  #XXX
        
        # prepare decoder for initial attention computation
        #decoder_output = decoder_hidden[0].view(self.decoder_n_layers, 1, beam_size, self.decoder_hidden_dim) #torch.Size([2, 1, 64, 512])
        decoder_output = d_hid.view(self.decoder_n_layers, 1, beam_size, self.decoder_hidden_dim) #torch.Size([2, 1, 64, 512])
        decoder_output = decoder_output[-1].permute(1,0,2) 
                        
        loss = 0 
        total_loss = 0                
        example_array = []
        
        for i in range(max_seq_len_y): 
            print("\n\n\t Decoder step {}/{}".format(i, max_seq_len_y))                        
            
            # for decoder we need decoder_input, decoder_hidden and context
            # start with decoder_input: it is a batch_size * 1 containing 1 word index (previous)
            decoder_input_list = []
            for j in range(beam_size):        
                decoder_input_list.append([beams[j].sequence[-1]]) # select last word for each beam            
            decoder_input = torch.tensor(decoder_input_list, dtype=torch.long, device=self.device)
            
            # compose decoder_hidden
            # final hidden should be tuple of ( torch.Size([1, 3, 256]), torch.Size([1, 3, 256]) ), meaning layers, beam_size, hidden_size
            d_hid, d_cell = beams[0].current_decoder_hidden[0], beams[0].current_decoder_hidden[1]
            #print(d_hid.size()) # this should be [1, 1, 256]            
            for j in range(1, beam_size): # now, vertically stack others so we get to [1, beam_size, 256] incrementally
                d_hid = torch.cat((d_hid, beams[j].current_decoder_hidden[0]),dim = 1)
                d_cell = torch.cat((d_cell, beams[j].current_decoder_hidden[1]),dim = 1)
                #print(d_hid.size())
            decoder_hidden = (d_hid, d_cell)
            # calculate context for each
            context = self.attention(encoder_output, decoder_output)
            
            #_, decoder_input = word_softmax_projection.max(1) # no need for values, just indexes 
            #decoder_input = decoder_input.unsqueeze(1)                                           
                        
            decoder_output, decoder_hidden, word_softmax_projection = self.decoder.forward_step(decoder_input, decoder_hidden, context)                                
            word_softmax_projection = word_softmax_projection.squeeze(1) # eliminate dim 1            
            #print(word_softmax_projection.size()) # size beam_size x vocab_size
            
            # check for stopping condition                        
            stopped_count = 0
            beam_scores = []
            for j in range(beam_size):
                _, mi = word_softmax_projection[j].max(0)
                if mi == 0: # PAD token, meaning this beam has finished
                    stopped_count +=1 
                    beam_scores.append([-10000]) # ensure no score gets selected from this beam
                else: 
                    beam_scores.append([beams[j].normalized_score()])
            if stopped_count == beam_size:
                print("Reached all beams predicted zero - early condition.")
                break
            
            #print(word_softmax_projection)    
            word_softmax_projection = F.softmax(word_softmax_projection, dim = 1)                
            word_softmax_projection = torch.log(word_softmax_projection) # logarithm of softmax scores
            beam_scores = torch.tensor(beam_scores, dtype=torch.float, device=self.device) # size beam_size x 1
            word_softmax_projection = word_softmax_projection + beam_scores # add logarithms            
                        
            # now, select top scoring values
            flattened_projection = word_softmax_projection.view(beam_size*self.tgt_vocab_size)
            max_scores, max_indices = torch.topk(flattened_projection, k = beam_size)
            max_scores = max_scores.cpu().numpy()
            max_indices = max_indices.cpu().numpy()
            
            # identify to which beam each one belongs to, and recreate beams
            new_beams = []
            for (score, index) in zip(max_scores, max_indices):
                belongs_to_beam = int(index/self.tgt_vocab_size)
                vocab_index = index%self.tgt_vocab_size
                print("Score {}, index {}, belongs to beam {}, vocab_index {}".format(score, index, belongs_to_beam, vocab_index))
                b = Beam()
                b.current_decoder_hidden = (decoder_hidden[0][:,belongs_to_beam:belongs_to_beam+1,:], decoder_hidden[1][:,belongs_to_beam:belongs_to_beam+1,:])
                b.sequence = beams[belongs_to_beam].sequence + [vocab_index]
                b.score = score
                new_beams.append(b)            
            beams = new_beams
            
            print(y.cpu().numpy()[0])
            for b in beams:
                print(str(b.sequence) + " " + str(b.normalized_score()))
            
            #if print_example:                        
            #    _, mi = word_softmax_projection[0].max(0)
            #    example_array.append(mi.item())
                
            #target_y = y[:,i] # select from y the ith column and shape as an array                    
            #loss += self.criterion(word_softmax_projection, target_y) 
        
        #total_loss += loss.data.item()  
        
        sequences = [ b.sequence for b in beams ]
        scores = [ b.normalized_score() for b in beams ]
        
        return sequences, scores, total_loss
         
    
    def _run_batch_not_working (self, x, y, beam_size):
        batch_size = x.size(0)
        max_seq_len_x = x.size(1)
        max_seq_len_y = y.size(1)
        loss = 0
        
        encoder_hidden = self.encoder.init_hidden(batch_size)
        decoder_hidden = self.decoder.init_hidden(batch_size)
        
        encoder_output, encoder_hidden = self.encoder(x, encoder_hidden) 
        word_softmax_projection = torch.zeros(batch_size, 5, dtype = torch.float, device=self.device)
        word_softmax_projection[:,2] = 1. # beginning of sentence value is 2, set it  #XXX
        
        decoder_output = decoder_hidden[0].view(self.decoder_n_layers, 1, batch_size, self.decoder_hidden_dim) #torch.Size([2, 1, 64, 512])
        decoder_output = decoder_output[-1].permute(1,0,2) 
                        
        loss = 0 
        total_loss = 0        
        print_example = True
        example_array = []
        
        for i in range(max_seq_len_y): 
            #print("\t Decoder step {}/{}".format(i, max_seq_len_y))                        
            _, decoder_input = word_softmax_projection.max(1) # no need for values, just indexes 
            decoder_input = decoder_input.unsqueeze(1)                                           
            context = self.attention(encoder_output, decoder_output)
            
            decoder_output, decoder_hidden, word_softmax_projection = self.decoder.forward_step(decoder_input, decoder_hidden, context)                    
            word_softmax_projection = word_softmax_projection.squeeze(1) # eliminate dim 1
            if print_example:                        
                _, mi = word_softmax_projection[0].max(0)
                example_array.append(mi.item())
                
            target_y = y[:,i] # select from y the ith column and shape as an array                    
            loss += self.criterion(word_softmax_projection, target_y) 
        
        total_loss += loss.data.item()  
        return [], total_loss
        
    def _eval(self, valid_loader):                
        self.encoder.eval()
        self.decoder.eval()
        self.attention.eval()            
         
        pbar = ProgressBar()
        pbar.set(total_steps=len(valid_loader)) 
       
        counter = 0 
        total_loss = 0.
        with torch.no_grad():
            for counter, (x, y) in enumerate(valid_loader):                
                #if counter > 5:
                #    break
                pbar.update(progress=counter, text="Epoch {:d}, progress {}/{}, eval average loss \033[93m{:.6f}\033[0m ... ".format(self.epoch, counter, len(valid_loader), total_loss/(counter+1)))   
                
                batch_size = x.size(0)
                max_seq_len_x = x.size(1)
                max_seq_len_y = y.size(1)

                loss = 0
                                
                if(self.train_on_gpu):
                    x, y = x.cuda(), y.cuda()
                
                encoder_hidden = self.encoder.init_hidden(batch_size)
                decoder_hidden = self.decoder.init_hidden(batch_size)
       
                encoder_output, encoder_hidden = self.encoder(x, encoder_hidden) 
                word_softmax_projection = torch.zeros(batch_size, 5, dtype = torch.float, device=self.device)
                word_softmax_projection[:,2] = 1. # beginning of sentence value is 2, set it  #XXX
                
                decoder_output = decoder_hidden[0].view(self.decoder_n_layers, 1, batch_size, self.decoder_hidden_dim) #torch.Size([2, 1, 64, 512])
                decoder_output = decoder_output[-1].permute(1,0,2) 
                                
                loss = 0             
                print_example = True
                example_array = []
                
                for i in range(max_seq_len_y): 
                    #print("\t Decoder step {}/{}".format(i, max_seq_len_y))                        
                    _, decoder_input = word_softmax_projection.max(1) # no need for values, just indexes 
                    decoder_input = decoder_input.unsqueeze(1)                                           
                    context = self.attention(encoder_output, decoder_output)
                    
                    decoder_output, decoder_hidden, word_softmax_projection = self.decoder.forward_step(decoder_input, decoder_hidden, context)                    
                    word_softmax_projection = word_softmax_projection.squeeze(1) # eliminate dim 1
                    if print_example:                        
                        _, mi = word_softmax_projection[0].max(0)
                        example_array.append(mi.item())
                        
                    target_y = y[:,i] # select from y the ith column and shape as an array                    
                    loss += self.criterion(word_softmax_projection, target_y) 
                
                total_loss += loss.data.item() / batch_size    
                
                #print("\t\t\t Eval Loss: {}".format(loss.data.item()))
                if print_example:
                    print_example = False 
                    print()    
                    print("\n\n----- X:")
                    print(" ".join([self.src_i2w[str(wi.data.item())] for wi in x[0]]))                                            
                    print("----- Y:")
                    print(" ".join([self.tgt_i2w[str(wi.data.item())] for wi in y[0]]))                    
                    print("----- OUR PREDICTION:")
                    print(" ".join([self.tgt_i2w[str(wi)] for wi in example_array]))
                    print()
                    print(" ".join([str(wi.data.item()) for wi in y[0]]))
                    print(" ".join([str(wi) for wi in example_array]))
                    print()
            
        self.log.var("Loss|Train loss|Validation loss", self.epoch, total_loss, y_index=1)
        self.log.draw()        
        
        pbar.update(text="Epoch {:d}, eval done, average loss \033[93m{:.6f}\033[0m".format(self.epoch, total_loss/len(valid_loader))) 
    
        return total_loss/len(valid_loader)
    
    def old_run (self, input, max_output_len = 1000): # input is a list of lists of integers (variable len)
        self.encoder.eval()
        self.decoder.eval()
        self.attention.eval()          
        
        batch_size = len(input)
        encoder_hidden = self.encoder.init_hidden(batch_size)
        decoder_hidden = self.decoder.init_hidden(batch_size)       
            
        bordered_input = [ [2]+inst+[3] for inst in input ] # put start and end of sentence markers for each instance
        max_len = max(len(inst) for inst in bordered_input) # determines max size for all examples 

        input = np.array( [ inst + [0] * (max_len - len(inst)) for inst in bordered_input ] ) # input is now a max_len object padded with zeroes to the right (for all instances)        
        
        with torch.no_grad():            
            # numpy to tensor            
            x = torch.LongTensor(input)
            if(self.train_on_gpu):
                x = x.cuda()
            
                
            encoder_output, encoder_hidden = self.encoder(x, encoder_hidden) 
            word_softmax_projection = torch.zeros(batch_size, 5, dtype = torch.float, device=self.device)
            word_softmax_projection[:,2] = 1. # beginning of sentence value is 2, set it #XXX remember to put 2 instead of 3 for non-dummy 
            
            decoder_output = decoder_hidden[0].view(self.decoder_n_layers, 1, batch_size, self.decoder_hidden_dim) 
            decoder_output = decoder_output[-1].permute(1,0,2) 
                            
            output = [ [] for _ in range(batch_size) ]            
            for i in range(max_output_len):                
                _, decoder_input = word_softmax_projection.max(1) # no need for values, just indexes 
                decoder_input = decoder_input.unsqueeze(1)                                           
                context = self.attention(encoder_output, decoder_output)
                
                decoder_output, decoder_hidden, word_softmax_projection = self.decoder.forward_step(decoder_input, decoder_hidden, context)                    
                word_softmax_projection = word_softmax_projection.squeeze(1) # eliminate dim 1
                
                zero_count = 0
                for j in range(batch_size):
                    _, mi = word_softmax_projection[j].max(0)                    
                    output[j].append(mi.cpu().item())
                    if mi.item() == 0:
                        zero_count += 1
                
                # check ending condition (all zeroes)
                if zero_count == batch_size:
                    break
        
        # transform back to numpy (and move back to CPU just in case it was on GPU)
        #output = output.numpy()
        
        # clean each array
        clean_output = []
        for instance in output:
            clean_instance = []
            for element in instance:
                if element > 3:
                    clean_instance.append(element)
            clean_output.append(clean_instance)
        
        return clean_output
        
    
    def load_checkpoint(self, filename):
        """if latest: # filename is a folder            
            import glob            
            files = glob.glob(os.path.join(filename,"*.ckp"))
            if files == None:
                raise Exception("Load checkpoint failed with latest=True. Returned list of files in folder [{}] is None".format(filename))            
            filename = sorted(files)[-1]            
            print("Loading latest model {} ...".format(filename))                   
        """
        filename = os.path.join(self.model_store_path,"model."+filename+".ckp")
        print("Loading model {} ...".format(filename))
        checkpoint = torch.load(filename)        
        self.encoder.load_state_dict(checkpoint["encoder_state_dict"])
        self.decoder.load_state_dict(checkpoint["decoder_state_dict"])        
        self.attention.load_state_dict(checkpoint["attention_state_dict"])
        #self.optimizer.load_state_dict(checkpoint["optimizer_state_dict"])
        self.src_w2i = checkpoint["src_w2i"]
        self.src_i2w = checkpoint["src_i2w"]
        self.tgt_w2i = checkpoint["tgt_w2i"]
        self.tgt_i2w = checkpoint["tgt_i2w"]  
        self.teacher_forcing_ratio = checkpoint["teacher_forcing_ratio"]
        self.epoch = checkpoint["epoch"]
        self.gradient_clip = checkpoint["gradient_clip"]        

        self.encoder.to(self.device)
        self.decoder.to(self.device)
        self.attention.to(self.device)
        #self.optimizer.to(self.device) # careful to continue training on the same device !
        self.optimizer = torch.optim.Adam(list(self.encoder.parameters())+list(self.decoder.parameters())+list(self.attention.parameters()), lr=self.lr)        
        
        self.optimizer.load_state_dict(checkpoint['optimizer_state_dict'])
        for state in self.optimizer.state.values():
            for k, v in state.items():
                if isinstance(v, torch.Tensor):
                    state[k] = v.cuda()
        
        
    def save_checkpoint(self, filename):        
        filename = os.path.join(self.model_store_path,"model."+filename+".ckp")
        
        checkpoint = {}
        checkpoint["encoder_state_dict"] = self.encoder.state_dict()
        checkpoint["decoder_state_dict"] = self.decoder.state_dict()
        checkpoint["attention_state_dict"] = self.attention.state_dict()
        checkpoint["optimizer_state_dict"] = self.optimizer.state_dict()
        checkpoint["src_w2i"] = self.src_w2i
        checkpoint["src_i2w"] = self.src_i2w
        checkpoint["tgt_w2i"] = self.tgt_w2i
        checkpoint["tgt_i2w"] = self.tgt_i2w
        checkpoint["teacher_forcing_ratio"] = self.teacher_forcing_ratio
        checkpoint["epoch"] = self.epoch
        checkpoint["gradient_clip"] = self.gradient_clip
        torch.save(checkpoint, filename)    
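
A hedged sketch of how the class above might be driven, using only methods it defines; the vocabulary dictionaries and the data loaders are placeholders, not part of the source.

# Hypothetical driver -- src_w2i/src_i2w/tgt_w2i/tgt_i2w and the loaders are assumed to exist.
model = LSTMEncoderDecoderAtt(src_w2i, src_i2w, tgt_w2i, tgt_i2w,
                              embedding_dim=256,
                              encoder_hidden_dim=256, decoder_hidden_dim=512,
                              encoder_n_layers=2, decoder_n_layers=2,
                              lr=0.001, model_store_path="checkpoints")
model.train(train_loader, valid_loader, test_loader, batch_size=64, patience=10)
model.load_checkpoint("best")
avg_eval_loss = model.run(test_loader, batch_size=64, beam_size=3)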
Example #17
layer0 = BidirectionalEncoderSigmoid(representationsize, rnnH)

layer0representations = layer0.apply(layer0_input, layer0_mask)
layer0outputsize = 2 * rnnH
if combinationMethod != "onlyAtt":
    layer0output = layer0representations[
        ii, jj, :]  # take last hidden state as sentence representation
    layer0flattened = layer0output.flatten(2).reshape(
        (batch_size_var, layer0outputsize))

if "internalOnH" in attentionMethod:
    layer1input = layer0representations.dimshuffle(1, 2, 0)
    layer1 = AttentionLayer(rng,
                            thisInput=layer1input,
                            batchsize=batch_size_var,
                            dim1=layer0outputsize,
                            dim2=contextsize,
                            method=attentionMethod,
                            k=kattention)
    layer1outputsize = 2 * rnnH
elif "internalOnW" in attentionMethod:
    layer1input = T.tanh(x2)
    layer1 = AttentionLayer(rng,
                            thisInput=layer1input,
                            batchsize=batch_size_var,
                            dim1=representationsize,
                            dim2=contextsize,
                            method=attentionMethod,
                            k=kattention)
    layer1outputsize = representationsize
elif "externalOnH" in attentionMethod:
Example #18
ii = length2 - 1
jj = T.arange(batch_size_var)

y = y.reshape((batch_size_var, ))

layer0 = BidirectionalEncoderSigmoid(representationsize, rnnH)

layer0representations = layer0.apply(layer0_input, layer0_mask)
layer0outputsize = 2 * rnnH
if combinationMethod != "onlyAtt":
  layer0output = layer0representations[ii,jj,:] # take last hidden state as sentence representation
  layer0flattened = layer0output.flatten(2).reshape((batch_size_var, layer0outputsize))

if "internalOnH" in attentionMethod:
  layer1input = layer0representations.dimshuffle(1,2,0)
  layer1 = AttentionLayer(rng, thisInput=layer1input, batchsize=batch_size_var, dim1=layer0outputsize, dim2 = contextsize, method = attentionMethod, k = kattention)
  layer1outputsize = 2 * rnnH
elif "internalOnW" in attentionMethod:
  layer1input = T.tanh(x2)
  layer1 = AttentionLayer(rng, thisInput=layer1input, batchsize=batch_size_var, dim1=representationsize, dim2 = contextsize, method = attentionMethod, k = kattention)
  layer1outputsize = representationsize
else:
  print "ERROR: unknown attentionMethod - skipping attention"
  combinationMethod = "noAtt"

if "Kmax" in attentionMethod and "Sequence" in attentionMethod:
  layer1outputsize = layer1outputsize * kattention

layer1flattened = layer1.output.flatten(2).reshape((batch_size_var, layer1outputsize))

if combinationMethod == "onlyAtt":
Example #19
layer0 = LeNetConvPoolLayer(rng,
                            W=convW,
                            b=convB,
                            input=layer0_input,
                            filter_shape=filter_shape,
                            poolsize=poolsize)

layer0flattened = layer0.output.flatten(2).reshape(
    (batch_size_var, nkerns[0] * sizeAfterPooling))
layer0outputsize = nkerns[0] * sizeAfterPooling

if "internalOnH" in attentionMethod:
    layer1 = AttentionLayer(rng,
                            thisInput=layer0.conv_out_tanh,
                            batchsize=batch_size_var,
                            dim1=nkerns[0],
                            dim2=sizeAfterConv,
                            method=attentionMethod,
                            k=kattention)
    layer1outputsize = nkerns[0]
elif "internalOnW" in attentionMethod:
    layer1 = AttentionLayer(rng,
                            thisInput=x.reshape(
                                (batch_size_var, ishape[0], ishape[1])),
                            batchsize=batch_size_var,
                            dim1=ishape[0],
                            dim2=ishape[1],
                            method=attentionMethod,
                            k=kattention)
    layer1outputsize = ishape[0]
elif "externalOnH" in attentionMethod:
Example #20
    def __init__(self, enc_in, dec_in, c_out, out_len,
                 factor=5, d_model=512, n_heads=8, e_layers=3, d_layers=2, d_ff=512, group_factors=None,
                 group_operator='avg', group_step=1, dropout=0.0, attn='prob', embed='fixed', activation='gelu',
                 has_minute=False, has_hour=True):
        super(HLInformer, self).__init__()
        self.pred_len = out_len
        self.attn = attn

        if group_factors is None:
            group_factors = [4, 1]
        else:
            group_factors = [*group_factors, 1]

        self.group_factors = group_factors

        # Grouping
        self.group_layers = nn.ModuleList([GroupLayer(gf, group_operator, group_step) for gf in group_factors])
        # Encoding
        self.enc_embeddings = nn.ModuleList(
            [InformerDataEmbedding(enc_in, d_model, has_minute=has_minute, has_hour=has_hour) for _ in group_factors])
        self.dec_embeddings = nn.ModuleList(
            [InformerDataEmbedding(dec_in, d_model, has_minute=has_minute, has_hour=has_hour) for _ in group_factors])
        # Attention
        Attn = ProbAttention if attn == 'prob' else FullAttention
        # Encoder
        self.encoders = nn.ModuleList([Encoder(
            [
                EncoderLayer(
                    AttentionLayer(Attn(False, factor, attention_dropout=dropout),
                                   d_model, n_heads),
                    d_model,
                    d_ff,
                    dropout=dropout,
                    activation=activation
                ) for l in range(e_layers)
            ],
            [
                ConvLayer(
                    d_model
                ) for l in range(e_layers - 1)
            ],
            norm_layer=torch.nn.LayerNorm(d_model)
        ) for _ in group_factors])
        # Decoder
        self.decoders = nn.ModuleList([Decoder(
            [
                DecoderLayer(
                    AttentionLayer(FullAttention(True, factor, attention_dropout=dropout),
                                   d_model, n_heads),
                    AttentionLayer(FullAttention(False, factor, attention_dropout=dropout),
                                   d_model, n_heads),
                    d_model,
                    d_ff,
                    dropout=dropout,
                    activation=activation,
                )
                for l in range(d_layers)
            ],
            norm_layer=torch.nn.LayerNorm(d_model)
        ) for _ in group_factors])
        # self.end_conv1 = nn.Conv1d(in_channels=label_len+out_len, out_channels=out_len, kernel_size=1, bias=True)
        # self.end_conv2 = nn.Conv1d(in_channels=d_model, out_channels=c_out, kernel_size=1, bias=True)
        self.projections = nn.ModuleList(
            [nn.Linear(d_model * (i + 1), c_out, bias=True) for i, gf in enumerate(group_factors)])
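
A hedged instantiation sketch keyed to the constructor signature above; all concrete dimensions are assumptions chosen only for illustration.

# Hypothetical instantiation of HLInformer -- every value is an assumption.
model = HLInformer(enc_in=7, dec_in=7, c_out=7, out_len=24,
                   d_model=512, n_heads=8, e_layers=3, d_layers=2, d_ff=512,
                   group_factors=[4], group_operator='avg',
                   dropout=0.05, attn='prob', has_minute=False, has_hour=True)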