Example #1
def get_model():

    embedin = Embedin(vocab_size, embed_size)
    embedout = Embedout(vocab_size, hidden_size)

    enc = Encoder(embedin, embed_size, hidden_size, n_layers)
    dec5 = Decoder(embedin, embed_size, hidden_size, n_layers)
    #enc7 = Encoder(embedin, embed_size, hidden_size, n_layers)
    dec7 = Decoder(embedin, embed_size, hidden_size, n_layers)
    atten = Attention(hidden_size)
    #atten7 = Attention(hidden_size)
    ae5 = Autoencoder(enc, dec5, atten, embedout, 13)
    ae7 = Autoencoder(enc, dec7, atten, embedout, 17)

    discriminator = Discriminator(hidden_size)
    discriminator2 = Discriminator2(vocab_size, embed_size, hidden_size)

    seq2seq57 = Autoencoder(enc, dec7, atten, embedout, 17)
    seq2seq75 = Autoencoder(enc, dec5, atten, embedout, 13)

    lm5 = Lstm(vocab_size, embed_size, hidden_size, n_layers, drop_out=0)
    lm7 = Lstm(vocab_size, embed_size, hidden_size, n_layers, drop_out=0)
    lm5.load_state_dict(torch.load('models/lm5_lstm_dropout.th'))
    lm7.load_state_dict(torch.load('models/lm7_lstm_dropout.th'))

    ae5 = ae5.cuda()
    ae7 = ae7.cuda()
    discriminator = discriminator.cuda()
    discriminator2 = discriminator2.cuda()
    seq2seq57 = seq2seq57.cuda()
    seq2seq75 = seq2seq75.cuda()
    lm5 = lm5.cuda()
    lm7 = lm7.cuda()
    return ae5, ae7, discriminator, discriminator2, seq2seq57, seq2seq75, lm5, lm7
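get_model reads module-level hyperparameters that the excerpt never defines. A minimal sketch of the assumed globals and a call site (the names come from the code above; the values are illustrative only):

import torch

vocab_size = 30000   # assumed: size of the shared vocabulary
embed_size = 256     # assumed: embedding dimension
hidden_size = 512    # assumed: RNN hidden dimension
n_layers = 1         # assumed: number of RNN layers

ae5, ae7, discriminator, discriminator2, seq2seq57, seq2seq75, lm5, lm7 = get_model()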
Example #2
def __init__(self, dataset, args):
    self.device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
    self.dataset = dataset
    self.encoder = ConvEncoder(self.dataset.num_ent(), self.dataset.num_rel(), args.emb_dim, self.device)
    self.decoder = Decoder(self.dataset.num_ent(), self.dataset.num_rel(), self.device)
    self.discriminator = ComplEx(self.dataset.num_ent(), self.dataset.num_rel(), args.emb_dim, self.device)

    self.args = args
    self.adversarial_loss = nn.BCEWithLogitsLoss()
    self.reconstruction_loss = nn.BCELoss()
Example #3
    def __init__(self, inp_dim, out_dim, emb_dim, enc_hid, dec_hid, enc_drop,
                 dec_drop, epoch, clip, sparse_max, tf, max_length, vocab,
                 batch, device):
        self.inp_dim = inp_dim
        self.out_dim = out_dim
        self.emb_dim = emb_dim
        self.enc_hid = enc_hid
        self.dec_hid = dec_hid
        self.enc_drop = enc_drop
        self.dec_drop = dec_drop
        self.tf = tf
        self.max_length = max_length
        self.batch = batch
        self.device = device
        self.vocab = vocab

        self.attn = Attention(enc_hid, dec_hid, sparse_max=sparse_max)
        self.enc = Encoder(inp_dim, emb_dim, enc_hid, dec_hid, enc_drop)
        self.dec = Decoder(out_dim, emb_dim, enc_hid, dec_hid, dec_drop,
                           self.attn)
        self.model = Seq2Seq(self.enc, self.dec, device).to(device)

        self.model.apply(self.init_weights)
        self.count_parameters()
        self.optimizer = optim.Adam(self.model.parameters())
        if sparse_max:
            self.criterion = SparsemaxLoss(ignore_index=0)
        else:
            self.criterion = nn.CrossEntropyLoss(ignore_index=0)  # pad_idx 0
        self.epoch = epoch
        self.clip = clip
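The constructor calls self.init_weights and self.count_parameters, which live elsewhere in the class. A minimal sketch of such helpers, assuming the common seq2seq-tutorial pattern rather than the project's verified code (nn is torch.nn, as in the surrounding snippet):

    def init_weights(self, m):
        # self.model.apply calls this once per submodule:
        # small-normal weights, zero biases.
        for name, param in m.named_parameters():
            if 'weight' in name:
                nn.init.normal_(param.data, mean=0, std=0.01)
            else:
                nn.init.constant_(param.data, 0)

    def count_parameters(self):
        # Print the number of trainable parameters.
        n = sum(p.numel() for p in self.model.parameters() if p.requires_grad)
        print('The model has {:,} trainable parameters'.format(n))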
Example #4
import torch
import torch.nn as nn
import torch.optim as optim
# Encoder, Decoder, Seq2Seq and Return_Data_Loaders come from the project's own modules.

train_iterator, valid_iterator, test_iterator, SRC, TRG = Return_Data_Loaders()

device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

INPUT_DIM = len(SRC.vocab)
OUTPUT_DIM = len(TRG.vocab)
ENC_EMB_DIM = 256
DEC_EMB_DIM = 256
HID_DIM = 512
N_LAYERS = 2
ENC_DROPOUT = 0.5
DEC_DROPOUT = 0.5

enc = Encoder(INPUT_DIM, ENC_EMB_DIM, HID_DIM, N_LAYERS, ENC_DROPOUT)
dec = Decoder(OUTPUT_DIM, DEC_EMB_DIM, HID_DIM, N_LAYERS, DEC_DROPOUT)

model = Seq2Seq(enc, dec, device).to(device)

##########################################################################

model.apply(init_weights)

optimizer = optim.Adam(model.parameters())

TRG_PAD_IDX = TRG.vocab.stoi[TRG.pad_token]

criterion = nn.CrossEntropyLoss(ignore_index=TRG_PAD_IDX)

##########################################################################
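The init_weights applied at model.apply(init_weights) is not defined in this excerpt. A minimal sketch consistent with that call, assuming the usual tutorial-style initializer (compare the helper defined in Example #6):

def init_weights(m):
    # model.apply calls this once per submodule; draw every parameter
    # from a small uniform range.
    for name, param in m.named_parameters():
        nn.init.uniform_(param.data, -0.08, 0.08)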
Example #5
File: main.py Project: MagCAE/magcae
def main(args):
    
    dataset = args.dataset
    emb_output_dir = args.output
    epochs = args.epochs
    agg = args.agg
    p = args.p
    tr = args.tr
    lam = args.lam
    loss_func = args.loss

    # Preprocess dataset
    adj, views_features = load_data(dataset, num_views=3)
    adj_orig = adj
    adj_orig = adj_orig - sp.dia_matrix((adj_orig.diagonal()[np.newaxis, :], [0]), shape=adj_orig.shape)
    adj_orig.eliminate_zeros()
    # Calculate pairwise similarity.
    views_sim_matrix = {}
    views_feature_matrix = {}

    for view in list(views_features.keys()):
        feature_matrix = csc_matrix.todense(views_features[view])
        views_feature_matrix.update({view:feature_matrix})
 
    kernal = "rbf"
    if loss_func == 'all':
        attr_sim = cal_attr_sim(views_feature_matrix, dataset)
    else:
        attr_sim = 0

    # Split edges into train, validation and test sets;
    # remove test edges from the training adjacency matrix.
    adj_train, train_edges, val_edges, val_edges_false, test_edges, test_edges_false = mask_test_edges(dataset, adj)
    
    print("Masking edges Done!")
    adj = adj_train
    nx_G = nx.from_numpy_array(adj.toarray())
    num_nodes = adj.shape[0]
    adj_norm = preprocess_graph(adj)

    views_features_num = {}
    views_features_nonzero = {}
    for view in list(views_features.keys()):
        views_features[view] = sparse_to_tuple(views_features[view].tocoo())
        views_features_num.update({view:views_features[view][2][1]})
        views_features_nonzero.update({view:views_features[view][1].shape[0]})
    
    # Build model
    MagCAE = {}
    for view in list(views_features.keys()):
        x,y = views_features[view][2][0], views_features[view][2][1]
        model = GAE(y, views_features_nonzero[view], adj_norm, math.ceil(2*p*y), math.ceil(p*y))
        MagCAE.update({view:model})

    # Loss function and optimizer.
    # loss weight taken by each nodes to the total loss.
    pos_weight = float(adj.shape[0] * adj.shape[0] - adj.sum()) /adj.sum()
    norm = adj.shape[0] * adj.shape[0] / float(adj.shape[0] * adj.shape[0] - adj.sum())*2
    optimizer = tf.keras.optimizers.Adam()

    adj_targ = adj_train + sp.eye(adj_train.shape[0])
    adj_targ = sparse_to_tuple(adj_targ)

    indices= np.array(adj_targ[0])
    values = np.array(adj_targ[1])
    dense_shape = np.array(adj_targ[2])
    sparse_targ = tf.SparseTensor(indices = indices,
                                    values = values,
                                    dense_shape = dense_shape)
    sparse_targ = tf.cast(sparse_targ, dtype=tf.float32)

    adj_targ = tf.sparse.to_dense(sparse_targ)
    adj_targ = tf.reshape(adj_targ,[-1])
    # Train and Evaluate Model
    # Training Loop:
    # In each epoch: views - > view_embedding -> aggregate embedding -> total loss ->  update gradients
    decoder = Decoder(100)

    for epoch in range(epochs):
        loss = 0
        start = time.time()

        with tf.GradientTape() as tape:
            ag_embedding = {}

            for VAE in list(MagCAE.keys()):
                v_embedding, a_hat = MagCAE[VAE](views_features[VAE])
                ag_embedding.update({VAE:v_embedding})

            # aggregate embeddings
            embedding, aggregator = aggregate_embeddings(ag_embedding, agg)
            # reconstruct a_hat
            a_hat = decoder(embedding)
            loss += loss_function(a_hat, adj_targ, pos_weight, norm, attr_sim, embedding, num_nodes, lam, loss_func)

        if agg == "weighted_concat":
            variables = MagCAE['view1'].trainable_variables + MagCAE['view2'].trainable_variables + MagCAE['view3'].trainable_variables + aggregator.trainable_variables

        gradients = tape.gradient(loss, variables)
        optimizer.apply_gradients(zip(gradients, variables))

        # Evaluate on validate set
        embedding = np.array(embedding)
        roc_cur, ap_cur, _, _ = evaluate(val_edges, val_edges_false, adj_orig, embedding)

        print("Epoch {}: Val_Roc {:.4f}, Val_AP {:.4f}, Time Consumed {:.2f} sec\n".format(epoch+1, roc_cur, ap_cur, time.time()-start))

    print("Training Finished!")
    
    # Evaluation Result on test Edges
    test_embedding = {}
    for VAE in list(MagCAE.keys()):
        v_embedding, a_hat = MagCAE[VAE](views_features[VAE])
        test_embedding.update({VAE:v_embedding})

    # aggregate embeddings
    embedding, aggregator = aggregate_embeddings(test_embedding, agg)
    embedding = np.array(embedding) # embedding is a tensor, convert to np array.

    # reconstruct a_hat
    test_roc, test_ap, fpr, tpr = evaluate(test_edges, test_edges_false, adj_orig, embedding)
    print("MagCAE test result on {}".format(dataset))
    print("Test Roc: {}, Test AP: {}, P: {}, Training Ratio: {}, Lambda: {}.".format(test_roc, test_ap, p, tr, lam))
Example #6
import pickle
import numpy as np
import torch
from torch.utils.data import DataLoader
from my_dataloader import *
from create_vocabulary import *
from Model import Encoder, Decoder, Seq2Seq
import torch.nn as nn
import torch.optim as optim
from torch.optim.lr_scheduler import StepLR

device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")


#encoder = Encoder(input_dim=2999, name='emb_inspec.npy')
#decoder = Decoder(output_dim=2999, name='emb_inspec.npy')
encoder = Encoder()
decoder = Decoder()
model = Seq2Seq(encoder, decoder, device).to(device)
#model.load_state_dict(torch.load('train.pt'))

def init_weights(m):
    for name, param in m.named_parameters():
        nn.init.normal_(param.data, mean=0, std=0.01)

batch=64

tot_epoch = 100

vocab = np.load('vocab_kp20k2.npy', allow_pickle=True).item()
#vocab = np.load('vocab_inspec.npy', allow_pickle=True).item()
TRG_PAD_IDX = vocab('<pad>')
criterion = nn.CrossEntropyLoss(ignore_index = TRG_PAD_IDX)
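The excerpt stops after building the loss. A minimal sketch of the training step it presumably feeds (assumed shapes: model(src, trg) returns [batch, trg_len, vocab] logits):

optimizer = optim.Adam(model.parameters())
scheduler = StepLR(optimizer, step_size=10, gamma=0.5)  # StepLR is imported above

def train_step(src, trg):
    # One optimization step; logits and targets are flattened for
    # CrossEntropyLoss, which ignores TRG_PAD_IDX.
    model.train()
    optimizer.zero_grad()
    output = model(src, trg)
    loss = criterion(output.reshape(-1, output.size(-1)), trg.reshape(-1))
    loss.backward()
    optimizer.step()
    return loss.item()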
Example #7
def q4(training_images, epochs=1000, latent_dim=10, batch_size=500):
    """
    The main function of question 4.
    Trains a GLO model to map vectors from the unit sphere in latent_dim-dimensions to mnist digits.
    :param training_images: A training set of mnist digits whose distribution we attempt to learn.
    :param epochs: Number of training epochs.
    :param latent_dim: Dimension of the latent vectors in which we wish to learn the distribution.
    :param batch_size: Size of a single batch in training.
    """
    model = Decoder()
    # Creating noise vectors in latent_dim dimensions, and projecting them on the unit sphere.
    noise_data = np.random.normal(size=(training_images.shape[0],
                                        latent_dim)).astype('float32')
    noise_data = noise_data / (np.linalg.norm(noise_data, axis=1)[...,
                                                                  np.newaxis])
    # Creating training batches.
    training_batches = tf.data.Dataset.from_tensor_slices(
        (training_images, noise_data)).batch(batch_size)
    # Defining optimizers, and a loss metric.
    z_optimizer = tf.keras.optimizers.Adam(1e-2)
    glo_optimizer = tf.keras.optimizers.Adam(1e-3)
    training_loss_metric = tf.keras.metrics.Mean()
    training_loss = list()
    shown = False
    # Training loop.
    for epoch in range(1, epochs + 1):
        for original_images_batch, noise_batch in training_batches:
            # Representing the noise batch as a tensorflow variable so we could take the derivative of the loss w.r.t the batch.
            noise_vector_batch = tf.Variable(noise_batch)
            with tf.GradientTape() as z_tape, tf.GradientTape() as glo_tape:
                generated_images_batch = model(noise_vector_batch)
                z_loss = tf.square(
                    tf.norm(original_images_batch - generated_images_batch,
                            axis=1))
                glo_loss = tf.reduce_mean(z_loss)
            z_gradients = z_tape.gradient(z_loss, [
                noise_vector_batch,
            ])
            z_optimizer.apply_gradients(
                zip(z_gradients, [
                    noise_vector_batch,
                ]))
            glo_gradients = glo_tape.gradient(glo_loss,
                                              model.trainable_variables)
            glo_optimizer.apply_gradients(
                zip(glo_gradients, model.trainable_variables))
            training_loss_metric(glo_loss)
            # After applying the gradients, project the optimized latent vectors back onto the unit sphere.
            # (Note: this reassignment is local to the loop body; the updated latents are not written back into training_batches across epochs.)
            noise_batch = noise_vector_batch / (tf.norm(
                noise_vector_batch, axis=1)[..., tf.newaxis])
            # Visualization of the progress is done using training data, and thus placed in the loop.
            if not shown:
                if (epoch in [1, epochs]) or (epochs < 5) or not (epoch % (epochs // 5)):
                    fig = plt.figure()
                    plt.title('GLO Training Progress Visualization \nepoch {}'.
                              format(epoch))
                    plt.xticks([])
                    plt.yticks([])
                    noise = noise_batch[:3, :]
                    images = model(noise)
                    for i in range(1, 4):
                        fig.add_subplot(1, 3, i)
                        plt.xticks([])
                        plt.yticks([])
                        plt.imshow(images[i - 1, :, :, 0], cmap='gray')
                    shown = True
        training_loss.append(training_loss_metric.result())
        shown = False
        print("epoch {}: training loss - {} ".format(epoch, training_loss[-1]))

    plot_losses(training_loss, [])
    for original_images_batch, noise_batch in training_batches:
        display_generating_examples(model, noise_batch[:5, :])
        break
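Decoder() is the GLO generator, defined elsewhere. A minimal Keras sketch of a generator mapping latent vectors to 28x28x1 MNIST images (assumed architecture; only the input/output contract is taken from the code above):

import tensorflow as tf

class Decoder(tf.keras.Model):
    # Assumed generator: [batch, latent_dim] -> [batch, 28, 28, 1].
    def __init__(self):
        super().__init__()
        self.dense = tf.keras.layers.Dense(7 * 7 * 64, activation='relu')
        self.reshape = tf.keras.layers.Reshape((7, 7, 64))
        self.up1 = tf.keras.layers.Conv2DTranspose(
            32, 3, strides=2, padding='same', activation='relu')
        self.up2 = tf.keras.layers.Conv2DTranspose(
            1, 3, strides=2, padding='same', activation='sigmoid')

    def call(self, z):
        return self.up2(self.up1(self.reshape(self.dense(z))))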
Example #8
    def _create_graph(self, DECODER_TYPE):
        self.raw_state = tf.placeholder(tf.float32, shape=[None, Config.NUM_OF_CUSTOMERS+1, 2], name='State')
        self.current_location = self.raw_state[:, -1]
        self.sampled_cost = tf.placeholder(tf.float32, [None, 1], name='Sampled_Cost')
        if Config.SEQUENCE_COST == 1:
            self.sampled_cost = tf.placeholder(tf.float32, [None, Config.NUM_OF_CUSTOMERS], name='Sampled_Cost')
        self.batch_size = tf.shape(self.raw_state)[0]
        self.keep_prob = tf.placeholder(tf.float32)
        self.global_step = tf.Variable(0, trainable=False, name='step')
        self.input_lengths = tf.fill([self.batch_size], Config.NUM_OF_CUSTOMERS)
        self.or_route = tf.placeholder(tf.int32, shape=[None, Config.NUM_OF_CUSTOMERS+1])
        self.or_cost = tf.placeholder(tf.float32, shape=[None, 1])
        self.difference_in_length = tf.reduce_mean(self.sampled_cost - self.or_cost)
        self.relative_length = tf.reduce_mean(self.sampled_cost/self.or_cost)
        if Config.SEQUENCE_COST == 1:
            self.relative_length = tf.reduce_mean(self.sampled_cost[:, 0]/self.or_cost)
        self.start_tokens = tf.placeholder(tf.int32, shape=[None])
        self.end_token = -1
        self.MA_baseline = tf.Variable(0.0, dtype=tf.float32, trainable=False)
        if Config.SEQUENCE_COST == 1:
            self.MA_baseline = tf.Variable(tf.tile([0.0], [Config.NUM_OF_CUSTOMERS]), dtype=tf.float32, trainable=False)
            self.assign_init_MA = tf.assign(self.MA_baseline, tf.reduce_mean(self.sampled_cost, axis=0))
        else:
            self.assign_init_MA = tf.assign(self.MA_baseline, tf.reduce_mean(self.sampled_cost))
        if Config.STATE_EMBED == 1:
            self.with_depot_state = self.raw_state
            for i in range(0):  # executes zero times; the extra conv layers are disabled
                self.with_depot_state = tf.layers.conv1d(self.with_depot_state, Config.RNN_HIDDEN_DIM, 1,
                                                         padding="SAME", activation=tf.nn.relu)
            self.with_depot_state = tf.layers.conv1d(self.with_depot_state, Config.RNN_HIDDEN_DIM, 1,
                                                     padding="VALID")
        else:
            self.with_depot_state = self.raw_state
        self.state = self.with_depot_state[:, :-1, :]
        self.old_probs = tf.placeholder(tf.float32, shape=[None, Config.NUM_OF_CUSTOMERS, Config.NUM_OF_CUSTOMERS])

        # ENCODER
        if Config.DIRECTION == 4 or Config.DIRECTION == 5 or Config.DIRECTION == 6:
            self.encoder_outputs = self.state
            self.encoder_state = None
        if Config.DIRECTION < 4:  # i.e. DIRECTION < 6 and not 4, 5, or 6
            self.encoder_outputs, self.encoder_state = Encoder(self.state, self.keep_prob)

        # HELPERS
        self.training_index = tf.concat([tf.expand_dims(self.start_tokens, -1), self.or_route], axis=1)
        self.training_index = self.training_index[:, :-1]
        self.gather_ids = tf.concat([tf.expand_dims(
            tf.reshape(tf.tile(tf.reshape(tf.range(self.batch_size), [-1, 1]), [1, tf.shape(self.with_depot_state)[1]]), [-1]), -1),
                                     tf.reshape(self.training_index, [-1, 1])], -1)
        if Config.STATE_EMBED == 0:
            self.training_inputs = tf.reshape(tf.gather_nd(self.with_depot_state, self.gather_ids),
                                              [self.batch_size, tf.shape(self.with_depot_state)[1], 2])
        else:
            self.training_inputs = tf.reshape(tf.gather_nd(self.with_depot_state, self.gather_ids),
                                              [self.batch_size, tf.shape(self.with_depot_state)[1], Config.RNN_HIDDEN_DIM])
        train_helper, pred_helper = Helper(self.with_depot_state, self.batch_size, self.training_inputs,
                                           self.start_tokens, self.end_token)

        # DECODER
        if Config.DIRECTION < 6:
            train_decoder, pred_decoder, critic_network_pred = Decoder(self.batch_size, self.encoder_state, self.encoder_outputs,
                                                                       train_helper, pred_helper, self.state, self.start_tokens,
                                                                       self.end_token, self.keep_prob, self.raw_state, DECODER_TYPE)

            self.train_final_output, self.train_final_state, train_final_sequence_lengths = tf.contrib.seq2seq.dynamic_decode(
                train_decoder, impute_finished=False, maximum_iterations=tf.shape(self.state)[1])
            self.train_final_action = self.train_final_output.sample_id

            self.pred_final_output, self.pred_final_state, pred_final_sequence_lengths = tf.contrib.seq2seq.dynamic_decode(
                pred_decoder, impute_finished=False, maximum_iterations=tf.shape(self.state)[1])
            self.pred_final_action = self.pred_final_output.sample_id
            self.base_line_est = critic_network_pred
            self.logits = self.train_final_output.rnn_output

        if Config.DIRECTION == 6:
            self.train_final_action, self.pred_final_action, self.base_line_est, self.logits = Beam_Search(
                self.batch_size, self.encoder_state, self.encoder_outputs,
                train_helper, pred_helper, self.with_depot_state, self.start_tokens,
                self.end_token, self.keep_prob, self.raw_state, DECODER_TYPE)
            # self.pred_final_action = tf.squeeze(self.pred_final_action)

        if Config.DIRECTION == 9:
            self.train_final_action, self.pred_final_action, self.base_line_est, self.logits = Reza_Model(self.batch_size,
                                                                                                          self.with_depot_state)
        if Config.DIRECTION == 10:
            self.train_final_action, self.pred_final_action, self.base_line_est, self.logits = Wyatt_Model(self.batch_size,
                                                                                                           self.state,
                                                                                                           self.raw_state)
        self.probs = self.logits
        self.probs = self.probs + tf.to_float(tf.less(self.probs, -.8*Config.LOGIT_PENALTY))*Config.LOGIT_PENALTY
        self.probs = tf.clip_by_value(tf.nn.softmax(self.probs), 1e-7, 1e7)
        gather_ind = tf.concat([
            tf.reshape(tf.tile(tf.reshape(tf.range(0, self.batch_size), [-1, 1]), [1, Config.NUM_OF_CUSTOMERS]), [-1, 1]),
            tf.tile(tf.reshape(tf.range(0, Config.NUM_OF_CUSTOMERS), [-1, 1]), [self.batch_size, 1]),
            tf.reshape(self.pred_final_action, [-1, 1])], axis=1)
        self.new_probs_with_pi = tf.reshape(tf.gather_nd(self.probs, gather_ind), [self.batch_size, Config.NUM_OF_CUSTOMERS])
        self.old_probs_with_pi = tf.reshape(tf.gather_nd(self.old_probs, gather_ind), [self.batch_size, Config.NUM_OF_CUSTOMERS])
        self.ratio = tf.divide(self.new_probs_with_pi, self.old_probs_with_pi)

        if DECODER_TYPE == 0:
            # x = tf.range(0, 19, dtype=tf.int32)
            # x = [tf.random_shuffle(x)]

            # for i in range(499):
            #     y = tf.range(0, 19, dtype=tf.int32)
            #     y = [tf.random_shuffle(y)]
            #     x = tf.concat((x, y), axis=0)
            # self.pred_final_action = x[:self.batch_size, :]
            if Config.SEQUENCE_COST == 0:
                self.critic_loss = tf.losses.mean_squared_error(self.sampled_cost, self.base_line_est)
            else:
                self.critic_loss = tf.losses.mean_squared_error(tf.reshape(self.sampled_cost[:, 0], [-1, 1]), self.base_line_est)

            if Config.LOGIT_CLIP_SCALAR != 0:
                self.logits = Config.LOGIT_CLIP_SCALAR*tf.nn.tanh(self.logits)

            if Config.REINFORCE == 0:
                # self.weights = tf.to_float(tf.tile(tf.reshape(tf.range(
                #     1, tf.divide(1, tf.shape(self.state)[1]), -tf.divide(1, tf.shape(self.state)[1])),
                #                                               [1, -1]), [self.batch_size, 1]))
                self.actor_loss = tf.contrib.seq2seq.sequence_loss(
                    logits=self.logits,
                    targets=self.or_route[:, :-1],
                    weights=tf.ones([self.batch_size, tf.shape(self.state)[1]])
                    # weights=self.weights
                )
            else:
                self.neg_log_prob = -1*tf.nn.sparse_softmax_cross_entropy_with_logits(logits=self.logits,
                                                                                      labels=self.train_final_action)
                self.R = tf.stop_gradient(self.sampled_cost)
                if Config.SEQUENCE_COST == 1 and Config.USE_PPO == 0:
                    assign = tf.assign(self.MA_baseline, self.MA_baseline*.999 + tf.reduce_mean(self.R, axis=0)*.001)
                    with tf.control_dependencies([assign]):
                        V = self.MA_baseline
                        self.actor_loss = tf.reduce_mean(tf.multiply(self.neg_log_prob, self.R-V))
                elif Config.USE_PPO == 1:
                    assign = tf.assign(self.MA_baseline, self.MA_baseline*.999 + tf.reduce_mean(self.R, axis=0)*.001)
                    with tf.control_dependencies([assign]):
                        V = self.MA_baseline
                        adv = self.R - V
                        epsilon = 0.1
                        self.actor_loss = -tf.reduce_mean(tf.reduce_sum(
                            tf.minimum(tf.multiply(self.ratio, adv),
                                       tf.clip_by_value(self.ratio, 1.0-epsilon, 1.0+epsilon)*adv), axis=1))
                elif Config.MOVING_AVERAGE == 1:
                    assign = tf.assign(self.MA_baseline, self.MA_baseline*.999 + tf.reduce_mean(self.R)*.001)
                    with tf.control_dependencies([assign]):
                        V = self.MA_baseline
                        self.actor_loss = tf.reduce_mean(tf.multiply(tf.reduce_sum(self.neg_log_prob, axis=1), self.R-V))
                elif Config.USE_OR_COST == 1:
                    V = tf.stop_gradient(self.or_cost)
                    self.actor_loss = tf.reduce_mean(tf.multiply(tf.reduce_sum(self.neg_log_prob, axis=1), (self.R-V)/5))
                else:
                    V = tf.stop_gradient(self.base_line_est)
                    self.actor_loss = tf.reduce_mean(tf.multiply(tf.reduce_sum(self.neg_log_prob, axis=1), self.R-V))

            with tf.name_scope("Train"):
                if Config.GPU == 1:
                    colocate = True
                else:
                    colocate = False
                if Config.LR_DECAY_OFF == 0:
                    self.lr = tf.train.exponential_decay(
                        Config.LEARNING_RATE, self.global_step, 200000,
                        .9, staircase=True, name="learning_rate")
                else:
                    self.lr = Config.LEARNING_RATE
                self.train_critic_op = tf.train.AdamOptimizer(self.lr).minimize(self.critic_loss)
                if Config.MAX_GRAD != 0:
                    self.params = tf.trainable_variables()
                    self.gradients = tf.gradients(self.actor_loss, self.params, colocate_gradients_with_ops=colocate)
                    opt = tf.train.AdamOptimizer(self.lr)
                    self.clipped_gradients, gradient_norm = tf.clip_by_global_norm(self.gradients, Config.MAX_GRAD)
                    self.train_actor_op = opt.apply_gradients(zip(self.clipped_gradients, self.params), global_step=self.global_step)
                    tf.summary.scalar("grad_norm", gradient_norm)
                    tf.summary.scalar("LearningRate", self.lr)
                else:
                    self.train_actor_op = tf.train.AdamOptimizer(self.lr).minimize(self.actor_loss,
                                                                                   global_step=self.global_step,
                                                                                   colocate_gradients_with_ops=colocate)
            # # for gradient clipping https://github.com/tensorflow/nmt/blob/master/nmt/model.py

            with tf.name_scope("Loss"):
                tf.summary.scalar("Loss", self.actor_loss)
                tf.summary.scalar("Critic_Loss", self.critic_loss)
            with tf.name_scope("Performace"):
                tf.summary.scalar("Relative Critic Loss", tf.reduce_mean(self.base_line_est/self.or_cost))
                tf.summary.scalar("Relative Critic Loss to Sampled", tf.reduce_mean(self.base_line_est/self.sampled_cost))
                tf.summary.scalar("difference_in_length", self.difference_in_length)
                tf.summary.scalar("relative_length", self.relative_length)
                tf.summary.scalar("Avg_or_cost", tf.reduce_mean(self.or_cost))
                if Config.SEQUENCE_COST == 0:
                    tf.summary.scalar("Avg_sampled_cost", tf.reduce_mean(self.sampled_cost))
                else:
                    tf.summary.scalar("Avg_sampled_cost", tf.reduce_mean(self.sampled_cost[:, 0]))
                # tf.summary.histogram("LocationStartDist", tf.transpose(self.pred_final_action, [1, 0])[0])
                # tf.summary.histogram("LocationEndDist", tf.transpose(self.pred_final_action, [1, 0])[-1])
            with tf.name_scope("Config"):
                tf.summary.scalar("REINFORCE", Config.REINFORCE)
                tf.summary.scalar("DIRECTION", Config.DIRECTION)
                tf.summary.scalar("NUM_OF_CUSTOMERS", Config.NUM_OF_CUSTOMERS)
                tf.summary.scalar("StateEmbed", tf.cast(Config.STATE_EMBED, tf.int32))
                tf.summary.scalar("MAX_GRAD", Config.MAX_GRAD)
                tf.summary.scalar("LogitPen", Config.LOGIT_PENALTY)
                tf.summary.scalar("batch_size", self.batch_size)
                tf.summary.scalar("Config.LAYERS_STACKED_COUNT", Config.LAYERS_STACKED_COUNT)
                tf.summary.scalar("RNN_HIDDEN_DIM", Config.RNN_HIDDEN_DIM)
                tf.summary.scalar("RUN_TIME", Config.RUN_TIME)
                tf.summary.scalar("LOGIT_CLIP_SCALAR", Config.LOGIT_CLIP_SCALAR)
                tf.summary.scalar("Droput", tf.cast(Config.DROPOUT, tf.int32))
                tf.summary.scalar("GPU", Config.GPU)
Example #9
File: Train.py Project: bothe/NMT
    def __init__(self, **kwargs):
        dataset_folder = Path(kwargs["dataset_folder"]).resolve()
        check_valid_path(dataset_folder)
        result_folder = kwargs["result_folder"]

        self.initial_epoch = 1
        self.test_mode = kwargs["test"]
        self.epochs = kwargs["epochs"]
        self.use_label_smoothing = kwargs["label_smoothing"]

        self.ckpt_path = kwargs["ckpt_path"]
        self.ckpt_epoch = kwargs["ckpt_epoch"]

        # Create the folders and files the model needs
        self.log_folder, self.ckpt_folder, self.image_folder = create_folder(
            result_folder)
        if not self.test_mode:
            self.training_result_file = self.log_folder / "training_result.txt"
        self.test_result_file = None

        # Save the kwargs values
        msg = ""
        for k, v in list(kwargs.items()):
            msg += "{} = {}\n".format(k, v)
        msg += "new model checkpoint path = {}\n".format(self.ckpt_folder)
        with (self.log_folder / "model_settings.txt").open(
                "w", encoding="utf-8") as fp:
            fp.write(msg)

        # Load the required data
        self.src_word2id, self.src_id2word, self.src_vocab_size = load_word_dic(
            dataset_folder / "src_word2id.pkl")
        self.tar_word2id, self.tar_id2word, self.tar_vocab_size = load_word_dic(
            dataset_folder / "tar_word2id.pkl")

        if not self.test_mode:
            train_src, num_train_src = get_dataset(
                self.src_word2id, dataset_folder / "train_src.txt", False,
                True, True)
            train_tar, num_train_tar = get_dataset(
                self.tar_word2id, dataset_folder / "train_tar.txt", True, True,
                True)
            if num_train_src != num_train_tar:
                raise Exception(
                    "The source dataset ({}) and the target dataset ({}) have different sizes.".format(
                        num_train_src, num_train_tar))

            self.num_train = num_train_src
            self.train_dataset = tf.data.Dataset.from_generator(
                lambda: zip(train_src, train_tar), (tf.int32, tf.int32))
            self.train_dataset = self.train_dataset.cache().shuffle(
                self.num_train + 1).padded_batch(
                    batch_size=kwargs["batch_size"],
                    padded_shapes=(tf.TensorShape([None]),
                                   tf.TensorShape([None])),
                    padding_values=(self.src_word2id["<PAD>"],
                                    self.tar_word2id["<PAD>"])).prefetch(1)

        test_src_path = dataset_folder / "test.txt"
        if test_src_path.exists():
            test_src, self.num_test = get_dataset(self.src_word2id,
                                                  test_src_path, False, True,
                                                  False)

            # self.test_src_max_len = max([len(sentence) for sentence in test_src])
            # padded_test_src = tf.keras.preprocessing.sequence.pad_sequences(
            #    test_src, maxlen = self.test_src_max_len, padding = 'post',
            #    dtype = 'int32', value = self.src_word2id["<PAD>"])

            self.test_dataset = tf.data.Dataset.from_generator(
                lambda: test_src, tf.int32)
            self.test_dataset = self.test_dataset.cache().batch(1).prefetch(1)
            self.test_result_file = self.log_folder / "test_result.txt"

        elif self.test_mode:
            raise FileNotFoundError(
                "The path [ {} ] does not exist.".format(test_src_path))

        self.encoder = Encoder(self.src_vocab_size, kwargs["embedding_size"],
                               kwargs["hidden_size"], kwargs["dropout_rate"],
                               kwargs["gru"], kwargs["bi"])
        self.decoder = Decoder(self.tar_vocab_size, kwargs["embedding_size"],
                               kwargs["hidden_size"], kwargs["attention_size"],
                               kwargs["dropout_rate"], kwargs["gru"],
                               kwargs["bi"])

        # The six lines below prevent Korean text from rendering incorrectly in Colab and may be omitted.
        # %config InlineBackend.figure_format = 'retina'
        # !apt -qq -y install fonts-nanum
        fontpath = '/usr/share/fonts/truetype/nanum/NanumBarunGothic.ttf'
        font = fm.FontProperties(fname=fontpath, size=9)
        plt.rc('font', family='NanumBarunGothic')
        mpl.font_manager._rebuild()
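load_word_dic is imported from the project's utilities; its return signature is fixed by the calls above. A minimal sketch, assuming the pickle holds a word-to-id dict:

import pickle

def load_word_dic(path):
    # Assumed helper: load word2id, derive id2word and the vocab size.
    with open(path, 'rb') as fp:
        word2id = pickle.load(fp)
    id2word = {i: w for w, i in word2id.items()}
    return word2id, id2word, len(word2id)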
Example #10
class Trainer:
    def __init__(self, dataset, args):
        self.device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
        self.dataset = dataset
        self.encoder = ConvEncoder(self.dataset.num_ent(), self.dataset.num_rel(), args.emb_dim, self.device)
        self.decoder = Decoder(self.dataset.num_ent(), self.dataset.num_rel() , self.device)
        self.discriminator = ComplEx(self.dataset.num_ent(), self.dataset.num_rel(), args.emb_dim, self.device)
        
        self.args = args
        self.adversarial_loss = nn.BCEWithLogitsLoss()
        self.reconstruction_loss = nn.BCELoss()


        
    def train(self):
        self.encoder.train()
        self.decoder.train()
        self.discriminator.train()
        print('entity', self.dataset.num_ent(), 'relation', self.dataset.num_rel())
        print('ConvEncoder')
        print('train_simple1')
        print('epoch', self.args.ne)
        print('D_lr',self.args.D_lr)
        print('G_lr',self.args.G_lr)
        print('emb_dim',self.args.emb_dim)
        print('batch_size',self.args.batch_size)
        print('discriminator_range',self.args.discriminator_range)

        entity_onehot = []
        relation_onehot = []

        for i in range(self.dataset.num_ent()):
            onehot = [0 for x in range(self.dataset.num_ent())]
            onehot[i] = 1
            entity_onehot.append(onehot)

        for i in range(self.dataset.num_rel()):
            onehot = [0 for x in range(self.dataset.num_rel())]
            onehot[i] = 1
            relation_onehot.append(onehot)


#********************************Adagrad*********************************************************************
        optimizer_D = torch.optim.Adagrad(
            self.discriminator.parameters(),
            lr = self.args.D_lr,
            weight_decay= 0,
            initial_accumulator_value= 0.1)

        optimizer_Encoder = torch.optim.Adagrad(
            self.encoder.parameters(),
            lr = self.args.G_lr,
            weight_decay= 0,
            initial_accumulator_value= 0.1)
        optimizer_Decoder = torch.optim.Adagrad(
            self.decoder.parameters(),
            lr = self.args.G_lr,
            weight_decay= 0,
            initial_accumulator_value= 0.1)

        for epoch in range(1, self.args.ne+1):

            # start_time = time.time()

            last_batch = False
            total_d_loss = 0.0
            total_g_loss = 0.0
            while not last_batch:
                
                pos_batch = self.dataset.next_pos_batch(self.args.batch_size)
                last_batch = self.dataset.was_last_batch()

                h_onehot = []
                r_onehot = []
                t_onehot = []
                for i in pos_batch[:,0]:
                    one_hot = entity_onehot[i]
                    h_onehot.append(one_hot)
                for i in pos_batch[:,2]:
                    one_hot = entity_onehot[i]
                    t_onehot.append(one_hot)
                for i in pos_batch[:,1]:
                    one_hot = relation_onehot[i]
                    r_onehot.append(one_hot)

                h = torch.tensor(h_onehot).float().to(self.device)
                r = torch.tensor(r_onehot).float().to(self.device)
                t = torch.tensor(t_onehot).float().to(self.device)


                # -----------------
                #  Train Generator
                # ----------------
                optimizer_Encoder.zero_grad()
                optimizer_Decoder.zero_grad()

                encoder_batch = np.copy(pos_batch)
                for i in range(self.args.batch_size):
                    if np.random.random()<0.5:
                        encoder_batch[i][0] = pos_batch[i][0]
                        encoder_batch[i][1] = 0
                        encoder_batch[i][2] = pos_batch[i][1]
                    else:
                        encoder_batch[i][0] = pos_batch[i][2]
                        encoder_batch[i][1] = 1
                        encoder_batch[i][2] = pos_batch[i][1]

                encoder_h_onehot = []
                encoder_r_onehot = []
                encoder_position = []

                for i in encoder_batch[:,0]:
                    one_hot = entity_onehot[i]
                    encoder_h_onehot.append(one_hot)
                for i in encoder_batch[:,1]:
                    encoder_position.append([i])
                for i in encoder_batch[:,2]:
                    one_hot = relation_onehot[i]
                    encoder_r_onehot.append(one_hot)


                encoder_h = torch.tensor(encoder_h_onehot).float().to(self.device)
                encoder_p = torch.tensor(encoder_position).float().to(self.device)
                encoder_r = torch.tensor(encoder_r_onehot).float().to(self.device)


                fake_tails = self.encoder(encoder_h, encoder_p, encoder_r)
                construction_heads, construction_positions, construction_rels = self.decoder(fake_tails)

                g_loss = self.reconstruction_loss(construction_heads, encoder_h) + self.reconstruction_loss(construction_rels, encoder_r) + self.reconstruction_loss(construction_positions, encoder_p)

                g_loss.backward()
                total_g_loss += g_loss.cpu().item()
                optimizer_Encoder.step() 
                optimizer_Decoder.step()

                neg_batch = np.repeat(np.copy(pos_batch), self.args.neg_ratio, axis=0)
                for _ in range(self.args.discriminator_range):
                    neg_entity = []
                    for i in range(len(neg_batch)):
                        if np.random.random() < 0.5:
                            temp = []
                            temp_h = pos_batch[i][0]
                            temp_p = [0]
                            temp_r = pos_batch[i][1]
                            
                            temp.append(temp_h)
                            temp.append(temp_p)
                            temp.append(temp_r)
                            neg_entity.append(temp)

                        else:
                            temp = []
                            temp_h = pos_batch[i][2]
                            temp_p = [1]
                            temp_r = pos_batch[i][1]
                            
                            temp.append(temp_h)
                            temp.append(temp_p)
                            temp.append(temp_r)
                            neg_entity.append(temp)
                    
                    temp_h_one_hot = []
                    temp_r_one_hot = []
                    temp_p = []

                    for ele in neg_entity:
                        temp_h_one_hot.append(entity_onehot[ele[0]])
                        temp_r_one_hot.append(relation_onehot[ele[2]])
                        temp_p.append(ele[1])

                    temp_h_one_hot = torch.tensor(temp_h_one_hot).float().to(self.device)
                    temp_r_one_hot = torch.tensor(temp_r_one_hot).float().to(self.device)
                    temp_p = torch.tensor(temp_p).float().to(self.device)
                    neg_tails_index = np.argmax(self.encoder(temp_h_one_hot, temp_p, temp_r_one_hot).cpu().data.numpy(), axis=1)

                    for i in range(len(neg_batch)):
                        if neg_entity[i][1] == [0]:
                            neg_batch[i][2] = neg_tails_index[i]
                        elif neg_entity[i][1] == [1]:
                            neg_batch[i][0] = neg_tails_index[i]
                        else:
                            print('GG')  # unexpected position flag; should never happen
                    neg_batch[:,-1] = -1

                    batch = np.append(pos_batch, neg_batch, axis=0)
                    np.random.shuffle(batch)

                    full_h_onehot = []
                    full_r_onehot = []
                    full_t_onehot = []

                    for i in batch[:,0]:
                        one_hot = entity_onehot[i]
                        full_h_onehot.append(one_hot)

                    for i in batch[:,2]:
                        one_hot = entity_onehot[i]
                        full_t_onehot.append(one_hot)

                    for i in batch[:,1]:
                        one_hot = relation_onehot[i]
                        full_r_onehot.append(one_hot)

                    full_h = torch.tensor(full_h_onehot).float().to(self.device)
                    full_r = torch.tensor(full_r_onehot).float().to(self.device)
                    full_t = torch.tensor(full_t_onehot).float().to(self.device)
                    labels = torch.tensor(batch[:,3]).float().to(self.device)

                    optimizer_D.zero_grad()
                    scores = self.discriminator(full_h, full_r, full_t)
                    d_loss = torch.sum(F.softplus(-labels * scores)) + (self.args.reg_lambda * self.discriminator.l2_loss() / self.dataset.num_batch(self.args.batch_size))
                    d_loss.backward()
                    optimizer_D.step()

                    for p in self.discriminator.parameters():
                        p.data.clamp_(-1, 1)

                    total_d_loss += d_loss.cpu().item()

                # =================== generator training =======================
                optimizer_Encoder.zero_grad()
                fake_tails = self.encoder(encoder_h, encoder_p, encoder_r)
                generator_score = self.discriminator(encoder_h, encoder_r, fake_tails)

                G_loss = -0.2 * torch.mean(torch.log(generator_score + 1e-6))
                
                G_loss.backward()
                optimizer_Encoder.step()


            # finish_time = time.time()

            # with open("train_time_log.log",'a') as f:
            #     f.write(str(epoch)+"    "+str(start_time)+"    "+str(finish_time)+"\n")
                    
            print("Loss in iteration " + str(epoch) + ": " + str(total_d_loss) + "(" + self.dataset.name + ")")
            print("Loss in iteration " + str(epoch) + ": " + str(total_g_loss) + "(" + self.dataset.name + ")")
        
            if epoch % self.args.save_each == 0:
                self.save_model(epoch)

            if epoch % 25 == 0:
                print('epoch', epoch, scores)
                print('neg_batch', neg_batch[:,2])


    def save_model(self, chkpnt):
        print("Saving the model")
        directory = "models/" + self.dataset.name + "/" + 'complex' + "/" 
        if not os.path.exists(directory):
            os.makedirs(directory)
        torch.save(self.discriminator, directory + str(chkpnt) + ".chkpnt")
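Since save_model serializes the whole discriminator module, restoring it is a single torch.load (sketch; the path layout follows save_model above, and the ComplEx class must be importable when unpickling):

import torch

discriminator = torch.load('models/<dataset>/complex/<epoch>.chkpnt')
discriminator.eval()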
Example #11
def main(args, encoder=None, decoder=None):
    #Model dir
    if not os.path.exists(args.model_path):
        os.makedirs(args.model_path)

    print("[INFO]Loading data")
    print("[INFO] Reading file:{}".format(args.vocab_path))
    # Get dataloader (batch_size, shuffle, num_workers)
    dataloader = get_loader(
        args.vocab_path,  #'captions2.json',
        vocab=None,
        max_len=18,
        batch_size=args.batch_size,
        shuffle=False,
        num_workers=args.num_workers)

    print("[INFO]Creating models")
    #Models
    if encoder is None and decoder is None:
        encoder = EncoderCNN()
        decoder = Decoder(decoder_size=18)
    else:
        encoder = encoder.train()
        decoder = decoder.train()

    #Loss and optimiser
    loss_func = nn.CrossEntropyLoss()
    #params = list(decoder.parameters()) + list(encoder.linear.parameters()) + list(encoder.bn.parameters())
    params = list(decoder.parameters()) + list(encoder.parameters())
    optimiser = torch.optim.Adam(params, lr=args.learning_rate)

    print("[INFO] Starting training loop")
    #Train the models
    start = time.time()
    savedmodel = False
    total_step = len(dataloader)
    for epoch in range(args.num_epochs):
        print("Epoch:{}/{}".format(epoch, args.num_epochs))
        prev_loss = 0
        for i, (images, captions, lengths) in enumerate(dataloader):

            #Feed forward, backwards and optimise
            features = encoder(images)
            features = features.long()
            outputs = decoder(features, captions, tgt_mask=None)
            loss = loss_func(outputs, captions)

            decoder.zero_grad()
            encoder.zero_grad()
            loss.backward()
            optimiser.step()

            if loss == 0.000:  # stop early once five batches have hit zero loss
                prev_loss = prev_loss + 1
                if prev_loss == 5:
                    print("Epoch: {}/{}--i:{}".format(epoch, args.num_epochs,
                                                      i))
                    print("Loss: {:.4f}".format(loss.item()))
                    save_models(decoder, encoder)
                    exit()

            if i % args.log_step == 0:
                print("Epoch: {}/{}--i:{}".format(epoch, args.num_epochs, i))
                print("Loss: {:.4f}".format(loss.item()))
                print("[INFO] Time elapsed: {}".format(time.time() - start))

            #used `savedmodel` to save model once during epoch
            if (i + 1) % args.save_step == 0 and savedmodel is False:
                torch.save(
                    decoder.state_dict(),
                    os.path.join(args.model_path,
                                 'decoder-{}-{}.ckpt'.format(epoch + 1,
                                                             i + 1)))
                torch.save(
                    encoder.state_dict(),
                    os.path.join(args.model_path,
                                 'encoder-{}-{}.ckpt'.format(epoch + 1,
                                                             i + 1)))

    print("[INFO] Time elapsed: {}".format(time.time() - start))
    print("[INFO] Exiting")
Example #12
    # Assumed parser setup (not shown in the original excerpt);
    # encoder_path and decoder_path are read below.
    parser = argparse.ArgumentParser()
    parser.add_argument('--encoder_path',
                        type=str,
                        default=None,
                        help='path to a trained encoder checkpoint')
    parser.add_argument('--decoder_path',
                        type=str,
                        default=None,
                        help='path to a trained decoder checkpoint')
    parser.add_argument('--num_epochs',
                        type=int,
                        default=10,
                        help='Total epochs')
    parser.add_argument('--batch_size',
                        type=int,
                        default=1,
                        help='dataloader batch size')
    parser.add_argument('--num_workers',
                        type=int,
                        default=0,
                        help='num of workers for model dataloader')
    parser.add_argument('--learning_rate',
                        type=float,
                        default=0.001,
                        help='learning rate for models')
    args = parser.parse_args()

    if args.encoder_path is None and args.decoder_path is None:
        main(args)
    else:
        print("[INFO] Creating and Loading models")
        #Models
        encoder = EncoderCNN()
        decoder = Decoder(decoder_size=18)

        #Load trained models
        encoder.load_state_dict(torch.load(args.encoder_path))
        decoder.load_state_dict(torch.load(args.decoder_path))
        main(args, encoder, decoder)
Example #13
    Ps = args2Ps(args)
    #-------------------------------------------------------------------------
    # Vocabulary
    #-------------------------------------------------------------------------
    vocab = Vocabulary()
    vocab.make(dataset="flickr8k", min_word_freq=5)

    #-------------------------------------------------------------------------
    # models
    #-------------------------------------------------------------------------
    encoder = Encoder()
    encoder.fine_tune(Ps["fine_tune_encoder"])
    decoder = Decoder(attention_dim   = Ps["attention_dim"],
                      embed_dim       = Ps["embed_dim"],
                      decoder_dim     = Ps["decoder_dim"],
                      encoder_dim     = encoder.encoder_dim,
                      vocab_size      = len(vocab),
                      device          = Ps["device"],
                      dropout         = Ps["dropout"] )
    encoder = encoder.to(Ps["device"])
    decoder = decoder.to(Ps["device"])
    # whether to load a saved state_dict from checkpoint file
    if Ps["parent"] is not None:
        pass

    #-------------------------------------------------------------------------
    # optimizer and scheduler
    #-------------------------------------------------------------------------
    optimizer = get_optimizer(Ps, encoder, decoder)
    scheduler = Scheduler(optimizer, [None,None])
    #-------------------------------------------------------------------------
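get_optimizer and args2Ps come from elsewhere in the project. A minimal sketch of a get_optimizer consistent with the call above (assumed: Adam with separate parameter groups, the encoder included only when fine-tuning; the learning-rate keys are hypothetical):

import torch.optim as optim

def get_optimizer(Ps, encoder, decoder):
    groups = [{'params': decoder.parameters(), 'lr': Ps.get('decoder_lr', 4e-4)}]
    if Ps['fine_tune_encoder']:
        # Only optimize encoder weights that fine_tune() left unfrozen.
        groups.append({'params': [p for p in encoder.parameters() if p.requires_grad],
                       'lr': Ps.get('encoder_lr', 1e-4)})
    return optim.Adam(groups)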
Example #14
def train(train_dataset,
          validation_dataset=None,
          iterations=150,
          hidden_size=64,
          batch_size=16):
    print("Training...")
    train = DataLoader(train_dataset,
                       batch_size=batch_size,
                       shuffle=True,
                       collate_fn=collate)
    validation = DataLoader(validation_dataset,
                            batch_size=batch_size,
                            shuffle=True,
                            collate_fn=collate)

    encoder = Encoder(1, hidden_size).to(device)
    decoder = Decoder(hidden_size, 1).to(device)

    encoder_optimizer = optim.Adam(encoder.parameters())
    decoder_optimizer = optim.Adam(decoder.parameters())
    criterion = nn.MSELoss()

    train_losses = []
    validation_losses = []

    for iter in range(iterations):
        encoder.train()
        decoder.train()

        loss_acc = 0
        for input_tensor, target_tensor, _, max_len, lens in train:
            _, encoder_hidden = encoder(input_tensor, None)
            decoder_hidden = encoder_hidden

            cur_batch_size = len(target_tensor)  # the last batch may be smaller
            decoder_input = target_tensor[:, 0].view(cur_batch_size, 1, 1)
            outputs = torch.zeros(cur_batch_size, max_len)

            for di in range(1, max_len):
                decoder_output, decoder_hidden = decoder(
                    decoder_input, decoder_hidden)
                outputs[:, di] = decoder_output.view(cur_batch_size)
                decoder_input = decoder_output.detach()

            for i in range(len(lens)):
                outputs[i, lens[i]:] = 0
            """ if iter == iterations-1:
                print(target_tensor[:,1:].squeeze())
                print(outputs[:,1:].squeeze())
                print() """

            encoder_optimizer.zero_grad()
            decoder_optimizer.zero_grad()

            batch_loss = criterion(outputs[:, 1:].squeeze(),
                                   target_tensor[:, 1:].squeeze())
            batch_loss.backward(retain_graph=True)
            loss_acc += batch_loss.item()

            encoder_optimizer.step()
            decoder_optimizer.step()

        train_losses.append(loss_acc)

        encoder.eval()
        decoder.eval()
        with torch.no_grad():
            val_loss_acc = 0
            for input_tensor, target_tensor, _, max_len, lens in validation:
                val_batch_size = len(target_tensor)

                _, encoder_hidden = encoder(input_tensor, None)
                decoder_hidden = encoder_hidden

                decoder_input = target_tensor[:, 0].view(val_batch_size, 1, 1)
                outputs = torch.zeros(val_batch_size, max_len)

                for di in range(1, max_len):
                    decoder_output, decoder_hidden = decoder(
                        decoder_input, decoder_hidden)
                    outputs[:, di] = decoder_output.view(val_batch_size)
                    decoder_input = decoder_output

                for i in range(len(lens)):
                    outputs[i, lens[i]:] = 0

                val_loss = criterion(outputs[:, 1:].squeeze(),
                                     target_tensor[:, 1:].squeeze())
                val_loss_acc += val_loss.item()

            validation_losses.append(val_loss_acc)

        if iter % 1 == 0:  # log every iteration; raise the modulus to log less often
            print("Iteration:", iter, " Train loss: ",
                  "{0:.5f}".format(loss_acc / len(train)),
                  " Validation loss: ",
                  "{0:.5f}".format(validation_losses[-1]))

    showPlot(train_losses, validation_losses)
    torch.save(encoder, "models/encoder.pt")
    torch.save(decoder, "models/decoder.pt")
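The collate function handed to both DataLoaders is not shown; its output contract is clear from the unpacking above. A minimal sketch, assuming each dataset item is a pair of equal-length 1-D float sequences:

import torch

def collate(batch):
    # Pad (input, target) pairs to the batch max length and return
    # (input_tensor, target_tensor, _, max_len, lens) as unpacked above.
    inputs, targets = zip(*batch)
    lens = [len(t) for t in targets]
    max_len = max(lens)
    padded_in = torch.zeros(len(batch), max_len, 1)
    padded_tgt = torch.zeros(len(batch), max_len, 1)
    for i, (x, y) in enumerate(zip(inputs, targets)):
        padded_in[i, :len(x), 0] = torch.as_tensor(x)
        padded_tgt[i, :len(y), 0] = torch.as_tensor(y)
    return padded_in, padded_tgt, None, max_len, lens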
Example #15
def make_std_mask(tgt, pad):
    "Create a mask to hide padding and future words."
    tgt_mask = (tgt != pad).unsqueeze(-2)
    tgt_mask = tgt_mask & Variable(
        Decoder.subsequent_mask(tgt.size(-1)).type_as(tgt_mask.data))
    return tgt_mask
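Decoder.subsequent_mask is referenced but not shown here. The standard implementation (as in the Annotated Transformer, which this helper follows) builds a causal mask; a sketch of the method, shown standalone:

import numpy as np
import torch

def subsequent_mask(size):
    # True where position i may attend to position j <= i.
    attn_shape = (1, size, size)
    mask = np.triu(np.ones(attn_shape), k=1).astype('uint8')
    return torch.from_numpy(mask) == 0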