def get_model():
    embedin = Embedin(vocab_size, embed_size)
    embedout = Embedout(vocab_size, hidden_size)
    enc = Encoder(embedin, embed_size, hidden_size, n_layers)
    dec5 = Decoder(embedin, embed_size, hidden_size, n_layers)
    # enc7 = Encoder(embedin, embed_size, hidden_size, n_layers)
    dec7 = Decoder(embedin, embed_size, hidden_size, n_layers)
    atten = Attention(hidden_size)
    # atten7 = Attention(hidden_size)
    ae5 = Autoencoder(enc, dec5, atten, embedout, 13)
    ae7 = Autoencoder(enc, dec7, atten, embedout, 17)
    discriminator = Discriminator(hidden_size)
    discriminator2 = Discriminator2(vocab_size, embed_size, hidden_size)
    seq2seq57 = Autoencoder(enc, dec7, atten, embedout, 17)
    seq2seq75 = Autoencoder(enc, dec5, atten, embedout, 13)
    lm5 = Lstm(vocab_size, embed_size, hidden_size, n_layers, drop_out=0)
    lm7 = Lstm(vocab_size, embed_size, hidden_size, n_layers, drop_out=0)
    lm5.load_state_dict(torch.load('models/lm5_lstm_dropout.th'))
    lm7.load_state_dict(torch.load('models/lm7_lstm_dropout.th'))
    ae5 = ae5.cuda()
    ae7 = ae7.cuda()
    discriminator = discriminator.cuda()
    discriminator2 = discriminator2.cuda()
    seq2seq57 = Autoencoder(enc, dec7, atten, embedout, 17)
    seq2seq75 = Autoencoder(enc, dec5, atten, embedout, 13)
    lm5 = lm5.cuda()
    lm7 = lm7.cuda()
    return ae5, ae7, discriminator, discriminator2, seq2seq57, seq2seq75, lm5, lm7
def __init__(self, inp_dim, out_dim, emb_dim, enc_hid, dec_hid, enc_drop, dec_drop,
             epoch, clip, sparse_max, tf, max_length, vocab, batch, device):
    self.inp_dim = inp_dim
    self.out_dim = out_dim
    self.emb_dim = emb_dim
    self.enc_hid = enc_hid
    self.dec_hid = dec_hid
    self.enc_drop = enc_drop
    self.dec_drop = dec_drop
    self.tf = tf
    self.max_length = max_length
    self.batch = batch
    self.device = device
    self.vocab = vocab
    self.attn = Attention(enc_hid, dec_hid, sparse_max=sparse_max)
    self.enc = Encoder(inp_dim, emb_dim, enc_hid, dec_hid, enc_drop)
    self.dec = Decoder(out_dim, emb_dim, enc_hid, dec_hid, dec_drop, self.attn)
    self.model = Seq2Seq(self.enc, self.dec, device).to(device)
    self.model.apply(self.init_weights)
    self.count_parameters()
    self.optimizer = optim.Adam(self.model.parameters())
    if sparse_max:
        self.criterion = SparsemaxLoss(ignore_index=0)
    else:
        self.criterion = nn.CrossEntropyLoss(ignore_index=0)  # pad_idx 0
    self.epoch = epoch
    self.clip = clip
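# The constructor above calls self.init_weights and self.count_parameters, which are not
# shown here. A minimal sketch of what such helpers typically look like is given below;
# these are assumed implementations (uniform init range and a simple parameter count),
# not necessarily the author's exact code.
def init_weights(self, m):
    # Initialize every parameter from a small uniform range, a common seq2seq default.
    for name, param in m.named_parameters():
        nn.init.uniform_(param.data, -0.08, 0.08)

def count_parameters(self):
    # Report the number of trainable parameters in the wrapped Seq2Seq model.
    total = sum(p.numel() for p in self.model.parameters() if p.requires_grad)
    print('The model has {:,} trainable parameters'.format(total))
    return total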
train_iterator, valid_iterator, test_iterator, SRC, TRG = Return_Data_Loaders()
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

INPUT_DIM = len(SRC.vocab)
OUTPUT_DIM = len(TRG.vocab)
ENC_EMB_DIM = 256
DEC_EMB_DIM = 256
HID_DIM = 512
N_LAYERS = 2
ENC_DROPOUT = 0.5
DEC_DROPOUT = 0.5

enc = Encoder(INPUT_DIM, ENC_EMB_DIM, HID_DIM, N_LAYERS, ENC_DROPOUT)
dec = Decoder(OUTPUT_DIM, DEC_EMB_DIM, HID_DIM, N_LAYERS, DEC_DROPOUT)
model = Seq2Seq(enc, dec, device).to(device)

##########################################################################
model.apply(init_weights)
optimizer = optim.Adam(model.parameters())
TRG_PAD_IDX = TRG.vocab.stoi[TRG.pad_token]
criterion = nn.CrossEntropyLoss(ignore_index=TRG_PAD_IDX)
##########################################################################
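# The block above builds the model, optimizer, and padding-aware loss but stops short of
# the training loop. A minimal sketch of how these pieces are usually wired together follows;
# N_EPOCHS and CLIP are illustrative assumptions, batch.src/batch.trg follow the usual
# torchtext convention, and model(src, trg) is assumed to return [trg_len, batch, output_dim].
N_EPOCHS = 10
CLIP = 1

for epoch in range(N_EPOCHS):
    model.train()
    epoch_loss = 0
    for batch in train_iterator:
        src, trg = batch.src, batch.trg
        optimizer.zero_grad()
        output = model(src, trg)                         # [trg_len, batch, output_dim]
        output = output[1:].view(-1, output.shape[-1])   # drop the <sos> step
        trg = trg[1:].view(-1)
        loss = criterion(output, trg)
        loss.backward()
        torch.nn.utils.clip_grad_norm_(model.parameters(), CLIP)
        optimizer.step()
        epoch_loss += loss.item()
    print('Epoch {}: train loss {:.3f}'.format(epoch + 1, epoch_loss / len(train_iterator)))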
def main(args):
    dataset = args.dataset
    emb_output_dir = args.output
    epochs = args.epochs
    agg = args.agg
    p = args.p
    tr = args.tr
    lam = args.lam
    lose_func = args.loss

    # Preprocess dataset
    adj, views_features = load_data(dataset, num_views=3)
    adj_orig = adj
    adj_orig = adj_orig - sp.dia_matrix((adj_orig.diagonal()[np.newaxis, :], [0]), shape=adj_orig.shape)
    adj_orig.eliminate_zeros()

    # Calculate pairwise similarity.
    views_sim_matrix = {}
    views_feature_matrix = {}
    for view in list(views_features.keys()):
        feature_matrix = csc_matrix.todense(views_features[view])
        views_feature_matrix.update({view: feature_matrix})
    kernal = "rbf"
    if lose_func == 'all':
        attr_sim = cal_attr_sim(views_feature_matrix, dataset)
    else:
        attr_sim = 0

    # Split nodes into train, valid and test sets,
    # and remove test edges from the train adjacency matrix.
    adj_train, train_edges, val_edges, val_edges_false, test_edges, test_edges_false = mask_test_edges(dataset, adj)
    print("Masking edges Done!")
    adj = adj_train
    nx_G = nx.from_numpy_array(adj.toarray())
    num_nodes = adj.shape[0]
    adj_norm = preprocess_graph(adj)

    views_features_num = {}
    views_features_nonzero = {}
    for view in list(views_features.keys()):
        views_features[view] = sparse_to_tuple(views_features[view].tocoo())
        views_features_num.update({view: views_features[view][2][1]})
        views_features_nonzero.update({view: views_features[view][1].shape[0]})

    # Build model
    MagCAE = {}
    for view in list(views_features.keys()):
        x, y = views_features[view][2][0], views_features[view][2][1]
        model = GAE(y, views_features_nonzero[view], adj_norm, math.ceil(2*p*y), math.ceil(p*y))
        MagCAE.update({view: model})

    # Loss function and optimizer.
    # Loss weight taken by each node relative to the total loss.
    pos_weight = float(adj.shape[0] * adj.shape[0] - adj.sum()) / adj.sum()
    norm = adj.shape[0] * adj.shape[0] / float(adj.shape[0] * adj.shape[0] - adj.sum()) * 2
    optimizer = tf.keras.optimizers.Adam()

    adj_targ = adj_train + sp.eye(adj_train.shape[0])
    adj_targ = sparse_to_tuple(adj_targ)
    indices = np.array(adj_targ[0])
    values = np.array(adj_targ[1])
    dense_shape = np.array(adj_targ[2])
    sparse_targ = tf.SparseTensor(indices=indices, values=values, dense_shape=dense_shape)
    sparse_targ = tf.cast(sparse_targ, dtype=tf.float32)
    adj_targ = tf.sparse.to_dense(sparse_targ)
    adj_targ = tf.reshape(adj_targ, [-1])

    # Train and evaluate the model.
    # Training loop, each epoch: views -> view embedding -> aggregated embedding -> total loss -> update gradients
    decoder = Decoder(100)
    for epoch in range(epochs):
        loss = 0
        start = time.time()
        with tf.GradientTape() as tape:
            ag_embedding = {}
            for VAE in list(MagCAE.keys()):
                v_embedding, a_hat = MagCAE[VAE](views_features[VAE])
                ag_embedding.update({VAE: v_embedding})

            # aggregate embeddings
            embedding, aggregator = aggregate_embeddings(ag_embedding, agg)

            # reconstruct a_hat
            a_hat = decoder(embedding)
            loss += loss_function(a_hat, adj_targ, pos_weight, norm, attr_sim, embedding, num_nodes, lam, lose_func)

        if agg == "weighted_concat":
            variables = (MagCAE['view1'].trainable_variables + MagCAE['view2'].trainable_variables
                         + MagCAE['view3'].trainable_variables + aggregator.trainable_variables)
        gradients = tape.gradient(loss, variables)
        optimizer.apply_gradients(zip(gradients, variables))

        # Evaluate on the validation set
        embedding = np.array(embedding)
        roc_cur, ap_cur, _, _ = evaluate(val_edges, val_edges_false, adj_orig, embedding)
        print("Epoch {}: Val_Roc {:.4f}, Val_AP {:.4f}, Time Consumed {:.2f} sec\n".format(
            epoch+1, roc_cur, ap_cur, time.time()-start))

    print("Training Finished!")

    # Evaluation result on test edges
    test_embedding = {}
    for VAE in list(MagCAE.keys()):
        v_embedding, a_hat = MagCAE[VAE](views_features[VAE])
        test_embedding.update({VAE: v_embedding})

    # aggregate embeddings
    embedding, aggregator = aggregate_embeddings(test_embedding, agg)
    embedding = np.array(embedding)  # embedding is a tensor; convert it to an np array.
    test_roc, test_ap, fpr, tpr = evaluate(test_edges, test_edges_false, adj_orig, embedding)
    print("MagCAE test result on {}".format(dataset))
    print("Test Roc: {}, Test AP: {}, P: {}, Training Ratio: {}, Lambda: {}.".format(
        test_roc, test_ap, p, tr, lam))
import pickle

import numpy as np
import torch
import torch.nn as nn
import torch.optim as optim
from torch.optim.lr_scheduler import StepLR
from torch.utils.data import DataLoader

from my_dataloader import *
from create_vocabulary import *
from Model import Encoder, Decoder, Seq2Seq

device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")

# encoder = Encoder(input_dim=2999, name='emb_inspec.npy')
# decoder = Decoder(output_dim=2999, name='emb_inspec.npy')
encoder = Encoder()
decoder = Decoder()
model = Seq2Seq(encoder, decoder, device).to(device)
# model.load_state_dict(torch.load('train.pt'))


def init_weights(m):
    for name, param in m.named_parameters():
        nn.init.normal_(param.data, mean=0, std=0.01)


batch = 64
tot_epoch = 100
vocab = np.load('vocab_kp20k2.npy', allow_pickle=True).item()
# vocab = np.load('vocab_inspec.npy', allow_pickle=True).item()
TRG_PAD_IDX = vocab('<pad>')
criterion = nn.CrossEntropyLoss(ignore_index=TRG_PAD_IDX)
def q4(training_images, epochs=1000, latent_dim=10, batch_size=500):
    """
    The main function of question 4. Trains a GLO model to map vectors from the unit sphere
    in latent_dim dimensions to MNIST digits.
    :param training_images: A training set of MNIST digits whose distribution we attempt to learn.
    :param epochs: Number of training epochs.
    :param latent_dim: Dimension of the latent vectors in which we wish to learn the distribution.
    :param batch_size: Size of a single batch in training.
    """
    model = Decoder()

    # Create noise vectors in latent_dim dimensions and project them onto the unit sphere.
    noise_data = np.random.normal(size=(training_images.shape[0], latent_dim)).astype('float32')
    noise_data = noise_data / (np.linalg.norm(noise_data, axis=1)[..., np.newaxis])

    # Create training batches.
    training_batches = tf.data.Dataset.from_tensor_slices(
        (training_images, noise_data)).batch(batch_size)

    # Define optimizers and a loss metric.
    z_optimizer = tf.keras.optimizers.Adam(1e-2)
    glo_optimizer = tf.keras.optimizers.Adam(1e-3)
    training_loss_metric = tf.keras.metrics.Mean()
    training_loss = list()
    shown = False

    # Training loop.
    for epoch in range(1, epochs + 1):
        for original_images_batch, noise_batch in training_batches:
            # Represent the noise batch as a TensorFlow variable so we can take the
            # derivative of the loss w.r.t. the batch.
            noise_vector_batch = tf.Variable(noise_batch)
            with tf.GradientTape() as z_tape, tf.GradientTape() as glo_tape:
                generated_images_batch = model(noise_vector_batch)
                z_loss = tf.square(tf.norm(original_images_batch - generated_images_batch, axis=1))
                glo_loss = tf.reduce_mean(z_loss)
            z_gradients = z_tape.gradient(z_loss, [noise_vector_batch])
            z_optimizer.apply_gradients(zip(z_gradients, [noise_vector_batch]))
            glo_gradients = glo_tape.gradient(glo_loss, model.trainable_variables)
            glo_optimizer.apply_gradients(zip(glo_gradients, model.trainable_variables))
            training_loss_metric(glo_loss)

            # After applying the gradients, update the original batch that we want to train,
            # and project the resulting vectors back onto the unit sphere.
            noise_batch = noise_vector_batch / (tf.norm(noise_vector_batch, axis=1)[..., tf.newaxis])

            # Visualization of the progress uses training data, and is thus placed in the loop.
            if not shown:
                if (epoch in [1, epochs]) or (epochs < 5) or not (epoch % (epochs // 5)):
                    fig = plt.figure()
                    plt.title('GLO Training Progress Visualization \nepoch {}'.format(epoch))
                    plt.xticks([])
                    plt.yticks([])
                    noise = noise_batch[:3, :]
                    images = model(noise)
                    for i in range(1, 4):
                        fig.add_subplot(1, 3, i)
                        plt.xticks([])
                        plt.yticks([])
                        plt.imshow(images[i - 1, :, :, 0], cmap='gray')
                    shown = True

        training_loss.append(training_loss_metric.result())
        shown = False
        print("epoch {}: training loss - {} ".format(epoch, training_loss[-1]))

    plot_losses(training_loss, [])
    for original_images_batch, noise_batch in training_batches:
        display_generating_examples(model, noise_batch[:5, :])
        break
def _create_graph(self, DECODER_TYPE):
    self.raw_state = tf.placeholder(tf.float32, shape=[None, Config.NUM_OF_CUSTOMERS+1, 2], name='State')
    self.current_location = self.raw_state[:, -1]
    self.sampled_cost = tf.placeholder(tf.float32, [None, 1], name='Sampled_Cost')
    if Config.SEQUENCE_COST == 1:
        self.sampled_cost = tf.placeholder(tf.float32, [None, Config.NUM_OF_CUSTOMERS], name='Sampled_Cost')
    self.batch_size = tf.shape(self.raw_state)[0]
    self.keep_prob = tf.placeholder(tf.float32)
    self.global_step = tf.Variable(0, trainable=False, name='step')
    self.input_lengths = tf.convert_to_tensor([Config.NUM_OF_CUSTOMERS]*(self.batch_size))
    self.or_route = tf.placeholder(tf.int32, shape=[None, Config.NUM_OF_CUSTOMERS+1])
    self.or_cost = tf.placeholder(tf.float32, shape=[None, 1])
    self.difference_in_length = tf.reduce_mean(self.sampled_cost - self.or_cost)
    self.relative_length = tf.reduce_mean(self.sampled_cost/self.or_cost)
    if Config.SEQUENCE_COST == 1:
        self.relative_length = tf.reduce_mean(self.sampled_cost[:, 0]/self.or_cost)
    self.start_tokens = tf.placeholder(tf.int32, shape=[None])
    self.end_token = -1
    self.MA_baseline = tf.Variable(0.0, dtype=tf.float32, trainable=False)
    if Config.SEQUENCE_COST == 1:
        self.MA_baseline = tf.Variable(tf.tile([0.0], [Config.NUM_OF_CUSTOMERS]), dtype=tf.float32, trainable=False)
        self.assign_init_MA = tf.assign(self.MA_baseline, tf.reduce_mean(self.sampled_cost, axis=0))
    else:
        self.assign_init_MA = tf.assign(self.MA_baseline, tf.reduce_mean(self.sampled_cost))
    if Config.STATE_EMBED == 1:
        self.with_depot_state = self.raw_state
        for i in range(0):
            self.with_depot_state = tf.layers.conv1d(self.with_depot_state, Config.RNN_HIDDEN_DIM, 1,
                                                     padding="SAME", activation=tf.nn.relu)
        self.with_depot_state = tf.layers.conv1d(self.with_depot_state, Config.RNN_HIDDEN_DIM, 1, padding="VALID")
    else:
        self.with_depot_state = self.raw_state
    self.state = self.with_depot_state[:, :-1, :]
    self.old_probs = tf.placeholder(tf.float32, shape=[None, Config.NUM_OF_CUSTOMERS, Config.NUM_OF_CUSTOMERS])

    # ENCODER
    if Config.DIRECTION == 4 or Config.DIRECTION == 5 or Config.DIRECTION == 6:
        self.encoder_outputs = self.state
        self.encoder_state = None
    if Config.DIRECTION < 6 and Config.DIRECTION != 4 and Config.DIRECTION != 5 and Config.DIRECTION != 6:
        self.encoder_outputs, self.encoder_state = Encoder(self.state, self.keep_prob)

    # HELPERS
    self.training_index = tf.concat([tf.expand_dims(self.start_tokens, -1), self.or_route], axis=1)
    self.training_index = self.training_index[:, :-1]
    self.gather_ids = tf.concat([tf.expand_dims(
        tf.reshape(tf.tile(tf.reshape(tf.range(self.batch_size), [-1, 1]),
                           [1, tf.shape(self.with_depot_state)[1]]), [-1]), -1),
        tf.reshape(self.training_index, [-1, 1])], -1)
    if Config.STATE_EMBED == 0:
        self.training_inputs = tf.reshape(tf.gather_nd(self.with_depot_state, self.gather_ids),
                                          [self.batch_size, tf.shape(self.with_depot_state)[1], 2])
    else:
        self.training_inputs = tf.reshape(tf.gather_nd(self.with_depot_state, self.gather_ids),
                                          [self.batch_size, tf.shape(self.with_depot_state)[1], Config.RNN_HIDDEN_DIM])
    train_helper, pred_helper = Helper(self.with_depot_state, self.batch_size, self.training_inputs,
                                       self.start_tokens, self.end_token)

    # DECODER
    if Config.DIRECTION < 6:
        train_decoder, pred_decoder, critic_network_pred = Decoder(self.batch_size, self.encoder_state,
                                                                   self.encoder_outputs, train_helper, pred_helper,
                                                                   self.state, self.start_tokens, self.end_token,
                                                                   self.keep_prob, self.raw_state, DECODER_TYPE)
        self.train_final_output, self.train_final_state, train_final_sequence_lengths = tf.contrib.seq2seq.dynamic_decode(
            train_decoder, impute_finished=False, maximum_iterations=tf.shape(self.state)[1])
        self.train_final_action = self.train_final_output.sample_id
        self.pred_final_output, self.pred_final_state, pred_final_sequence_lengths = tf.contrib.seq2seq.dynamic_decode(
            pred_decoder, impute_finished=False, maximum_iterations=tf.shape(self.state)[1])
        self.pred_final_action = self.pred_final_output.sample_id
        self.base_line_est = critic_network_pred
        self.logits = self.train_final_output.rnn_output
    if Config.DIRECTION == 6:
        self.train_final_action, self.pred_final_action, self.base_line_est, self.logits = Beam_Search(
            self.batch_size, self.encoder_state, self.encoder_outputs, train_helper, pred_helper,
            self.with_depot_state, self.start_tokens, self.end_token, self.keep_prob, self.raw_state, DECODER_TYPE)
        # self.pred_final_action = tf.squeeze(self.pred_final_action)
    if Config.DIRECTION == 9:
        self.train_final_action, self.pred_final_action, self.base_line_est, self.logits = Reza_Model(
            self.batch_size, self.with_depot_state)
    if Config.DIRECTION == 10:
        self.train_final_action, self.pred_final_action, self.base_line_est, self.logits = Wyatt_Model(
            self.batch_size, self.state, self.raw_state)

    self.probs = self.logits
    self.probs = self.probs + tf.to_float(tf.less(self.probs, -.8*Config.LOGIT_PENALTY))*Config.LOGIT_PENALTY
    self.probs = tf.clip_by_value(tf.nn.softmax(self.probs), 1e-7, 1e7)
    gather_ind = tf.concat([
        tf.reshape(tf.tile(tf.reshape(tf.range(0, self.batch_size), [-1, 1]), [1, Config.NUM_OF_CUSTOMERS]), [-1, 1]),
        tf.tile(tf.reshape(tf.range(0, Config.NUM_OF_CUSTOMERS), [-1, 1]), [self.batch_size, 1]),
        tf.reshape(self.pred_final_action, [-1, 1])], axis=1)
    self.new_probs_with_pi = tf.reshape(tf.gather_nd(self.probs, gather_ind), [self.batch_size, Config.NUM_OF_CUSTOMERS])
    self.old_probs_with_pi = tf.reshape(tf.gather_nd(self.old_probs, gather_ind), [self.batch_size, Config.NUM_OF_CUSTOMERS])
    self.ratio = tf.divide(self.new_probs_with_pi, self.old_probs_with_pi)

    if DECODER_TYPE == 0:
        # x = tf.range(0, 19, dtype=tf.int32)
        # x = [tf.random_shuffle(x)]
        # for i in range(499):
        #     y = tf.range(0, 19, dtype=tf.int32)
        #     y = [tf.random_shuffle(y)]
        #     x = tf.concat((x, y), axis=0)
        # self.pred_final_action = x[:self.batch_size, :]
        if Config.SEQUENCE_COST == 0:
            self.critic_loss = tf.losses.mean_squared_error(self.sampled_cost, self.base_line_est)
        else:
            self.critic_loss = tf.losses.mean_squared_error(tf.reshape(self.sampled_cost[:, 0], [-1, 1]), self.base_line_est)
        if Config.LOGIT_CLIP_SCALAR != 0:
            self.logits = Config.LOGIT_CLIP_SCALAR*tf.nn.tanh(self.logits)
        if Config.REINFORCE == 0:
            # self.weights = tf.to_float(tf.tile(tf.reshape(tf.range(
            #     1, tf.divide(1, tf.shape(self.state)[1]), -tf.divide(1, tf.shape(self.state)[1])),
            #     [1, -1]), [self.batch_size, 1]))
            self.actor_loss = tf.contrib.seq2seq.sequence_loss(
                logits=self.logits,
                targets=self.or_route[:, :-1],
                weights=tf.ones([self.batch_size, tf.shape(self.state)[1]])
                # weights=self.weights
            )
        else:
            self.neg_log_prob = -1*tf.nn.sparse_softmax_cross_entropy_with_logits(logits=self.logits,
                                                                                  labels=self.train_final_action)
            self.R = tf.stop_gradient(self.sampled_cost)
            if Config.SEQUENCE_COST == 1 and Config.USE_PPO == 0:
                assign = tf.assign(self.MA_baseline, self.MA_baseline*.999 + tf.reduce_mean(self.R, axis=0)*.001)
                with tf.control_dependencies([assign]):
                    V = self.MA_baseline
                    self.actor_loss = tf.reduce_mean(tf.multiply(self.neg_log_prob, self.R-V))
            elif Config.USE_PPO == 1:
                assign = tf.assign(self.MA_baseline, self.MA_baseline*.999 + tf.reduce_mean(self.R, axis=0)*.001)
                with tf.control_dependencies([assign]):
                    V = self.MA_baseline
                    adv = self.R - V
                    epsilon = 0.1
                    self.actor_loss = -tf.reduce_mean(tf.reduce_sum(
                        tf.minimum(tf.multiply(self.ratio, adv),
                                   tf.clip_by_value(self.ratio, 1.0-epsilon, 1.0+epsilon)*adv), axis=1))
            elif Config.MOVING_AVERAGE == 1:
                assign = tf.assign(self.MA_baseline, self.MA_baseline*.999 + tf.reduce_mean(self.R)*.001)
                with tf.control_dependencies([assign]):
                    V = self.MA_baseline
                    self.actor_loss = tf.reduce_mean(tf.multiply(tf.reduce_sum(self.neg_log_prob, axis=1), self.R-V))
            elif Config.USE_OR_COST == 1:
                V = tf.stop_gradient(self.or_cost)
                self.actor_loss = tf.reduce_mean(tf.multiply(tf.reduce_sum(self.neg_log_prob, axis=1), (self.R-V)/5))
            else:
                V = tf.stop_gradient(self.base_line_est)
                self.actor_loss = tf.reduce_mean(tf.multiply(tf.reduce_sum(self.neg_log_prob, axis=1), self.R-V))

        with tf.name_scope("Train"):
            if Config.GPU == 1:
                colocate = True
            else:
                colocate = False
            if Config.LR_DECAY_OFF == 0:
                self.lr = tf.train.exponential_decay(Config.LEARNING_RATE, self.global_step, 200000, .9,
                                                     staircase=True, name="learning_rate")
            else:
                self.lr = Config.LEARNING_RATE
            self.train_critic_op = tf.train.AdamOptimizer(self.lr).minimize(self.critic_loss)
            if Config.MAX_GRAD != 0:
                self.params = tf.trainable_variables()
                self.gradients = tf.gradients(self.actor_loss, self.params, colocate_gradients_with_ops=colocate)
                opt = tf.train.AdamOptimizer(self.lr)
                self.clipped_gradients, gradient_norm = tf.clip_by_global_norm(self.gradients, Config.MAX_GRAD)
                self.train_actor_op = opt.apply_gradients(zip(self.clipped_gradients, self.params),
                                                          global_step=self.global_step)
                tf.summary.scalar("grad_norm", gradient_norm)
                tf.summary.scalar("LearningRate", self.lr)
            else:
                self.train_actor_op = tf.train.AdamOptimizer(self.lr).minimize(
                    self.actor_loss, global_step=self.global_step, colocate_gradients_with_ops=colocate)
            # for gradient clipping see https://github.com/tensorflow/nmt/blob/master/nmt/model.py

        with tf.name_scope("Loss"):
            tf.summary.scalar("Loss", self.actor_loss)
            tf.summary.scalar("Critic_Loss", self.critic_loss)
        with tf.name_scope("Performance"):
            tf.summary.scalar("Relative Critic Loss", tf.reduce_mean(self.base_line_est/self.or_cost))
            tf.summary.scalar("Relative Critic Loss to Sampled", tf.reduce_mean(self.base_line_est/self.sampled_cost))
            tf.summary.scalar("difference_in_length", self.difference_in_length)
            tf.summary.scalar("relative_length", self.relative_length)
            tf.summary.scalar("Avg_or_cost", tf.reduce_mean(self.or_cost))
            if Config.SEQUENCE_COST == 0:
                tf.summary.scalar("Avg_sampled_cost", tf.reduce_mean(self.sampled_cost))
            else:
                tf.summary.scalar("Avg_sampled_cost", tf.reduce_mean(self.sampled_cost[:, 0]))
            # tf.summary.histogram("LocationStartDist", tf.transpose(self.pred_final_action, [1, 0])[0])
            # tf.summary.histogram("LocationEndDist", tf.transpose(self.pred_final_action, [1, 0])[-1])
        with tf.name_scope("Config"):
            tf.summary.scalar("REINFORCE", Config.REINFORCE)
            tf.summary.scalar("DIRECTION", Config.DIRECTION)
            tf.summary.scalar("NUM_OF_CUSTOMERS", Config.NUM_OF_CUSTOMERS)
            tf.summary.scalar("StateEmbed", tf.cast(Config.STATE_EMBED, tf.int32))
            tf.summary.scalar("MAX_GRAD", Config.MAX_GRAD)
            tf.summary.scalar("LogitPen", Config.LOGIT_PENALTY)
            tf.summary.scalar("batch_size", self.batch_size)
            tf.summary.scalar("Config.LAYERS_STACKED_COUNT", Config.LAYERS_STACKED_COUNT)
            tf.summary.scalar("RNN_HIDDEN_DIM", Config.RNN_HIDDEN_DIM)
            tf.summary.scalar("RUN_TIME", Config.RUN_TIME)
            tf.summary.scalar("LOGIT_CLIP_SCALAR", Config.LOGIT_CLIP_SCALAR)
            tf.summary.scalar("Dropout", tf.cast(Config.DROPOUT, tf.int32))
            tf.summary.scalar("GPU", Config.GPU)
def __init__(self, **kwargs):
    dataset_folder = Path(kwargs["dataset_folder"]).resolve()
    check_valid_path(dataset_folder)
    result_folder = kwargs["result_folder"]
    self.initial_epoch = 1
    self.test_mode = kwargs["test"]
    self.epochs = kwargs["epochs"]
    self.use_label_smoothing = kwargs["label_smoothing"]
    self.ckpt_path = kwargs["ckpt_path"]
    self.ckpt_epoch = kwargs["ckpt_epoch"]

    # Create the folders and files the model needs.
    self.log_folder, self.ckpt_folder, self.image_folder = create_folder(result_folder)
    if not self.test_mode:
        self.training_result_file = self.log_folder / "training_result.txt"
    self.test_result_file = None

    # Save the kwargs values.
    msg = ""
    for k, v in list(kwargs.items()):
        msg += "{} = {}\n".format(k, v)
    msg += "new model checkpoint path = {}\n".format(self.ckpt_folder)
    with (self.log_folder / "model_settings.txt").open("w", encoding="utf-8") as fp:
        fp.write(msg)

    # Load the required data.
    self.src_word2id, self.src_id2word, self.src_vocab_size = load_word_dic(
        dataset_folder / "src_word2id.pkl")
    self.tar_word2id, self.tar_id2word, self.tar_vocab_size = load_word_dic(
        dataset_folder / "tar_word2id.pkl")

    if not self.test_mode:
        train_src, num_train_src = get_dataset(
            self.src_word2id, dataset_folder / "train_src.txt", False, True, True)
        train_tar, num_train_tar = get_dataset(
            self.tar_word2id, dataset_folder / "train_tar.txt", True, True, True)
        if num_train_src != num_train_tar:
            raise Exception(
                "The source dataset ({}) and the target dataset ({}) have different sizes.".format(
                    num_train_src, num_train_tar))
        self.num_train = num_train_src
        self.train_dataset = tf.data.Dataset.from_generator(
            lambda: zip(train_src, train_tar), (tf.int32, tf.int32))
        self.train_dataset = self.train_dataset.cache().shuffle(
            self.num_train + 1).padded_batch(
                batch_size=kwargs["batch_size"],
                padded_shapes=(tf.TensorShape([None]), tf.TensorShape([None])),
                padding_values=(self.src_word2id["<PAD>"],
                                self.tar_word2id["<PAD>"])).prefetch(1)

    test_src_path = dataset_folder / "test.txt"
    if test_src_path.exists():
        test_src, self.num_test = get_dataset(self.src_word2id, test_src_path, False, True, False)
        # self.test_src_max_len = max([len(sentence) for sentence in test_src])
        # padded_test_src = tf.keras.preprocessing.sequence.pad_sequences(
        #     test_src, maxlen=self.test_src_max_len, padding='post',
        #     dtype='int32', value=self.src_word2id["<PAD>"])
        self.test_dataset = tf.data.Dataset.from_generator(lambda: test_src, tf.int32)
        self.test_dataset = self.test_dataset.cache().batch(1).prefetch(1)
        self.test_result_file = self.log_folder / "test_result.txt"
    elif self.test_mode:
        raise FileNotFoundError("The path [ {} ] does not exist.".format(test_src_path))

    self.encoder = Encoder(self.src_vocab_size, kwargs["embedding_size"], kwargs["hidden_size"],
                           kwargs["dropout_rate"], kwargs["gru"], kwargs["bi"])
    self.decoder = Decoder(self.tar_vocab_size, kwargs["embedding_size"], kwargs["hidden_size"],
                           kwargs["attention_size"], kwargs["dropout_rate"], kwargs["gru"], kwargs["bi"])

    # The following six lines prevent Korean text from rendering incorrectly in Colab and may be omitted.
    # %config InlineBackend.figure_format = 'retina'
    # !apt -qq -y install fonts-nanum
    fontpath = '/usr/share/fonts/truetype/nanum/NanumBarunGothic.ttf'
    font = fm.FontProperties(fname=fontpath, size=9)
    plt.rc('font', family='NanumBarunGothic')
    mpl.font_manager._rebuild()
class Trainer:
    def __init__(self, dataset, args):
        self.device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
        self.dataset = dataset
        self.encoder = ConvEncoder(self.dataset.num_ent(), self.dataset.num_rel(), args.emb_dim, self.device)
        self.decoder = Decoder(self.dataset.num_ent(), self.dataset.num_rel(), self.device)
        self.discriminator = ComplEx(self.dataset.num_ent(), self.dataset.num_rel(), args.emb_dim, self.device)
        self.args = args
        self.adversarial_loss = nn.BCEWithLogitsLoss()
        self.reconstruction_loss = nn.BCELoss()

    def train(self):
        self.encoder.train()
        self.decoder.train()
        self.discriminator.train()
        print('entity', self.dataset.num_ent(), 'relation', self.dataset.num_rel())
        print('ConvEncoder')
        print('train_simple1')
        print('epoch', self.args.ne)
        print('D_lr', self.args.D_lr)
        print('G_lr', self.args.G_lr)
        print('emb_dim', self.args.emb_dim)
        print('batch_size', self.args.batch_size)
        print('discriminator_range', self.args.discriminator_range)

        entity_onehot = []
        relation_onehot = []
        for i in range(self.dataset.num_ent()):
            onehot = [0 for x in range(self.dataset.num_ent())]
            onehot[i] = 1
            entity_onehot.append(onehot)
        for i in range(self.dataset.num_rel()):
            onehot = [0 for x in range(self.dataset.num_rel())]
            onehot[i] = 1
            relation_onehot.append(onehot)

        # ******************************** Adagrad optimizers ********************************
        optimizer_D = torch.optim.Adagrad(
            self.discriminator.parameters(), lr=self.args.D_lr, weight_decay=0, initial_accumulator_value=0.1)
        optimizer_Encoder = torch.optim.Adagrad(
            self.encoder.parameters(), lr=self.args.G_lr, weight_decay=0, initial_accumulator_value=0.1)
        optimizer_Decoder = torch.optim.Adagrad(
            self.decoder.parameters(), lr=self.args.G_lr, weight_decay=0, initial_accumulator_value=0.1)

        for epoch in range(1, self.args.ne+1):
            # start_time = time.time()
            last_batch = False
            total_d_loss = 0.0
            total_g_loss = 0.0
            while not last_batch:
                pos_batch = self.dataset.next_pos_batch(self.args.batch_size)
                last_batch = self.dataset.was_last_batch()
                h_onehot = []
                r_onehot = []
                t_onehot = []
                for i in pos_batch[:, 0]:
                    one_hot = entity_onehot[i]
                    h_onehot.append(one_hot)
                for i in pos_batch[:, 2]:
                    one_hot = entity_onehot[i]
                    t_onehot.append(one_hot)
                for i in pos_batch[:, 1]:
                    one_hot = relation_onehot[i]
                    r_onehot.append(one_hot)
                h = torch.tensor(h_onehot).float().to(self.device)
                r = torch.tensor(r_onehot).float().to(self.device)
                t = torch.tensor(t_onehot).float().to(self.device)

                # -----------------
                #  Train Generator
                # -----------------
                optimizer_Encoder.zero_grad()
                optimizer_Decoder.zero_grad()
                encoder_batch = np.repeat(np.copy(pos_batch), 1, axis=0)
                for i in range(self.args.batch_size):
                    if np.random.random() < 0.5:
                        encoder_batch[i][0] = pos_batch[i][0]
                        encoder_batch[i][1] = 0
                        encoder_batch[i][2] = pos_batch[i][1]
                    else:
                        encoder_batch[i][0] = pos_batch[i][2]
                        encoder_batch[i][1] = 1
                        encoder_batch[i][2] = pos_batch[i][1]
                encoder_h_onehot = []
                encoder_r_onehot = []
                encoder_position = []
                for i in encoder_batch[:, 0]:
                    one_hot = entity_onehot[i]
                    encoder_h_onehot.append(one_hot)
                for i in encoder_batch[:, 1]:
                    encoder_position.append([i])
                for i in encoder_batch[:, 2]:
                    one_hot = relation_onehot[i]
                    encoder_r_onehot.append(one_hot)
                encoder_h = torch.tensor(encoder_h_onehot).float().to(self.device)
                encoder_p = torch.tensor(encoder_position).float().to(self.device)
                encoder_r = torch.tensor(encoder_r_onehot).float().to(self.device)
                fake_tails = self.encoder(encoder_h, encoder_p, encoder_r)
                construction_heads, construction_positions, construction_rels = self.decoder(fake_tails)
                g_loss = (self.reconstruction_loss(construction_heads, encoder_h)
                          + self.reconstruction_loss(construction_rels, encoder_r)
                          + self.reconstruction_loss(construction_positions, encoder_p))
                g_loss.backward()
                total_g_loss += g_loss.cpu().item()
                optimizer_Encoder.step()
                optimizer_Decoder.step()

                neg_batch = np.repeat(np.copy(pos_batch), self.args.neg_ratio, axis=0)
                for _ in range(self.args.discriminator_range):
                    neg_entity = []
                    for i in range(len(neg_batch)):
                        if np.random.random() < 0.5:
                            temp = []
                            temp_h = pos_batch[i][0]
                            temp_p = [0]
                            temp_r = pos_batch[i][1]
                            temp.append(temp_h)
                            temp.append(temp_p)
                            temp.append(temp_r)
                            neg_entity.append(temp)
                        else:
                            temp = []
                            temp_h = pos_batch[i][2]
                            temp_p = [1]
                            temp_r = pos_batch[i][1]
                            temp.append(temp_h)
                            temp.append(temp_p)
                            temp.append(temp_r)
                            neg_entity.append(temp)
                    temp_h_one_hot = []
                    temp_r_one_hot = []
                    temp_p = []
                    for ele in neg_entity:
                        temp_h_one_hot.append(entity_onehot[ele[0]])
                        temp_r_one_hot.append(relation_onehot[ele[2]])
                        temp_p.append(ele[1])
                    temp_h_one_hot = torch.tensor(temp_h_one_hot).float().to(self.device)
                    temp_r_one_hot = torch.tensor(temp_r_one_hot).float().to(self.device)
                    temp_p = torch.tensor(temp_p).float().to(self.device)
                    neg_tails_index = np.argmax(
                        self.encoder(temp_h_one_hot, temp_p, temp_r_one_hot).cpu().data.numpy(), axis=1)
                    for i in range(len(neg_batch)):
                        if neg_entity[i][1] == [0]:
                            neg_batch[i][2] = neg_tails_index[i]
                        elif neg_entity[i][1] == [1]:
                            neg_batch[i][0] = neg_tails_index[i]
                        else:
                            print('GG')
                    neg_batch[:, -1] = -1
                    batch = np.append(pos_batch, neg_batch, axis=0)
                    np.random.shuffle(batch)
                    full_h_onehot = []
                    full_r_onehot = []
                    full_t_onehot = []
                    for i in batch[:, 0]:
                        one_hot = entity_onehot[i]
                        full_h_onehot.append(one_hot)
                    for i in batch[:, 2]:
                        one_hot = entity_onehot[i]
                        full_t_onehot.append(one_hot)
                    for i in batch[:, 1]:
                        one_hot = relation_onehot[i]
                        full_r_onehot.append(one_hot)
                    full_h = torch.tensor(full_h_onehot).float().to(self.device)
                    full_r = torch.tensor(full_r_onehot).float().to(self.device)
                    full_t = torch.tensor(full_t_onehot).float().to(self.device)
                    labels = torch.tensor(batch[:, 3]).float().to(self.device)
                    optimizer_D.zero_grad()
                    scores = self.discriminator(full_h, full_r, full_t)
                    d_loss = torch.sum(F.softplus(-labels * scores)) + (
                        self.args.reg_lambda * self.discriminator.l2_loss()
                        / self.dataset.num_batch(self.args.batch_size))
                    d_loss.backward()
                    optimizer_D.step()
                    for p in self.discriminator.parameters():
                        p.data.clamp_(-1, 1)
                    total_d_loss += d_loss.cpu().item()

                # =================== generator training =======================
                optimizer_Encoder.zero_grad()
                fake_tails = self.encoder(encoder_h, encoder_p, encoder_r)
                generator_score = self.discriminator(encoder_h, encoder_r, fake_tails)
                G_loss = -0.2 * torch.mean(torch.log(generator_score + 1e-6))
                G_loss.backward()
                optimizer_Encoder.step()
                # finish_time = time.time()
                # with open("train_time_log.log", 'a') as f:
                #     f.write(str(epoch) + " " + str(start_time) + " " + str(finish_time) + "\n")

            print("Loss in iteration " + str(epoch) + ": " + str(total_d_loss) + "(" + self.dataset.name + ")")
            print("Loss in iteration " + str(epoch) + ": " + str(total_g_loss) + "(" + self.dataset.name + ")")
            if epoch % self.args.save_each == 0:
                self.save_model(epoch)
            if epoch % 25 == 0:
                print('epoch', epoch, scores)
                print('neg_batch', neg_batch[:, 2])

    def save_model(self, chkpnt):
        print("Saving the model")
        directory = "models/" + self.dataset.name + "/" + 'complex' + "/"
        if not os.path.exists(directory):
            os.makedirs(directory)
        torch.save(self.discriminator, directory + str(chkpnt) + ".chkpnt")
def main(args, encoder=None, decoder=None):
    # Model dir
    if not os.path.exists(args.model_path):
        os.makedirs(args.model_path)

    print("[INFO]Loading data")
    print("[INFO] Reading file:{}".format(args.vocab_path))

    # Get dataloader: batch_size, shuffle, num_workers
    dataloader = get_loader(
        args.vocab_path,  # 'captions2.json'
        vocab=None,
        max_len=18,
        batch_size=args.batch_size,
        shuffle=False,
        num_workers=args.num_workers)

    print("[INFO]Creating models")
    # Models
    if encoder is None and decoder is None:
        encoder = EncoderCNN()
        decoder = Decoder(decoder_size=18)
    else:
        encoder = encoder.train()
        decoder = decoder.train()

    # Loss and optimiser
    loss_func = nn.CrossEntropyLoss()
    # params = list(decoder.parameters()) + list(encoder.linear.parameters()) + list(encoder.bn.parameters())
    params = list(decoder.parameters()) + list(encoder.parameters())
    optimiser = torch.optim.Adam(params, lr=args.learning_rate)

    print("[INFO] Starting training loop")
    # Train the models
    start = time.time()
    savedmodel = False
    total_step = len(dataloader)
    for epoch in range(args.num_epochs):
        print("Epoch:{}/{}".format(epoch, args.num_epochs))
        prev_loss = 0
        for i, (images, captions, lengths) in enumerate(dataloader):
            # Feed forward, backwards and optimise
            features = encoder(images)
            features = features.long()
            outputs = decoder(features, captions, tgt_mask=None)
            loss = loss_func(outputs, captions)
            decoder.zero_grad()
            encoder.zero_grad()
            loss.backward()
            optimiser.step()

            if loss == 0.000:
                prev_loss = prev_loss + 1
            if prev_loss == 5:
                print("Epoch: {}/{}--i:{}".format(epoch, args.num_epochs, i))
                print("Loss: {:.4f}".format(loss.item()))
                save_models(decoder, encoder)
                exit()

            if i % args.log_step == 0:
                print("Epoch: {}/{}--i:{}".format(epoch, args.num_epochs, i))
                print("Loss: {:.4f}".format(loss.item()))
                print("[INFO] Time elapsed: {}".format(time.time() - start))

            # `savedmodel` is used to save the model once during an epoch
            if (i + 1) % args.save_step == 0 and savedmodel is False:
                torch.save(decoder.state_dict(),
                           os.path.join(args.model_path, 'decoder-{}-{}.ckpt'.format(epoch + 1, i + 1)))
                torch.save(encoder.state_dict(),
                           os.path.join(args.model_path, 'encoder-{}-{}.ckpt'.format(epoch + 1, i + 1)))

    print("[INFO] Time elapsed: {}".format(time.time() - start))
    print("[INFO] Exiting")
parser.add_argument('--num_epochs', type=int, default=10, help='Total epochs')
parser.add_argument('--batch_size', type=int, default=1, help='dataloader batch size')
parser.add_argument('--num_workers', type=int, default=0, help='num of workers for model dataloader')
parser.add_argument('--learning_rate', type=float, default=0.001, help='learning rate for models')
args = parser.parse_args()

if args.encoder_path is None and args.decoder_path is None:
    main(args)
else:
    print("[INFO] Creating and Loading models")
    # Models
    encoder = EncoderCNN()
    decoder = Decoder(decoder_size=18)
    # Load trained models
    encoder.load_state_dict(torch.load(args.encoder_path))
    decoder.load_state_dict(torch.load(args.decoder_path))
    main(args, encoder, decoder)
Ps = args2Ps(args)

#-------------------------------------------------------------------------
# Vocabulary
#-------------------------------------------------------------------------
vocab = Vocabulary()
vocab.make(dataset="flickr8k", min_word_freq=5)

#-------------------------------------------------------------------------
# models
#-------------------------------------------------------------------------
encoder = Encoder()
encoder.fine_tune(Ps["fine_tune_encoder"])
decoder = Decoder(attention_dim=Ps["attention_dim"],
                  embed_dim=Ps["embed_dim"],
                  decoder_dim=Ps["decoder_dim"],
                  encoder_dim=encoder.encoder_dim,
                  vocab_size=len(vocab),
                  device=Ps["device"],
                  dropout=Ps["dropout"])
encoder = encoder.to(Ps["device"])
decoder = decoder.to(Ps["device"])

# whether to load a saved state_dict from a checkpoint file
if Ps["parent"] is not None:
    pass

#-------------------------------------------------------------------------
# optimizer and scheduler
#-------------------------------------------------------------------------
optimizer = get_optimizer(Ps, encoder, decoder)
scheduler = Scheduler(optimizer, [None, None])
#-------------------------------------------------------------------------
def train(train_dataset, validation_dataset=None, iterations=150, hidden_size=64, batch_size=16):
    print("Training...")
    train = DataLoader(train_dataset, batch_size=batch_size, shuffle=True, collate_fn=collate)
    validation = DataLoader(validation_dataset, batch_size=batch_size, shuffle=True, collate_fn=collate)

    encoder = Encoder(1, hidden_size).to(device)
    decoder = Decoder(hidden_size, 1).to(device)
    encoder_optimizer = optim.Adam(encoder.parameters())
    decoder_optimizer = optim.Adam(decoder.parameters())
    criterion = nn.MSELoss()

    train_losses = []
    validation_losses = []
    for iter in range(iterations):
        encoder.train()
        decoder.train()
        loss_acc = 0
        for input_tensor, target_tensor, _, max_len, lens in train:
            _, encoder_hidden = encoder(input_tensor, None)
            decoder_hidden = encoder_hidden
            decoder_input = target_tensor[:, 0].view(batch_size, 1, 1)
            outputs = torch.zeros(batch_size, max_len)
            for di in range(1, max_len):
                decoder_output, decoder_hidden = decoder(decoder_input, decoder_hidden)
                outputs[:, di] = decoder_output.view(batch_size)
                decoder_input = decoder_output.detach()
            for i in range(len(lens)):
                outputs[i, lens[i]:] = 0
            # if iter == iterations-1:
            #     print(target_tensor[:, 1:].squeeze())
            #     print(outputs[:, 1:].squeeze())
            #     print()
            encoder_optimizer.zero_grad()
            decoder_optimizer.zero_grad()
            batch_loss = criterion(outputs[:, 1:].squeeze(), target_tensor[:, 1:].squeeze())
            batch_loss.backward(retain_graph=True)
            loss_acc += batch_loss.item()
            encoder_optimizer.step()
            decoder_optimizer.step()
        train_losses.append(loss_acc)

        with torch.no_grad():
            val_loss_acc = 0
            for input_tensor, target_tensor, _, max_len, lens in validation:
                val_batch_size = len(target_tensor)
                _, encoder_hidden = encoder(input_tensor)
                decoder_hidden = encoder_hidden
                decoder_input = target_tensor[:, 0].view(val_batch_size, 1, 1)
                outputs = torch.zeros(val_batch_size, max_len)
                for di in range(1, max_len):
                    decoder_output, decoder_hidden = decoder(decoder_input, decoder_hidden)
                    outputs[:, di] = decoder_output.view(val_batch_size)
                    decoder_input = decoder_output
                for i in range(len(lens)):
                    outputs[i, lens[i]:] = 0
                val_loss = criterion(outputs[:, 1:].squeeze(), target_tensor[:, 1:].squeeze())
                val_loss_acc += val_loss.item()
            validation_losses.append(val_loss_acc)

        if iter % 1 == 0:
            print("Iteration:", iter,
                  " Train loss: ", "{0:.5f}".format(loss_acc / len(train)),
                  " Validation loss: ", "{0:.5f}".format(validation_losses[-1]))

    showPlot(train_losses, validation_losses)
    torch.save(encoder, "models/encoder.pt")
    torch.save(decoder, "models/decoder.pt")
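# Since train() persists the full encoder and decoder modules with torch.save, a matching
# inference sketch can mirror the validation loop above. This is an assumed usage example,
# not part of the original: the single-sequence batch shape and the use of the first target
# value to seed decoding are assumptions carried over from the training code.
def predict(input_tensor, first_target, max_len):
    encoder = torch.load("models/encoder.pt").to(device)
    decoder = torch.load("models/decoder.pt").to(device)
    encoder.eval()
    decoder.eval()
    with torch.no_grad():
        _, hidden = encoder(input_tensor)
        decoder_input = first_target.view(1, 1, 1)  # seed with the known first value
        outputs = torch.zeros(max_len)
        for di in range(1, max_len):
            decoder_output, hidden = decoder(decoder_input, hidden)
            outputs[di] = decoder_output.item()
            decoder_input = decoder_output          # feed the prediction back in
    return outputs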
def make_std_mask(tgt, pad):
    "Create a mask that hides both padding and future words."
    tgt_mask = (tgt != pad).unsqueeze(-2)
    tgt_mask = tgt_mask & Variable(
        Decoder.subsequent_mask(tgt.size(-1)).type_as(tgt_mask.data))
    return tgt_mask
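# Usage sketch (assumed, not from the original): pad index 0 and a Decoder.subsequent_mask
# that returns the usual lower-triangular future mask; torch is assumed to be imported in
# the surrounding module.
tgt = torch.tensor([[1, 5, 7, 0],
                    [2, 3, 0, 0]])
mask = make_std_mask(tgt, pad=0)
# mask has shape (2, 4, 4); entry (b, i, j) is True only when token j is not padding and
# j <= i, so each decoding step may attend to itself and to earlier, non-padded tokens.
print(mask.shape)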