def rerun_layers(self, output, update_hidden=True):
    """Re-run the upper recurrent layers on ``output`` and decode the result.

    Every layer from ``self.decomposed_layer_number`` up to
    ``self.model.nlayers - 1`` is looked up on ``self.model`` by the name
    ``self.model.rnn_module_name(layer)`` and called with the running
    activation and that layer's entry of ``self.hidden``.

    Args:
        output: Activation fed into layer ``self.decomposed_layer_number``.
        update_hidden: When true, ``self.hidden[layer]`` is replaced by the
            state each layer returns; when false ``self.hidden`` is untouched.

    Returns:
        The value of ``self.model.decoder`` applied to the last activation.
    """
    for layer in range(self.decomposed_layer_number, self.model.nlayers):
        rnn = getattr(self.model, self.model.rnn_module_name(layer))
        output, new_hidden = rnn(output, self.hidden[layer])
        if update_hidden:
            self.hidden[layer] = new_hidden
    # Decode with the wrapped model, like every other access in this method,
    # instead of depending on a module-level ``model`` being defined.
    return self.model.decoder(output)
def train():
    """Train the captioning encoder/decoder and checkpoint their weights.

    Builds the training data loader, an ``encoder`` and a ``decoder``, then
    runs ``iteration`` epochs of Adam updates on the decoder parameters and
    the encoder's ``dense`` layer. Every 1000 steps (step 0 excluded) both
    state dicts are written to ``model_storage/`` as ``encoder_<step>.pth``
    and ``decoder_<step>.pth``; the file name carries only the step, so a
    later epoch overwrites the files of an earlier one.
    """
    import math
    import os

    # few things that we have to define
    batch_size = 32
    is_training = True  # renamed from ``train`` so the function is not shadowed
    transform_train = transforms.Compose([
        transforms.Resize(256),  # smaller edge of image resized to 256
        transforms.RandomCrop(224),  # get 224x224 crop from random location
        transforms.ToTensor(),  # convert the PIL Image to a tensor
        transforms.Normalize(
            (0.485, 0.456, 0.406),  # normalize image for pre-trained model
            (0.229, 0.224, 0.225)),
    ])
    iteration = 3
    vocabulary_threshold = 5
    embed_size = 512
    hidden_size = 512
    hidden_layer = 1
    model_save = "model_storage/"
    # torch.save does not create missing directories.
    os.makedirs(model_save, exist_ok=True)

    # calling the dataloader
    train_dataLoader = get_data_loader(vocabulary_threshold, is_training,
                                       batch_size, transform_train)
    enc = encoder(embed_size, batch_size)
    dec = decoder(len(train_dataLoader.dataset.vocab.word_to_index),
                  embed_size, hidden_layer, hidden_size)
    # Only the encoder's dense layer and the decoder are optimised.
    params = list(enc.dense.parameters()) + list(dec.parameters())
    criteria = torch.nn.CrossEntropyLoss()
    optimizer = torch.optim.Adam(params, lr=0.001, betas=(0.9, 0.999),
                                 eps=1e-08)
    # math.ceil replaces np.math.ceil: the np.math alias no longer exists in
    # current NumPy releases.
    steps_per_epoch = int(
        math.ceil(len(train_dataLoader.dataset.caption_len) / batch_size))

    for epoch in range(iteration):
        for step in range(steps_per_epoch):
            # Re-point the loader's sampler at a fresh set of indices.
            index = train_dataLoader.dataset.trainIndices(batch_size)
            sampler = torch.utils.data.SubsetRandomSampler(index)
            train_dataLoader.batch_sampler.sampler = sampler
            img, caption = next(iter(train_dataLoader))

            enc.zero_grad()
            dec.zero_grad()
            features = enc(img)
            prediction = dec(features, caption)
            loss = criteria(
                prediction.view(caption.size(0) * caption.size(1), -1),
                caption.view(-1))
            loss.backward()
            optimizer.step()

            # Progress is step / steps-per-epoch (the original printed the
            # epoch count as the denominator).
            stats = "[%d/%d] LOSS: %.4f, PERPLEXITY: %5.4f " % (
                step, steps_per_epoch, loss.item(), np.exp(loss.item()))
            print("\r " + stats, end="")
            sys.stdout.flush()

            if step % 1000 == 0 and step != 0:
                # here we save the weights
                torch.save({"model_state": enc.state_dict()},
                           model_save + "encoder_" + str(step) + ".pth")
                torch.save({"model_state": dec.state_dict()},
                           model_save + "decoder_" + str(step) + ".pth")
                # Keep this status line on screen instead of overwriting it.
                print("\r" + stats)
def evaluate(data_source, batch_size=10, window=args.window):
    """Evaluate the model with a pointer mixed into the vocabulary softmax.

    For every position the softmax over the decoder output is used as the
    prediction; once more than ``window`` positions of history exist, it is
    interpolated (weight ``args.lambdasm``) with a pointer distribution built
    from the previous ``window`` targets, weighted by a softmax over
    ``args.theta`` times the dot products between their stored RNN outputs
    and the current RNN output.

    Args:
        data_source: Batched evaluation data, consumed in ``args.bptt`` steps.
        batch_size: Batch size used to initialise the hidden state and to
            scale each chunk's loss.
        window: Number of past positions the pointer may attend to.

    Returns:
        The accumulated loss divided by ``len(data_source)``.
    """
    # Turn on evaluation mode which disables dropout.
    if args.model == 'QRNN':
        model.reset()
    model.eval()

    ntokens = len(corpus.dictionary)
    hidden = model.init_hidden(batch_size)
    total_loss = 0
    next_word_history = None
    pointer_history = None

    for i in range(0, data_source.size(0) - 1, args.bptt):
        if i > 0:
            print(i, len(data_source), math.exp(total_loss / i))
        data, targets, _ = get_batch(data_source, i, evaluation=True,
                                     args=args)
        output, hidden, rnn_outs, _ = model(data, hidden, return_h=True)
        output = model.decoder(output)
        rnn_out = rnn_outs[-1].squeeze()
        print(output.size())
        output_flat = output.view(-1, ntokens)

        # Fill pointer history: one-hot targets and RNN outputs of this chunk
        # are appended to whatever was kept from the previous chunks.
        batch_words = torch.cat(
            [one_hot(t.data[0], ntokens) for t in targets])
        batch_states = Variable(rnn_out.data)
        if next_word_history is None:
            start_idx = 0
            next_word_history = batch_words
        else:
            start_idx = len(next_word_history)
            next_word_history = torch.cat([next_word_history, batch_words])
        if pointer_history is None:
            pointer_history = batch_states
        else:
            pointer_history = torch.cat([pointer_history, batch_states],
                                        dim=0)

        # Pointer manual cross entropy
        loss = 0
        softmax_output_flat = torch.nn.functional.softmax(output_flat)
        for idx, vocab_loss in enumerate(softmax_output_flat):
            position = start_idx + idx
            p = vocab_loss
            if position > window:
                first = position - window
                valid_next_word = next_word_history[first:position]
                valid_pointer_history = pointer_history[first:position]
                logits = torch.mv(valid_pointer_history, rnn_out[idx])
                theta = args.theta
                ptr_attn = torch.nn.functional.softmax(
                    theta * logits).view(-1, 1)
                weighted = ptr_attn.expand_as(valid_next_word) * valid_next_word
                ptr_dist = weighted.sum(0).squeeze()
                lambdah = args.lambdasm
                p = lambdah * ptr_dist + (1 - lambdah) * vocab_loss
            target_loss = p[targets[idx].data]
            loss += (-torch.log(target_loss)).data[0]
        total_loss += loss / batch_size

        hidden = repackage_hidden(hidden)
        # Only the most recent ``window`` entries are needed by later chunks.
        next_word_history = next_word_history[-window:]
        pointer_history = pointer_history[-window:]

    return total_loss / len(data_source)
def test():
    """Caption one test image with the newest checkpoint and display it.

    The encoder checkpoint with the largest step number in ``model_storage/``
    (file names of the form ``encoder_<step>.pth``) and the decoder file with
    the matching name are loaded, one sample is drawn from the test loader,
    and the generated sentence is drawn on top of the original image.

    Raises:
        FileNotFoundError: If ``model_storage/`` holds no encoder checkpoint
            with a positive step number.
    """
    embed_size = 512
    hidden_size = 512
    weights = "model_storage/"
    index_to_word = readVocab()

    transform_test = transforms.Compose([
        transforms.Resize(256),  # smaller edge of image resized to 256
        # Deterministic crop: a random crop makes the caption depend on
        # chance at inference time.
        transforms.CenterCrop(224),
        transforms.ToTensor(),  # convert the PIL Image to a tensor
        transforms.Normalize(
            (0.485, 0.456, 0.406),  # normalize image for pre-trained model
            (0.229, 0.224, 0.225)),
    ])

    # Pick the encoder checkpoint saved at the highest step.
    selected_weight = None
    max_step = 0
    for weight in os.listdir(weights):
        if "encoder" in weight:
            step = int(weight.split(".")[0].split("_")[1])
            if step > max_step:
                selected_weight = weight
                max_step = step
    if selected_weight is None:
        raise FileNotFoundError(
            "no encoder checkpoint found in " + weights)

    encoder_weight = selected_weight
    decoder_weight = selected_weight.replace("encoder", "decoder")
    enc_weight = torch.load(weights + encoder_weight)
    dec_weight = torch.load(weights + decoder_weight)

    enc = encoder(embed_size, batch_size=1)
    enc.eval()
    enc.load_state_dict(enc_weight["model_state"])
    dec = decoder(len(index_to_word), embed_size, 1, hidden_size)
    dec.eval()
    dec.load_state_dict(dec_weight["model_state"])

    test_loader = get_data_loader(5, False, 1, transform_test)
    img_test, original_img = next(iter(test_loader))
    # No gradients are needed for sampling.
    with torch.no_grad():
        features = enc(img_test)
        output = dec.sample(features.unsqueeze(1), 20)

    # Indices 0, 1 and 2 are skipped -- presumably special tokens; confirm
    # against the vocabulary builder.
    sentence = " ".join(
        index_to_word[val] for val in output if val not in (0, 1, 2))

    plt.imshow(np.uint8(original_img.squeeze(0).numpy()))
    plt.text(100, 400, sentence, style='italic', bbox={
        'facecolor': 'red',
        'alpha': 0.5,
        'pad': 10
    })
    plt.show()
def evaluate(data_source, batch_size=10):
    """Return the average per-position loss of the model on ``data_source``.

    The data is consumed in chunks of ``args.bptt`` rows; each chunk's
    criterion value is weighted by the chunk length and the hidden state is
    carried (detached via ``repackage_hidden``) from chunk to chunk.

    Args:
        data_source: Batched evaluation data.
        batch_size: Batch size used to initialise the hidden state.

    Returns:
        The length-weighted loss sum divided by ``len(data_source)`` as a
        Python float.
    """
    # Turn on evaluation mode which disables dropout.
    if args.model == 'QRNN':
        model.reset()
    model.eval()
    total_loss = 0.0
    ntokens = len(corpus.dictionary)
    hidden = model.init_hidden(batch_size)
    for i in range(0, data_source.size(0) - 1, args.bptt):
        data, targets, _ = get_batch(data_source, i, args, evaluation=True)
        output, hidden = model(data, hidden)
        output = model.decoder(output)
        output_flat = output.view(-1, ntokens)
        # float() accepts both 0-dim and one-element tensors, whereas the
        # original ``total_loss[0]`` fails on 0-dim tensors and on the plain
        # int left behind when the loop never runs.
        batch_loss = float(criterion(output_flat, targets).data)
        total_loss += len(data) * batch_loss
        hidden = repackage_hidden(hidden)
    return total_loss / len(data_source)
def evaluate(split, verbose=False, n_batches=None):
    """Compute the average batch loss of the captioning model on a split.

    Args:
        split: Name of the dataset split; only ``'val'`` is wired to a loader.
        verbose: When true, print the average loss.
        n_batches: Optional limit; evaluation stops after the batch whose
            index reaches ``n_batches`` (so ``n_batches + 1`` batches are
            evaluated, as in the original loop).

    Returns:
        The summed criterion values divided by the number of evaluated
        batches.

    Raises:
        ValueError: If ``split`` is not ``'val'`` or the loader is empty.
    """
    # Load the correct dataset. Fail early with a clear message instead of an
    # unbound ``loader`` further down.
    if split != 'val':
        raise ValueError("unsupported split: {!r}".format(split))
    loader = val_loader

    # Put both sub-modules in eval mode as opposed to train mode.
    model.encoder.eval()
    model.decoder.eval()

    # Initialize the cumulative loss and the number of evaluated batches.
    loss = 0
    evaluated = 0
    with torch.no_grad():
        for batch_i, batch in enumerate(loader):
            data, caption, lengths = batch[0], batch[1], batch[2]
            # Move the batch to the GPU first so that the packed targets live
            # on the same device as the decoder output.
            if args.cuda:
                data, caption = data.cuda(), caption.cuda()
            targets = pack_padded_sequence(caption, lengths,
                                           batch_first=True)[0]
            # Measure the output results given the data.
            features = model.encoder(data)
            output = model.decoder(features, caption, lengths)
            # Accumulate the loss (prediction and targets are both in
            # pack-padded form).
            loss += criterion(output, targets).data
            evaluated = batch_i + 1
            # Skip the rest once the batch index reaches n_batches.
            if n_batches and (batch_i >= n_batches):
                break

    if evaluated == 0:
        raise ValueError("the {} loader produced no batches".format(split))
    # Compute the average loss per batch.
    loss /= evaluated
    if verbose:
        print('\n{} set: Average loss: {:.4f}'.format(split, loss))
    return loss
def evaluate(data_source, batch_size=10, test=False):
    """Evaluate the language-model loss and the outlier-exposure (OE) loss.

    Each ``args.bptt`` chunk of ``data_source`` is concatenated along dim 1
    with the chunk at the same offset of ``oe_val_dataset`` and run through
    the model in one pass; the first half of the last dropped RNN output
    yields the criterion loss, the second half the OE loss (negative mean
    log-softmax of the decoder logits). Chunks whose lengths differ between
    the two datasets are skipped.

    Args:
        data_source: Batched in-distribution evaluation data.
        batch_size: Unused; kept for interface compatibility.
        test: Selects ``test_batch_size`` instead of ``eval_batch_size`` for
            the hidden-state size.

    Returns:
        ``(mean_loss, mean_oe_loss)`` as Python floats, averaged over the
        evaluated chunks.

    Raises:
        ValueError: If no chunk could be evaluated.
    """
    # Turn on evaluation mode which disables dropout.
    model.eval()
    if args.model == 'QRNN':
        model.reset()
    total_loss = 0.0
    total_oe_loss = 0.0
    num_batches = 0
    bs = test_batch_size if test else eval_batch_size

    for i in range(0, data_source.size(0) - 1, args.bptt):
        data, targets = get_batch(data_source, i, args, evaluation=True)
        data_oe, _ = get_batch(oe_val_dataset, i, args, evaluation=True)

        # unsqueeze() is not in-place: the original discarded its result, so
        # 1-D inputs stayed 1-D and the dim-1 concatenation below failed.
        if data.dim() == 1:  # happens for test set?
            data = data.unsqueeze(-1)
        if data_oe.dim() == 1:
            data_oe = data_oe.unsqueeze(-1)

        if data.size(0) != data_oe.size(0):
            continue

        # A fresh hidden state per chunk, sized for both halves of the batch.
        hidden = model.init_hidden(2 * bs)
        hidden = repackage_hidden(hidden)

        output, hidden, rnn_hs, dropped_rnn_hs = model(
            torch.cat([data, data_oe], dim=1), hidden, return_h=True)
        output, output_oe = torch.chunk(dropped_rnn_hs[-1], dim=1, chunks=2)
        output, output_oe = output.contiguous(), output_oe.contiguous()
        output = output.view(output.size(0) * output.size(1), output.size(2))

        loss = criterion(model.decoder.weight, model.decoder.bias, output,
                         targets).data

        # OE loss
        logits_oe = model.decoder(output_oe)
        smaxes_oe = F.softmax(
            logits_oe - torch.max(logits_oe, dim=-1, keepdim=True)[0], dim=-1)
        loss_oe = -smaxes_oe.log().mean(-1)
        loss_oe = loss_oe.mean().data

        # float() works for 0-dim and one-element tensors alike; indexing the
        # accumulated value with [0] does not.
        total_loss += float(loss)
        total_oe_loss += float(loss_oe)
        num_batches += 1

    if num_batches == 0:
        raise ValueError("no batch could be evaluated")
    return total_loss / num_batches, total_oe_loss / num_batches
def evaluate(data_source, batch_size=10):
    """Return the average per-position loss of the model on ``data_source``.

    The criterion is called with the decoder's weight and bias and the raw
    model output, so no explicit decoder pass is needed here.

    Args:
        data_source: Batched evaluation data, consumed in ``args.bptt`` steps.
        batch_size: Batch size used to initialise the hidden state.

    Returns:
        The length-weighted loss sum divided by ``len(data_source)`` as a
        Python float.
    """
    # Turn on evaluation mode which disables dropout.
    model.eval()
    if args.model == 'QRNN':
        model.reset()
    # Accumulate a Python float so the return statement also works when the
    # loop never runs (the original called .item() on the int 0).
    total_loss = 0.0
    hidden = model.init_hidden(batch_size)
    for i in range(0, data_source.size(0) - 1, args.bptt):
        data, targets = get_batch(data_source, i, args, evaluation=True)
        output, hidden = model(data, hidden)
        # The original also computed model.decoder(output) here and never
        # used the result; that full-vocabulary projection is dropped.
        batch_loss = criterion(model.decoder.weight, model.decoder.bias,
                               output, targets)
        total_loss += len(data) * batch_loss.item()
        hidden = repackage_hidden(hidden)
    return total_loss / len(data_source)
def run(self):
    """Build the networks and optimizer, then alternate training/validation.

    The encoder and decoder are created on the GPU and summarised, a single
    SGD optimizer is set up over both, the checkpoint is restored unless
    ``self.args.restart`` is set, one validation pass is run, and then every
    remaining epoch adjusts the learning rate, trains and validates.
    """
    batch_size = self.config['batch_size']
    learning_rate = self.config['learning_rate']

    # Create Model
    self.encoder = model.encoder().cuda()
    self.decoder = model.decoder().cuda()
    self.logger.debug('Encoder Architecture')
    summary(self.encoder, (3, 224, 224), batch_size=batch_size)
    self.logger.debug('Decoder Architecture')
    summary(self.decoder, (512, 14, 14), batch_size=batch_size)

    # One optimizer over the parameters of both networks.
    model_params = [*self.encoder.parameters(), *self.decoder.parameters()]
    self.optm = torch.optim.SGD(
        model_params,
        lr=learning_rate,
        momentum=self.config['momentum'],
        weight_decay=self.config['weight_decay'],
    )

    # Restore Model
    if not self.args.restart:
        self.load_checkpoint()

    # Setup Global Train Index
    self.gidx = self.epoch * len(self.dataset_train)

    # Initial Validation
    self.valid = DataObject()
    self.run_valid()

    total_epochs = self.config['epochs']
    for _ in range(self.epoch, total_epochs):
        utils.adjust_learning_rate(learning_rate, self.optm, self.epoch)
        # Fresh result containers for each phase of the epoch.
        self.train = DataObject()
        self.run_train()
        self.valid = DataObject()
        self.run_valid()
        self.epoch += 1
def main(batch_size, train_df, trainLoader, embedding_dim, hidden_size,
         hidden_layer, index_to_word):
    """Train the captioning encoder/decoder and checkpoint them per epoch.

    Args:
        batch_size: Number of samples drawn per step.
        train_df: Training table handed to ``captionLength``.
        trainLoader: Data loader whose batch sampler is re-pointed every step.
        embedding_dim: Size passed to the encoder and decoder as embedding
            dimension.
        hidden_size: Decoder hidden size.
        hidden_layer: Number of decoder layers.
        index_to_word: Index-to-word mapping; the vocabulary size is its
            length plus one.

    Every 100 steps (step 0 excluded) the encoder and decoder state dicts are
    written to ``loss_folder/`` under names that carry the epoch number, and
    the running loss is reset.
    """
    vocab_size = len(index_to_word) + 1
    enc = encoder(embedding_dim, batch_size)
    dec = decoder(vocab_size, embedding_dim, hidden_layer, hidden_size)
    iteration = 10

    # loss
    param = list(enc.dense.parameters()) + list(dec.parameters())
    criteria = nn.CrossEntropyLoss()
    optimizer = torch.optim.Adam(param, lr=0.001)
    # NOTE(review): ``train_size`` is not a parameter; it must be defined at
    # module level for this to run.
    total_steps = int(np.ceil(train_size / batch_size))
    caption_len = captionLength(train_df)

    for epoch in range(iteration):
        total_loss = 0.0
        for step in range(total_steps):
            train_indices, _ = randomSelect(caption_len, batch_size)
            new_sampler = torch.utils.data.SubsetRandomSampler(train_indices)
            trainLoader.batch_sampler.sampler = new_sampler
            original_img, caption = next(iter(trainLoader))

            enc.zero_grad()
            dec.zero_grad()
            features = enc(original_img)
            # Keep the features floating point (as the sibling ``test`` does):
            # casting to long truncates them and cuts the gradient path back
            # to the encoder's dense layer.
            prediction = dec(features.float(), caption)

            # loss
            loss = criteria(
                prediction.view(caption.size(0) * caption.size(1), -1),
                caption.view(-1))
            loss.backward()
            optimizer.step()

            # Progress is step / steps-per-epoch (the original printed the
            # epoch count as the denominator).
            stats = "[%d/%d] Loss: %.4f, Perplexity: %5.4f " % (
                step, total_steps, loss.item(), np.exp(loss.item()))
            print("\r" + stats, end="")
            sys.stdout.flush()
            total_loss += loss.item()

            if step % 100 == 0 and step != 0:
                torch.save({
                    'epoch': epoch,
                    'model_state_dict': enc.state_dict(),
                    'loss': total_loss / 100,
                }, "loss_folder/encoder_" + str(epoch) + ".pth")
                torch.save({
                    'model_state_dict': dec.state_dict()
                }, "loss_folder/decoder_" + str(epoch) + ".pth")
                total_loss = 0.0
                # Keep this status line on screen instead of overwriting it.
                print("\r" + stats)
def __init__(self, n_dim=2, batch_size=100, epochs=10, log_freq=100,
             results_path='./results', make_gif=False):
    """Store the settings, create output folders, data pipelines and models.

    Args:
        n_dim: Passed to ``encoder`` as ``n_dim``.
        batch_size: Batch size of the shuffled training dataset.
        epochs: Stored on the instance as ``self.epochs``.
        log_freq: Stored on the instance as ``self.log_freq``.
        results_path: Root folder for results; images go to its ``imges``
            sub-folder and, when enabled, GIFs to its ``gif`` sub-folder.
        make_gif: Whether the ``gif`` sub-folder is created.
    """
    # settings
    self.n_dim = n_dim
    self.batch_size = batch_size
    self.epochs = epochs
    self.log_freq = log_freq
    self.results_path = results_path
    self.results_img_path = results_path + "/imges"
    self.make_gif = make_gif

    # output folders
    if not os.path.exists(self.results_img_path):
        os.makedirs(self.results_img_path)
    if self.make_gif:
        gif_path = self.results_path + "/gif"
        if not os.path.exists(gif_path):
            os.makedirs(gif_path)

    # data load
    self.load_data()
    train_pairs = (self.x_train, self.y_train)
    self.dataset_train = tf.data.Dataset.from_tensor_slices(train_pairs)
    # The shuffle buffer spans the whole training set.
    self.dtrain_shuffle = self.dataset_train.shuffle(
        self.x_train.shape[0]).batch(self.batch_size)
    test_pairs = (self.x_test, self.y_test)
    self.dataset_test = tf.data.Dataset.from_tensor_slices(test_pairs)
    self.dtest_shuffle = self.dataset_test.shuffle(
        self.x_test.shape[0]).batch(1000)

    # Models
    self.encoder = encoder(n_dim=self.n_dim)
    self.decoder = decoder()
    self.discriminator = discriminator()

    # optimizer: generator and discriminator share the same Adam settings
    adversarial_betas = {'beta_1': 0, 'beta_2': 0.9}
    self.ae_opt = tf.keras.optimizers.Adam(0.0001)
    self.gen_opt = tf.keras.optimizers.Adam(0.0001, **adversarial_betas)
    self.disc_opt = tf.keras.optimizers.Adam(0.0001, **adversarial_betas)
    self.loss_object = tf.keras.losses.BinaryCrossentropy(from_logits=True)
def play(text, batch_size=1):
    """Predict the word that follows ``text`` with the language model.

    The text is lower-cased, digit runs become ``N``, punctuation other than
    ``.`` and ``'`` is removed (including the curly quotes and the em dash),
    and the contractions ``n't``, ``'s``, ``'ve``, ``'d`` and ``'ll`` are
    split off with a leading space before tokenisation.

    Args:
        text: Prompt to continue.
        batch_size: Batch size used to initialise the hidden state.

    Returns:
        The dictionary word with the highest score at the last position.
    """
    model.eval()
    text = text.lower()
    # Raw string: "\d" in a plain literal is an invalid escape sequence.
    text = re.sub(r'\d+', 'N', text)
    # Swapping "." for the curly quotes/dash keeps the period out of the
    # removal set while adding those four characters to it.
    punc = string.punctuation.replace(".", "’—“”")
    punc = punc.replace("'", "")
    text = text.translate(str.maketrans('', '', punc))
    for contraction in ("n't", "'s", "'ve", "'d", "'ll"):
        text = text.replace(contraction, " " + contraction)

    data = new_tokenize(text).unsqueeze(1).cuda()
    hidden = model.init_hidden(batch_size)
    # Inference only: no autograd graph is needed.
    with torch.no_grad():
        output, hidden = model(data, hidden)
        logits = model.decoder(output)
        logProba = nn.functional.log_softmax(logits, dim=1)
        pred_idxs = torch.argmax(logProba, dim=1)
    # Only the last position's prediction is returned, so only that index is
    # mapped back to a word.
    next_word = corpus.dictionary.idx2word[pred_idxs[-1]]
    return next_word
def evaluate(data_source, corpus, batch_size=10, ood=False):
    """Compute the mean loss and per-chunk anomaly scores on ``data_source``.

    For every ``args.bptt`` chunk a fresh hidden state is used, the softmax of
    the decoder logits is taken, the mean negative log-probability of the
    targets is accumulated, and the mean over positions of
    ``sum(log(softmax + 1e-18) * uniform_base_rates)`` is stored as that
    chunk's anomaly score.

    Args:
        data_source: Batched evaluation data.
        corpus: Unused here; kept for interface compatibility.
        batch_size: Batch size used to initialise the hidden state.
        ood: When true, stop once the offset reaches
            ``ood_num_examples // test_batch_size``.

    Returns:
        ``(loss_accum / (len(data_source) // args.bptt), losses)`` where
        ``losses`` is the list of anomaly scores.
    """
    # Turn on evaluation mode which disables dropout.
    model.eval()
    if args.model == 'QRNN':
        model.reset()
    loss_accum = 0
    losses = []
    for i in range(0, data_source.size(0) - 1, args.bptt):
        if (i >= ood_num_examples // test_batch_size) and (ood is True):
            break

        hidden = model.init_hidden(batch_size)
        hidden = repackage_hidden(hidden)
        data, targets = get_batch(data_source, i, args, evaluation=True)
        output, hidden = model(data, hidden)
        logits = model.decoder(output)
        # Subtracting the row maximum keeps the softmax numerically stable.
        smaxes = F.softmax(
            logits - torch.max(logits, dim=1, keepdim=True)[0], dim=1)
        tmp = smaxes[range(targets.size(0)), targets]
        # divided by seq len, so this is the negative nats per char
        log_prob = torch.log(tmp).mean(0)
        # reshape(-1)[0] extracts the scalar from both 0-d and one-element
        # arrays; a plain [0] raises on the 0-d arrays produced by 0-dim
        # tensors.
        loss = -log_prob.data.cpu().numpy().reshape(-1)[0]
        loss_accum += loss

        # Experimental! negative KL to uniform
        anomaly_score = ((smaxes).add(1e-18).log() *
                         uniform_base_rates.unsqueeze(0)).sum(1).mean(0)
        losses.append(anomaly_score.data.cpu().numpy().reshape(-1)[0])

    return loss_accum / (len(data_source) // args.bptt), losses
def test(batch_size, df, testLoader, index_to_word):
    """Caption the first batch of ``testLoader`` and show its first image.

    Args:
        batch_size: Batch size passed to the encoder constructor.
        df: Unused; kept for interface compatibility.
        testLoader: Loader yielding ``(image, caption)`` batches.
        index_to_word: Mapping from vocabulary index to word.

    The weights are read from ``loss_folder/encoder_2.pth`` and
    ``loss_folder/decoder_2.pth``. The generated sentence is printed without
    the ``START``/``END`` markers and without index 0.
    """
    enc = encoder(512, batch_size)
    enc.eval()
    dec = decoder(len(index_to_word) + 1, 512, 1, 512)
    dec.eval()

    # load the model
    enc_weight = torch.load("loss_folder/encoder_2.pth")
    dec_weight = torch.load("loss_folder/decoder_2.pth")
    enc.load_state_dict(enc_weight["model_state_dict"])
    dec.load_state_dict(dec_weight["model_state_dict"])

    img, _ = next(iter(testLoader))
    print(img.shape)
    # Inference only: no autograd graph is needed.
    with torch.no_grad():
        features = enc(img).unsqueeze(1)
        output = dec.sample(features.float(), 27)

    words = []
    for word in output:
        token = index_to_word.get(word)
        # Skip index 0, the sentence markers, and any index the vocabulary
        # does not know (the original raised KeyError on those).
        if word == 0 or token is None or token in ("START", "END"):
            continue
        words.append(token)
    sent = " ".join(words)
    print(sent)

    plt.imshow(img[0].permute(1, 2, 0).detach().numpy())
    plt.show()
def main():
    """Translate the WMT16 test set with a saved transformer and print it.

    Separate encoder and decoder programs are built, their parameters are
    loaded from ``InferTaskConfig.model_path``, both programs are cloned for
    test mode, and every test batch is decoded with ``translate_batch``. Each
    decoded sequence is post-processed and printed as space-joined target
    words.
    """
    place = fluid.CUDAPlace(0) if InferTaskConfig.use_gpu else fluid.CPUPlace()
    exe = fluid.Executor(place)

    encoder_program = fluid.Program()
    with fluid.program_guard(main_program=encoder_program):
        enc_output = encoder(
            ModelHyperParams.src_vocab_size, ModelHyperParams.max_length + 1,
            ModelHyperParams.n_layer, ModelHyperParams.n_head,
            ModelHyperParams.d_key, ModelHyperParams.d_value,
            ModelHyperParams.d_model, ModelHyperParams.d_inner_hid,
            ModelHyperParams.dropout)

    decoder_program = fluid.Program()
    with fluid.program_guard(main_program=decoder_program):
        predict = decoder(
            ModelHyperParams.trg_vocab_size, ModelHyperParams.max_length + 1,
            ModelHyperParams.n_layer, ModelHyperParams.n_head,
            ModelHyperParams.d_key, ModelHyperParams.d_value,
            ModelHyperParams.d_model, ModelHyperParams.d_inner_hid,
            ModelHyperParams.dropout)

    def load_program_params(program):
        """Load the saved value of every Parameter read by the program's ops."""
        block = program.block(0)
        var_names = []
        for op in block.ops:
            var_names += op.input_arg_names
        # A real list (not a lazy map/filter object) so the result does not
        # depend on Python 2 semantics.
        params = [
            block.var(var_name) for var_name in var_names
            if isinstance(block.var(var_name), fluid.framework.Parameter)
        ]
        fluid.io.load_vars(exe, InferTaskConfig.model_path, vars=params)

    # Load model parameters of encoder and decoder separately from the saved
    # transformer model.
    load_program_params(encoder_program)
    load_program_params(decoder_program)

    # This is used here to set dropout to the test mode.
    encoder_program = encoder_program.clone(for_test=True)
    decoder_program = decoder_program.clone(for_test=True)

    test_data = paddle.batch(
        paddle.dataset.wmt16.test(ModelHyperParams.src_vocab_size,
                                  ModelHyperParams.trg_vocab_size),
        batch_size=InferTaskConfig.batch_size)

    trg_idx2word = paddle.dataset.wmt16.get_dict(
        "de", dict_size=ModelHyperParams.trg_vocab_size, reverse=True)

    def post_process_seq(seq,
                         bos_idx=ModelHyperParams.bos_idx,
                         eos_idx=ModelHyperParams.eos_idx,
                         output_bos=InferTaskConfig.output_bos,
                         output_eos=InferTaskConfig.output_eos):
        """
        Post-process the beam-search decoded sequence. Truncate from the first
        <eos> and remove the <bos> and <eos> tokens currently.
        """
        eos_pos = len(seq) - 1
        for i, idx in enumerate(seq):
            if idx == eos_idx:
                eos_pos = i
                break
        seq = seq[:eos_pos + 1]
        return [
            idx for idx in seq
            if (output_bos or idx != bos_idx) and (output_eos or idx != eos_idx)
        ]

    for batch_id, data in enumerate(test_data()):
        batch_seqs, batch_scores = translate_batch(
            exe,
            [item[0] for item in data],
            encoder_program,
            encoder_data_input_fields + encoder_util_input_fields,
            [enc_output.name],
            decoder_program,
            decoder_data_input_fields[:-1] + decoder_util_input_fields +
            (decoder_data_input_fields[-1], ),
            [predict.name],
            InferTaskConfig.beam_size,
            InferTaskConfig.max_length,
            InferTaskConfig.n_best,
            len(data),
            ModelHyperParams.n_head,
            ModelHyperParams.d_model,
            ModelHyperParams.eos_idx,  # Use eos_idx to pad.
            ModelHyperParams.eos_idx,  # Use eos_idx to pad.
            ModelHyperParams.bos_idx,
            ModelHyperParams.eos_idx,
            ModelHyperParams.unk_idx,
            output_unk=InferTaskConfig.output_unk)
        for candidates in batch_seqs:
            # Post-process the beam-search decoded sequences.
            for seq in candidates:
                words = [trg_idx2word[idx] for idx in post_process_seq(seq)]
                print(" ".join(words))
def main():
    """Train the AdaIN style-transfer graph on MNIST content / MNIST-M style.

    All MNIST training digits (resized to 32x32 and repeated to three
    channels) serve as content images and all MNIST-M training images
    (resized to 32x32) as style images. Every 100 batches the summaries and
    the total loss are evaluated on ten shuffled MNIST test digits and
    written to ``logs/``.
    """
    ## prepare data
    # load MNIST and MNIST-M
    (m_train, _), (m_test, _) = tf.keras.datasets.mnist.load_data()
    # NOTE: pickle executes code from the file; only load a trusted dataset.
    # The context manager closes the handle the original left open.
    with open('data/mnistm_data.pkl', 'rb') as mnistm_file:
        mm = pkl.load(mnistm_file)
    mm_train = mm['train']

    # use all train data in MNIST as content image, and all train data in
    # MNIST-M as style image
    content_image = resize_image(m_train, size=(32, 32))
    content_image = np.repeat(content_image[..., np.newaxis], 3, axis=-1)
    test_content_image = m_test
    style_image = resize_image(mm_train, size=(32, 32))

    ## prepare model
    # inputs placeholder
    c_img = tf.placeholder(tf.float32, shape=[None, 32, 32, 3])
    s_img = tf.placeholder(tf.float32, shape=[None, 32, 32, 3])
    # establish model
    c_encode, _ = encoder(c_img)
    s_encode, s_layers = encoder(s_img, reuse=True)
    c_adain_encode = adain(c_encode, s_encode)
    styled_img = decoder(c_adain_encode)
    styled_encode, styled_layers = encoder(styled_img, reuse=True)
    # loss
    content_loss = compute_content_loss(styled_encode, c_adain_encode)
    style_loss = compute_style_loss(styled_layers, s_layers)
    total_loss = content_loss + 0.01 * style_loss
    # optimizer
    optimizer = tf.train.AdamOptimizer(1e-4)
    train_op = optimizer.minimize(total_loss)
    model_summary()

    ## training
    init = tf.global_variables_initializer()
    with tf.Session() as sess:
        # Creates a file writer for the log directory.
        logdir = "logs/"
        file_writer = tf.summary.FileWriter(logdir, sess.graph)
        try:
            # store variables
            tf.summary.image("Content image", c_img, max_outputs=10)
            tf.summary.image("Style image", s_img, max_outputs=10)
            tf.summary.image("Styled image", styled_img, max_outputs=10)
            tf.summary.scalar("Content loss", content_loss)
            tf.summary.scalar("Style loss", style_loss)
            tf.summary.scalar("Total loss", total_loss)
            merged = tf.summary.merge_all()
            sess.run(init)

            # total number of data
            num_data = content_image.shape[0]
            batch_size = 8
            num_batch = num_data // batch_size
            for i_episode in range(EPISODE):
                # shuffle data
                np.random.shuffle(content_image)
                np.random.shuffle(style_image)
                for i_batch in range(num_batch):
                    # get a batch of content
                    c_image = content_image[i_batch * batch_size:
                                            (i_batch + 1) * batch_size, ...]
                    c_image = c_image / 255
                    # random sample a batch of style
                    idx = np.random.choice(style_image.shape[0], batch_size,
                                           replace=False)
                    s_image = style_image[idx, ...]
                    s_image = s_image / 255
                    # training
                    _, train_loss = sess.run([train_op, total_loss],
                                             feed_dict={
                                                 c_img: c_image,
                                                 s_img: s_image
                                             })
                    if i_batch % 100 == 0:
                        # evaluation on test content image
                        np.random.shuffle(test_content_image)
                        test_c_image = test_content_image[:10, ...]
                        test_c_image = resize_image(test_c_image,
                                                    size=(32, 32))
                        test_c_image = np.repeat(
                            test_c_image[..., np.newaxis], 3, axis=-1)
                        test_c_image = test_c_image / 255
                        test_s_image = style_image[:10, ...] / 255
                        summary, test_loss = sess.run(
                            [merged, total_loss],
                            feed_dict={
                                c_img: test_c_image,
                                s_img: test_s_image
                            })
                        # log all variables
                        file_writer.add_summary(
                            summary,
                            global_step=i_episode * num_batch + i_batch)
                        print('Episode: %d, batch: %d, training cost: %g, test cost: %g' %
                              (i_episode, i_batch, train_loss, test_loss))
        finally:
            # Flush and close the event file even if training fails.
            file_writer.close()
def train(base_rates):
    """Run one training epoch with the optional outlier-exposure (OE) loss.

    Each ``args.bptt`` segment of ``train_data`` is concatenated along dim 1
    with the segment at the same offset of ``oe_dataset`` and run through the
    model in one pass. The first half gives the language-model loss (plus the
    activation regularisers when ``args.alpha`` / ``args.beta`` are set), the
    second half the cross entropy between the decoder softmax and
    ``base_rates``. When ``args.use_OE == 'yes'`` the back-propagated loss is
    ``loss + 0.5 * loss_oe``.

    Args:
        base_rates: Per-token target distribution for the OE loss; it is
            converted to a ``FloatTensor`` and expanded to the OE softmax
            shape on first use.
    """
    # Turn on training mode which enables dropout.
    if args.model == 'QRNN':
        model.reset()
    total_loss = 0
    total_oe_loss = 0
    start_time = time.time()
    batch = 0

    # indices for randomizing order of segments
    # NOTE(review): neither index array is read below; the shuffles are kept
    # because they advance NumPy's global random state.
    train_indices = np.arange(train_data.size(0) // args.bptt)
    np.random.shuffle(train_indices)
    oe_indices = np.arange(oe_dataset.size(0) // args.bptt)
    np.random.shuffle(oe_indices)

    seq_len = args.bptt
    br = None
    # Assume OE dataset is larger. It is, because we're using wikitext-2.
    for i in range(0, train_data.size(0), args.bptt):
        # Scale the learning rate by the segment length; restored below.
        lr2 = optimizer.param_groups[0]['lr']
        optimizer.param_groups[0]['lr'] = lr2 * seq_len / args.bptt
        model.train()
        data, targets = get_batch(train_data, i, args, seq_len=seq_len)
        data_oe, _ = get_batch(oe_dataset, i, args, seq_len=seq_len)

        # Don't train on this batch if the sequence lengths are different
        # (happens at end of epoch).
        if data.size(0) != data_oe.size(0):
            continue

        # We need a new hidden state for each segment, because this makes
        # evaluation easier and more meaningful.
        hidden = model.init_hidden(2 * args.batch_size)
        hidden = repackage_hidden(hidden)

        output, hidden, rnn_hs, dropped_rnn_hs = model(
            torch.cat([data, data_oe], dim=1), hidden, return_h=True)
        output, output_oe = torch.chunk(dropped_rnn_hs[-1], dim=1, chunks=2)
        output, output_oe = output.contiguous(), output_oe.contiguous()
        output = output.view(output.size(0) * output.size(1), output.size(2))

        raw_loss = criterion(model.decoder.weight, model.decoder.bias, output,
                             targets)
        loss = raw_loss
        # Activation Regularization
        if args.alpha:
            loss = loss + sum(
                args.alpha * dropped_rnn_h.pow(2).mean()
                for dropped_rnn_h in dropped_rnn_hs[-1:])
        # Temporal Activation Regularization (slowness)
        if args.beta:
            loss = loss + sum(
                args.beta * (rnn_h[1:] - rnn_h[:-1]).pow(2).mean()
                for rnn_h in rnn_hs[-1:])

        # OE loss
        logits_oe = model.decoder(output_oe)
        smaxes_oe = F.softmax(
            logits_oe - torch.max(logits_oe, dim=-1, keepdim=True)[0], dim=-1)
        if br is None:
            # Built once and reused for every later segment.
            br = Variable(
                torch.FloatTensor(base_rates).unsqueeze(0).unsqueeze(0)
                .expand_as(smaxes_oe)).cuda()
        loss_oe = -(smaxes_oe.log() * br).sum(-1)  # for cross entropy
        loss_oe = loss_oe.mean()  # for ERM

        if args.use_OE == 'yes':
            loss_bp = loss + 0.5 * loss_oe
        else:
            loss_bp = loss

        optimizer.zero_grad()
        loss_bp.backward()

        # `clip_grad_norm` helps prevent the exploding gradient problem in
        # RNNs / LSTMs.
        if args.clip:
            torch.nn.utils.clip_grad_norm(params, args.clip)
        optimizer.step()

        total_loss += raw_loss.data
        total_oe_loss += loss_oe.data
        optimizer.param_groups[0]['lr'] = lr2
        if batch % args.log_interval == 0 and batch > 0:
            # float() accepts 0-dim and one-element tensors alike, whereas
            # indexing with [0] fails on 0-dim tensors.
            cur_loss = float(total_loss) / args.log_interval
            cur_oe_loss = float(total_oe_loss) / args.log_interval
            elapsed = time.time() - start_time
            print(
                '| epoch {:3d} | {:5d}/{:5d} batches | lr {:05.5f} | ms/batch {:5.2f} | '
                'loss {:5.2f} | oe_loss {:5.2f} | ppl {:8.2f} | bpc {:8.3f}'.
                format(epoch, batch,
                       len(train_data) // args.bptt,
                       optimizer.param_groups[0]['lr'],
                       elapsed * 1000 / args.log_interval, cur_loss,
                       cur_oe_loss, math.exp(cur_loss),
                       cur_loss / math.log(2)))
            total_loss = 0
            total_oe_loss = 0
            start_time = time.time()
        batch += 1
################
img_shape = [32, 32, 3]
tf.reset_default_graph()

# Image batch fed to the encoder, plus a min-max rescaled copy of it.
inputs = tf.placeholder(
    tf.float32,
    shape=[None] + img_shape,
    name='encoder_input',
)
_inputs_shifted = tf.subtract(inputs, tf.reduce_min(inputs))
_inputs_range = tf.subtract(tf.reduce_max(inputs), tf.reduce_min(inputs))
inputs_norm = tf.div(_inputs_shifted, _inputs_range)

# Fed to both encoder and decoder; defaults to 1.0 when nothing is supplied.
drop_prob = tf.placeholder_with_default(1.0, shape=())

## ENCODER
means, log_scales = model.gaussian_encoder(
    inputs, FLAGS.latent_size, drop_prob)  # (?, 4, 4, 8)
codes = model.gaussian_sample(means, log_scales)  # (?, 4, 4, 8)
tf.identity(codes, name='encoder_output')

## DECODER
outputs = model.decoder(codes, drop_prob)
tf.identity(outputs, name='decoder_output')

# calculate loss with learnable parameter for output log_scale
with tf.name_scope('loss') as scope:
    reconstruction_loss, latent_loss = util.vae_loss(
        inputs, outputs, means, log_scales, 'bernoulli')
    total_loss = reconstruction_loss + tf.reduce_mean(latent_loss)

################
# Training VAE #
################
global_step_tensor = tf.get_variable(
    'global_step',
    trainable=False,
    shape=[],
    initializer=tf.zeros_initializer,
)
argparser.add_argument('--draw', action='store_true', help='whether draw output') args = argparser.parse_args() model_path = os.path.join('model', args.model + '.ckpt') x = tf.placeholder(tf.float32, [None, 28 * 28]) global_step = tf.Variable(0, name='global_step', trainable=False) mnist = read_data_sets('tmp/MNIST_data') with tf.Session() as sess: c, _ = model.encoder(x) x_, _ = model.decoder(c) loss = model.loss(x, x_) sess.run(tf.global_variables_initializer()) saver = tf.train.Saver() saver.restore(sess, model_path) print('"%s" loaded' % (model_path)) eval_x_, eval_loss, step = sess.run([x_, loss, global_step], feed_dict={x: mnist.test.images}) print('loss: %g' % (eval_loss)) if args.draw: dirpath = os.path.join('tmp', args.model, str(step)) if not os.path.exists(dirpath):
def train(z_dim=None, model_name=None):
    """
    Build the adversarial-autoencoder graph and run its training loop.

    The function loads two arrays through ``datasets.create_datasets``, builds
    encoder / decoder / discriminator sub-graphs (train-mode and test-mode
    variants that share variables via ``reuse=True``), and then, for
    ``EPOCHS`` epochs, runs the optimizers on mini-batches of the first array
    and evaluates the ``*_test`` losses on mini-batches of the second array.
    Every 50 training batches it saves a checkpoint, writes a TensorBoard
    summary, and appends the current losses to ``<log_path>/log.txt``.

    :param z_dim: width of the latent code; used for the prior / decoder
        placeholders, for sampling the Gaussian prior, and in the dataset
        task name ("aae_wgan_<z_dim>").
    :param model_name: only printed when training metrics are logged.
    :return: does not return anything
    """
    # NOTE(review): block nesting in this function was reconstructed from a
    # whitespace-collapsed source; confirm the extent of the `with` blocks and
    # the placement of the generator step against the original file.
    X_train, y_train = datasets.create_datasets(retrain=0,
                                                task="aae_wgan_" + str(z_dim),
                                                num_aug=0)
    batch_size = BATCH_SIZE
    # Images are fed as [batch, input_dim, input_dim, 1] (see placeholders).
    input_dim = X_train.shape[-1]
    with tf.device("/gpu:0"):
        # The session is created here and used for the whole function; it is
        # not closed within the visible code.
        sess = tf.Session(config=tf.ConfigProto(allow_soft_placement=True))
        x_input = tf.placeholder(dtype=tf.float32,
                                 shape=[batch_size, input_dim, input_dim, 1],
                                 name='Input')
        x_target = tf.placeholder(dtype=tf.float32,
                                  shape=[batch_size, input_dim, input_dim, 1],
                                  name='Target')
        # Samples from the prior that the discriminator treats as "real".
        real_distribution = tf.placeholder(dtype=tf.float32,
                                           shape=[batch_size, z_dim],
                                           name='Real_distribution')
        # Only referenced by the commented-out `decoder_image` line below.
        decoder_input = tf.placeholder(dtype=tf.float32,
                                       shape=[1, z_dim],
                                       name='Decoder_input')
        # First call of each network creates the variables (reuse=False);
        # every later call shares them (reuse=True).
        encoder_output = encoder(x_input, reuse=False, is_train=True)
        encoder_output_test = encoder(x_input, reuse=True, is_train=False)
        d_fake, d_fake_logits = discriminator(encoder_output, reuse=False)
        d_real, d_real_logits = discriminator(real_distribution, reuse=True)
        # NOTE(review): the "test" discriminator and decoder below are fed
        # `encoder_output` (is_train=True), not `encoder_output_test` -
        # confirm whether that is intended.
        d_fake_test, d_fake_logits_test = discriminator(encoder_output,
                                                        reuse=True)
        d_real_test, d_real_logits_test = discriminator(real_distribution,
                                                        reuse=True)
        decoder_output, std = decoder(encoder_output,
                                      reuse=False,
                                      is_train=True)
        # Re-encode the reconstruction; used for the latent consistency loss.
        encoder_output_z = encoder(decoder_output, reuse=True, is_train=False)
        decoder_output_test, std_ = decoder(encoder_output,
                                            reuse=True,
                                            is_train=False)
        encoder_output_z_test = encoder(decoder_output_test,
                                        reuse=True,
                                        is_train=False)
        #decoder_image = decoder(decoder_input, reuse=True, is_train=False)

        # Autoencoder loss: per-sample L2 distance between reconstruction and
        # target (sum of squares over axes 1-3, then sqrt), averaged over the
        # batch. The 1e-8 keeps the sqrt argument strictly positive.
        # summed = tf.reduce_mean(tf.square(decoder_output-x_target),[1,2,3])
        summed = tf.reduce_sum(tf.square(decoder_output - x_target),
                               [1, 2, 3])
        # sqrt_summed = summed
        sqrt_summed = tf.sqrt(summed + 1e-8)
        autoencoder_loss = tf.reduce_mean(sqrt_summed)
        summed_test = tf.reduce_sum(tf.square(decoder_output_test - x_target),
                                    [1, 2, 3])
        # sqrt_summed_test = summed_test
        sqrt_summed_test = tf.sqrt(summed_test + 1e-8)
        autoencoder_loss_test = tf.reduce_mean(sqrt_summed_test)

        # l2 loss of z: squared distance between the code of the input and
        # the code of its reconstruction, summed over axis 1 and averaged.
        enc = tf.reduce_sum(tf.square(encoder_output - encoder_output_z), [1])
        encoder_l2loss = tf.reduce_mean(enc)
        enc_test = tf.reduce_sum(
            tf.square(encoder_output_test - encoder_output_z_test), [1])
        encoder_l2loss_test = tf.reduce_mean(enc_test)

        # Critic loss: mean(real logits - fake logits); the gradient penalty
        # is added to the training variant only.
        dc_loss = tf.reduce_mean(d_real_logits - d_fake_logits)
        dc_loss_test = tf.reduce_mean(d_real_logits_test - d_fake_logits_test)
        with tf.name_scope("Gradient_penalty"):
            # Per-sample mixing coefficient, fed from np.random.rand below.
            eta = tf.placeholder(tf.float32, shape=[batch_size, 1], name="Eta")
            interp = eta * real_distribution + (1 - eta) * encoder_output
            _, c_interp = discriminator(interp, reuse=True)
            # taking the zeroth and only element because tf.gradients returns a list
            c_grads = tf.gradients(c_interp, interp)[0]
            # L2 norm, reshaping to [batch_size]
            slopes = tf.sqrt(tf.reduce_sum(tf.square(c_grads), axis=[1]))
            tf.summary.histogram("Critic gradient L2 norm", slopes)
            # Penalise deviation of the critic's gradient norm from 1.
            grad_penalty = tf.reduce_mean((slopes - 1)**2)
            lambd = 10.0
            dc_loss += lambd * grad_penalty

        # Generator loss
        # generator_loss = tf.reduce_mean(
        #     tf.nn.sigmoid_cross_entropy_with_logits(labels=tf.ones_like(d_fake), logits=d_fake_logits))
        generator_loss = tf.reduce_mean(d_fake_logits)
        generator_loss_test = tf.reduce_mean(d_fake_logits_test)

        all_variables = tf.trainable_variables()
        dc_var = tl.layers.get_variables_with_name('Discriminator', True, True)
        en_var = tl.layers.get_variables_with_name('Encoder', True, True)
        #print en_var
        # dc_var = [var for var in all_variables if 'dc' in var.name]
        # en_var = [var for var in all_variables if 'encoder' in var.name]
        # Only the gradient w.r.t. the FIRST variable of each list is kept
        # ([0]); these tensors feed the gradient histograms further down.
        var_grad_autoencoder = tf.gradients(autoencoder_loss, all_variables)[0]
        var_grad_discriminator = tf.gradients(dc_loss, dc_var)[0]
        var_grad_generator = tf.gradients(generator_loss, en_var)[0]

    # Optimizers
    with tf.device("/gpu:0"):
        # Reconstruction + 0.5 * latent-consistency loss, over all variables.
        autoencoderl2_optimizer = tf.train.AdamOptimizer(
            learning_rate=LR, beta1=0.5,
            beta2=0.9).minimize(autoencoder_loss + 0.5 * encoder_l2loss)
        # Built but not run in the loop below (its sess.run calls are
        # commented out).
        autoencoder_optimizer = tf.train.AdamOptimizer(
            learning_rate=LR, beta1=0.5, beta2=0.9).minimize(autoencoder_loss)
        discriminator_optimizer = tf.train.AdamOptimizer(
            learning_rate=LR, beta1=0.5,
            beta2=0.9).minimize(dc_loss, var_list=dc_var)
        generator_optimizer = tf.train.AdamOptimizer(learning_rate=LR,
                                                     beta1=0.5,
                                                     beta2=0.9).minimize(
                                                         generator_loss,
                                                         var_list=en_var)

    tl.layers.initialize_global_variables(sess)

    # Reshape images to display them
    input_images = tf.reshape(x_input, [-1, input_dim, input_dim, 1])
    generated_images = tf.reshape(decoder_output,
                                  [-1, input_dim, input_dim, 1])
    # generated_images = tf.reshape(decoder_output, [-1, 28, 28, 1])
    tensorboard_path, saved_model_path, log_path, folder_name = form_results(
    )
    # bp()
    writer = tf.summary.FileWriter(logdir=tensorboard_path, graph=sess.graph)

    # Tensorboard visualization
    tf.summary.scalar(name='Autoencoder Loss', tensor=autoencoder_loss)
    tf.summary.scalar(name='Autoencoder Test Loss',
                      tensor=autoencoder_loss_test)
    tf.summary.scalar(name='Discriminator Loss', tensor=dc_loss)
    tf.summary.scalar(name='Generator Loss', tensor=generator_loss)
    tf.summary.scalar(name='Autoencoder z Loss', tensor=encoder_l2loss)
    tf.summary.histogram(name='Encoder Distribution', values=encoder_output)
    tf.summary.histogram(name='Real Distribution', values=real_distribution)
    tf.summary.histogram(name='Gradient AE', values=var_grad_autoencoder)
    tf.summary.histogram(name='Gradient D', values=var_grad_discriminator)
    tf.summary.histogram(name='Gradient G', values=var_grad_generator)
    tf.summary.image(name='Input Images', tensor=input_images, max_outputs=10)
    tf.summary.image(name='Generated Images',
                     tensor=generated_images,
                     max_outputs=10)
    summary_op = tf.summary.merge_all()

    saver = tf.train.Saver()  # Saving the model
    # Global batch counter across epochs; used as checkpoint / summary step.
    step = 0
    # with tf.Session() as sess:
    # Record the run configuration once, at the top of the (appended) log.
    with open(log_path + '/log.txt', 'a') as log:
        log.write("input_dim: {}\n".format(input_dim))
        log.write("z_dim: {}\n".format(z_dim))
        log.write("batch_size: {}\n".format(batch_size))
        log.write("\n")

    for i in range(EPOCHS):
        # ---- training pass over X_train ----
        b = 0
        # The targets array is a dummy of zeros; only the inputs are used.
        for batch in tl.iterate.minibatches(inputs=X_train,
                                            targets=np.zeros(X_train.shape),
                                            batch_size=batch_size,
                                            shuffle=True):
            # Fresh standard-normal prior sample for this batch.
            z_real_dist = np.random.normal(0, 1, (batch_size, z_dim)) * 1.
            z_real_dist = z_real_dist.astype("float32")
            batch_x, _ = batch
            # Append the trailing channel axis expected by the placeholders.
            batch_x = batch_x[:, :, :, np.newaxis]
            #lambda_x = np.max(lambda_grow_max / np.float(i), lambda_grow_max)
            sess.run(autoencoderl2_optimizer,
                     feed_dict={
                         x_input: batch_x,
                         x_target: batch_x
                     })
            if i < 20:
                # First 20 epochs: 10 * 20 = 200 critic updates per batch.
                # sess.run(autoencoder_optimizer, feed_dict={x_input: batch_x, x_target: batch_x})
                for t in range(10):
                    for _ in range(20):
                        eta1 = np.random.rand(
                            batch_size,
                            1)  # sampling from uniform distribution
                        eta1 = eta1.astype("float32")
                        sess.run(discriminator_optimizer,
                                 feed_dict={
                                     x_input: batch_x,
                                     x_target: batch_x,
                                     real_distribution: z_real_dist,
                                     eta: eta1
                                 })
            else:
                # Afterwards: 20 critic updates per batch.
                # sess.run(autoencoderl2_optimizer, feed_dict={x_input: batch_x, x_target: batch_x})
                for _ in range(20):
                    eta1 = np.random.rand(
                        batch_size, 1)  # sampling from uniform distribution
                    eta1 = eta1.astype("float32")
                    sess.run(discriminator_optimizer,
                             feed_dict={
                                 x_input: batch_x,
                                 x_target: batch_x,
                                 real_distribution: z_real_dist,
                                 eta: eta1
                             })
            # NOTE(review): placed after the if/else so that one generator
            # step runs per batch in every epoch; the collapsed source does
            # not show whether it belonged to the `else` branch only.
            sess.run(generator_optimizer,
                     feed_dict={
                         x_input: batch_x,
                         x_target: batch_x
                     })
            if b % 50 == 0:
                # Fetches only loss / gradient / output tensors (no optimizer
                # ops); `eta1` is the last value drawn for the critic above.
                a_loss, e_loss, d_loss, g_loss, a_grad, d_grad, g_grad, en_output, d_real_logits_, d_fake_logits_, de_output, summary = sess.run(
                    [
                        autoencoder_loss, encoder_l2loss, dc_loss,
                        generator_loss, var_grad_autoencoder,
                        var_grad_discriminator, var_grad_generator,
                        encoder_output, d_real_logits, d_fake_logits,
                        decoder_output, summary_op
                    ],
                    feed_dict={
                        x_input: batch_x,
                        x_target: batch_x,
                        real_distribution: z_real_dist,
                        eta: eta1
                    })
                print(model_name)
                saver.save(sess, save_path=saved_model_path, global_step=step)
                writer.add_summary(summary, global_step=step)
                print("Epoch: {}, iteration: {}".format(i, b))
                print("Autoencoder Loss: {}".format(a_loss))
                print("Autoencoder enc Loss: {}".format(e_loss))
                print("Discriminator Loss: {}".format(d_loss))
                print("Generator Loss: {}".format(g_loss))
                with open(log_path + '/log.txt', 'a') as log:
                    log.write("Epoch: {}, iteration: {}\n".format(i, b))
                    log.write("Autoencoder Loss: {}\n".format(a_loss))
                    log.write("Autoencoder enc Loss: {}\n".format(e_loss))
                    log.write("Discriminator Loss: {}\n".format(d_loss))
                    log.write("Generator Loss: {}\n".format(g_loss))
            b += 1
            step += 1

        # ---- evaluation pass over the second array (named y_train) ----
        b = 0
        for batch in tl.iterate.minibatches(inputs=y_train,
                                            targets=np.zeros(y_train.shape),
                                            batch_size=batch_size,
                                            shuffle=True):
            z_real_dist = np.random.normal(0, 1, (batch_size, z_dim)) * 1.
            z_real_dist = z_real_dist.astype("float32")
            batch_x, _ = batch
            batch_x = batch_x[:, :, :, np.newaxis]
            eta1 = np.random.rand(batch_size, 1)
            # NOTE(review): `b` is not incremented anywhere in this loop in
            # the visible code, so this condition holds for every batch and
            # the logged iteration is always 0 - confirm whether a trailing
            # `b += 1` is missing.
            if b % 20 == 0:
                # Only the test-mode losses are fetched; no optimizer runs.
                a_loss, e_loss, d_loss, g_loss = sess.run(
                    [
                        autoencoder_loss_test, encoder_l2loss_test,
                        dc_loss_test, generator_loss_test
                    ],
                    feed_dict={
                        x_input: batch_x,
                        x_target: batch_x,
                        real_distribution: z_real_dist,
                        eta: eta1
                    })
                print("v_Epoch: {}, iteration: {}".format(i, b))
                print("v_Autoencoder Loss: {}".format(a_loss))
                print("v_Autoencoder enc Loss: {}".format(e_loss))
                print("v_Discriminator Loss: {}".format(d_loss))
                print("v_Generator Loss: {}".format(g_loss))
                with open(log_path + '/log.txt', 'a') as log:
                    log.write("v_Epoch: {}, iteration: {}\n".format(i, b))
                    log.write("v_Autoencoder Loss: {}\n".format(a_loss))
                    log.write("v_Autoencoder enc Loss: {}\n".format(e_loss))
                    log.write("v_Discriminator Loss: {}\n".format(d_loss))
                    log.write("v_Generator Loss: {}\n".format(g_loss))
def train(epoch):
    """Run one training epoch of the encoder/decoder captioning model.

    Relies on module-level ``model``, ``train_loader``, ``optimizer``,
    ``args`` and ``evaluate``. For every batch the caption tokens are packed
    into a flat target vector, the encoder/decoder are run, and a
    cross-entropy step is taken. Every ``args.log_interval`` batches the
    validation loss over 10 batches is computed and a progress line printed.
    A summary line with the epoch averages is printed at the end.

    Args:
        epoch: Epoch number; only used in the printed progress lines.
    """
    # model is a class that inherits nn.Module
    # This puts the model in train mode as opposed to eval mode, so it knows
    # which one to use.
    print("check 5")
    model.encoder.train()
    model.decoder.train()
    print("check 6")

    # The loss module holds no per-batch state, so build it once.
    criterion = torch.nn.CrossEntropyLoss()

    # Running sums are kept as plain Python numbers: accumulating the loss
    # tensor itself would keep every batch's autograd graph alive.
    cum_train_loss = 0.0
    cum_val_loss = 0.0
    num_batches = 0
    num_val_evals = 0

    for batch_idx, batch in enumerate(train_loader):
        # Read images and their target captions in the current batch.
        images, captions, lengths = Variable(batch[0]), Variable(
            batch[1]), batch[2]
        # Flatten the padded captions into the packed token sequence that the
        # decoder output is compared against.
        targets = pack_padded_sequence(captions, lengths, batch_first=True)[0]

        # Load the current training images onto the GPU if requested.
        # NOTE(review): captions/targets are left where the loader put them,
        # as in the original - confirm the decoder handles device placement.
        if args.cuda:
            images = images.cuda()

        features = model.encoder(images)
        output = model.decoder(features, captions, lengths)
        loss = criterion(output, targets)

        model.decoder.zero_grad()
        model.encoder.zero_grad()
        loss.backward()
        optimizer.step()

        train_loss = loss.item()
        cum_train_loss += train_loss
        num_batches += 1

        # Adjust the printing frequency with the --log-interval option.
        if batch_idx % args.log_interval == 0:
            # Average validation loss on the first 10 validation batches.
            val_loss = evaluate('val', n_batches=10)
            # Number of examples seen so far in this epoch.
            examples_this_epoch = batch_idx * len(images)
            # Progress through the epoch, in percent of batches.
            epoch_progress = 100. * batch_idx / len(train_loader)
            print('Train Epoch: {} [{}/{} ({:.0f}%)]\t'
                  'Train Loss: {:.6f}\tVal Loss:{:.6f}\t'.format(
                      epoch, examples_this_epoch, len(train_loader.dataset),
                      epoch_progress, train_loss, val_loss))
            cum_val_loss += val_loss
            num_val_evals += 1

    # Validation is only evaluated on logging steps, so average over the
    # number of evaluations, not over the number of training batches. An
    # empty loader yields NaN averages instead of a NameError.
    nan = float('nan')
    avg_val_loss = cum_val_loss / num_val_evals if num_val_evals else nan
    avg_train_loss = cum_train_loss / num_batches if num_batches else nan
    print('Train Epoch: {}\t'
          'Avg Train Loss: {:.6f}\t Val Loss:{:.6f}\t'.format(
              epoch, avg_train_loss, avg_val_loss))
orgin = tf.reshape(x, (x.shape[0], -1)) reconstruct_loss = 0.0005*tf.reduce_mean(tf.square(orgin-decoded)) total_loss = margin_loss+reconstruct_loss return total_loss if __name__ == "__main__": g = tf.get_default_graph() ds, ds_val = mnist_dataset() iterator = ds.make_one_shot_iterator() next_x, next_y = iterator.get_next() batch_x = tf.placeholder_with_default(next_x, shape=[100, 28, 28, 1]) batch_y = tf.placeholder_with_default(next_y, shape=[100, 10]) logits, caps_out = capsnet(batch_x) decoded = decoder(caps_out, batch_y) """ define loss """ loss = calc_loss(logits, caps_out, batch_x, batch_y, decoded) """ define summary """ acc_op, acc = tf.metrics.accuracy(tf.argmax(batch_y, -1), tf.argmax(logits, -1)) tf.summary.scalar('loss', loss) tf.summary.scalar('acc', acc) tf.summary.image('reconstruction_img', tf.reshape(decoded, (100, 28, 28, 1))) summ = tf.summary.merge_all() """ define train op """ steps = tf.train.get_or_create_global_step(g) train_op = tf.train.AdamOptimizer().minimize(loss, global_step=steps) config = tf.ConfigProto() config.gpu_options.allow_growth = True with tf.Session(config=config) as sess:
# Initialize image batch
# Pre-allocated containers sized from the command-line options; they are
# created here and moved to the GPU below when CUDA is enabled.
imBatch = Variable(torch.FloatTensor(opt.batchSize, 3, 300, 300))
labelBatch = Variable(
    torch.FloatTensor(opt.batchSize, opt.numClasses, 300, 300))
maskBatch = Variable(torch.FloatTensor(opt.batchSize, 1, 300, 300))
labelIndexBatch = Variable(torch.LongTensor(opt.batchSize, 1, 300, 300))

# Initialize network: pick the encoder/decoder pair matching the requested
# architecture variant (dilation takes precedence over SPP).
if opt.isDilation:
    encoder = model.encoderDilation()
    decoder = model.decoderDilation()
elif opt.isSpp:
    encoder = model.encoderSPP()
    decoder = model.decoderSPP()
else:
    encoder = model.encoder()
    decoder = model.decoder()

# Restore the weights saved for the requested epoch. `map_location` keeps the
# deserialised tensors on the CPU, so a checkpoint written from a GPU run can
# still be loaded when running with noCuda on a machine without CUDA; the
# modules are moved to the GPU explicitly further down.
encoder.load_state_dict(
    torch.load('%s/encoder_%d.pth' % (opt.modelRoot, opt.epochId),
               map_location=lambda storage, loc: storage))
decoder.load_state_dict(
    torch.load('%s/decoder_%d.pth' % (opt.modelRoot, opt.epochId),
               map_location=lambda storage, loc: storage))
# Evaluation mode for both networks.
encoder = encoder.eval()
decoder = decoder.eval()

# Move network and containers to gpu
if not opt.noCuda:
    imBatch = imBatch.cuda(opt.gpuId)
    labelBatch = labelBatch.cuda(opt.gpuId)
    labelIndexBatch = labelIndexBatch.cuda(opt.gpuId)
    maskBatch = maskBatch.cuda(opt.gpuId)
    encoder = encoder.cuda(opt.gpuId)
    decoder = decoder.cuda(opt.gpuId)
# get data xtr, ytr, xte, yte = mnist_1000(args.mnist_path) # placeholders x = tf.placeholder(tf.float32, [None, 784]) n_train_batches = int(1000/args.batch_size) n_test_batches = int(1000/args.batch_size) # models net = autoencoder(x, args.zdim, True) # train tnet = autoencoder(x, args.zdim, False, reuse=True) # test # for visualization z = tf.placeholder(tf.float32, [None, args.zdim]) tennet = encoder(x, args.zdim, reuse=True) # test encoder tdenet = decoder(z, reuse=True) # test decoder def train(): loss = -net['elbo'] # negative ELBO global_step = tf.train.get_or_create_global_step() lr = tf.train.piecewise_constant(tf.cast(global_step, tf.int32), [int(n_train_batches*args.n_epochs/2)], [1e-3, 1e-4]) train_op = tf.train.AdamOptimizer(lr).minimize(loss, global_step=global_step) saver = tf.train.Saver(net['weights']) logfile = open(os.path.join(savedir, 'train.log'), 'w') sess = tf.Session() sess.run(tf.global_variables_initializer())
def _collect_program_parameters(program):
    """Return, as a list, the Parameter variables read by ops in block 0."""
    block = program.block(0)
    var_names = []
    for op in block.ops:
        var_names += op.input_arg_names
    # Materialise a real list (not a lazy filter/map iterator) so the result
    # can be iterated more than once under Python 3.
    candidates = (block.var(var_name) for var_name in var_names)
    return [
        var for var in candidates
        if isinstance(var, fluid.framework.Parameter)
    ]


def py_infer(test_data, trg_idx2word, use_wordpiece):
    """
    Inference by beam search implemented in Python, while the calculations
    from symbols to probabilities are executed by Fluid operators.

    Separate encoder and decoder programs are built, their parameters are
    loaded from ``InferTaskConfig.model_path``, and every batch produced by
    ``test_data.batch_generator()`` is translated with ``translate_batch``.
    Each post-processed output sequence is printed on its own line.

    :param test_data: object exposing ``batch_generator()``; the first item
        of each sample is passed to ``translate_batch``.
    :param trg_idx2word: target-side index-to-word lookup used for printing.
    :param use_wordpiece: when true, sequences are rendered with
        ``util.subword_ids_to_str``; otherwise words are joined with spaces.
    """
    place = fluid.CUDAPlace(0) if InferTaskConfig.use_gpu else fluid.CPUPlace()
    exe = fluid.Executor(place)

    encoder_program = fluid.Program()
    with fluid.program_guard(main_program=encoder_program):
        enc_output = encoder(
            ModelHyperParams.src_vocab_size, ModelHyperParams.max_length + 1,
            ModelHyperParams.n_layer, ModelHyperParams.n_head,
            ModelHyperParams.d_key, ModelHyperParams.d_value,
            ModelHyperParams.d_model, ModelHyperParams.d_inner_hid,
            ModelHyperParams.dropout, ModelHyperParams.weight_sharing)

    decoder_program = fluid.Program()
    with fluid.program_guard(main_program=decoder_program):
        predict = decoder(
            ModelHyperParams.trg_vocab_size, ModelHyperParams.max_length + 1,
            ModelHyperParams.n_layer, ModelHyperParams.n_head,
            ModelHyperParams.d_key, ModelHyperParams.d_value,
            ModelHyperParams.d_model, ModelHyperParams.d_inner_hid,
            ModelHyperParams.dropout, ModelHyperParams.weight_sharing)

    # Load model parameters of encoder and decoder separately from the saved
    # transformer model.
    encoder_params = _collect_program_parameters(encoder_program)
    decoder_params = _collect_program_parameters(decoder_program)
    fluid.io.load_vars(exe, InferTaskConfig.model_path, vars=encoder_params)
    fluid.io.load_vars(exe, InferTaskConfig.model_path, vars=decoder_params)

    # This is used here to set dropout to the test mode.
    encoder_program = encoder_program.inference_optimize()
    decoder_program = decoder_program.inference_optimize()

    for data in test_data.batch_generator():
        batch_seqs, _ = translate_batch(
            exe,
            [item[0] for item in data],
            encoder_program,
            encoder_data_input_fields + encoder_util_input_fields,
            [enc_output.name],
            decoder_program,
            decoder_data_input_fields[:-1] + decoder_util_input_fields +
            (decoder_data_input_fields[-1], ),
            [predict.name],
            InferTaskConfig.beam_size,
            InferTaskConfig.max_out_len,
            InferTaskConfig.n_best,
            len(data),
            ModelHyperParams.n_head,
            ModelHyperParams.d_model,
            ModelHyperParams.eos_idx,  # Use eos_idx to pad.
            ModelHyperParams.eos_idx,  # Use eos_idx to pad.
            ModelHyperParams.bos_idx,
            ModelHyperParams.eos_idx,
            ModelHyperParams.unk_idx,
            output_unk=InferTaskConfig.output_unk)
        for sample_seqs in batch_seqs:
            # Post-process the beam-search decoded sequences.
            for seq in map(post_process_seq, sample_seqs):
                if use_wordpiece:
                    print(util.subword_ids_to_str(seq, trg_idx2word))
                else:
                    print(" ".join([trg_idx2word[idx] for idx in seq]))
width=opt.width, keep_ratio=opt.keep_ratio)) val_dataset = dataset.listDataset(list_file=opt.valList, transform=dataset.resizeNormalize( (opt.width, opt.height))) nclass = len(alphabet) + 3 # decoder的时候,需要的类别数,3 for SOS,EOS和blank nc = 1 converter = utils.strLabelConverterForAttention(alphabet) image = torch.FloatTensor(opt.batchSize, 3, opt.width, opt.height) criterion = torch.nn.NLLLoss() # 最后的输出要为log_softmax encoder = model.encoder(opt.height, nc=nc, nh=256) decoder = model.decoder(nh=256, nclass=nclass, dropout_p=0.1) # continue training or use the pretrained model to initial the parameters of the encoder and decoder encoder.apply(weights_init) decoder.apply(weights_init) if opt.encoder: print('loading pretrained encoder model from %s' % opt.encoder) encoder.load_state_dict(torch.load(opt.encoder)) if opt.decoder: print('loading pretrained decoder model from %s' % opt.decoder) decoder.load_state_dict(torch.load(opt.decoder)) if opt.loadModelEpoch > 0: encoder_path = 'model/encoder_%d.pth' % opt.loadModelEpoch print('loading pretrained encoder model from %s' % encoder_path) encoder.load_state_dict(torch.load(encoder_path)) decoder_path = 'model/decoder_%d.pth' % opt.loadModelEpoch
if torch.cuda.is_available() and opt.noCuda:
    print(
        "WARNING: You have a CUDA device, so you should probably run with --cuda"
    )

# Initialize image batch
# Pre-allocated containers sized from the command-line options.
imBatch = Variable(torch.FloatTensor(opt.batchSize, 3, 300, 300))
labelBatch = Variable(
    torch.FloatTensor(opt.batchSize, opt.numClasses, 300, 300))
maskBatch = Variable(torch.FloatTensor(opt.batchSize, 1, 300, 300))
labelIndexBatch = Variable(torch.LongTensor(opt.batchSize, 1, 300, 300))

# Initialize network
# Checkpoints are deserialised onto the CPU (`map_location`) so that weights
# saved from a GPU run can also be loaded when running with noCuda on a host
# without CUDA; `load_state_dict` copies the values into the module
# parameters wherever those live.
encoder_normal = model.encoder()
decoder_normal = model.decoder()
model_root_normal = '/datasets/cse152-252-sp20-public/unet_checkpoints/unet_original_zq'
epoch_id_normal = 181
encoder_normal.load_state_dict(
    torch.load('%s/encoder_%d.pth' % (model_root_normal, epoch_id_normal),
               map_location=lambda storage, loc: storage))
decoder_normal.load_state_dict(
    torch.load('%s/decoder_%d.pth' % (model_root_normal, epoch_id_normal),
               map_location=lambda storage, loc: storage))
# Evaluation mode for the baseline pair.
encoder_normal = encoder_normal.eval()
decoder_normal = decoder_normal.eval()

# Dilation variant of the same architecture, restored from its own checkpoint
# directory and epoch.
encoder_dilation = model.encoderDilation()
decoder_dilation = model.decoderDilation()
model_root_dilation = '/datasets/cse152-252-sp20-public/unet_checkpoints/unet_original_zq_dilation'
epoch_id_dilation = 180
encoder_dilation.load_state_dict(
    torch.load('%s/encoder_%d.pth' % (model_root_dilation, epoch_id_dilation),
               map_location=lambda storage, loc: storage))
def train():
    """Run one epoch of language-model training with the span objective.

    Relies on module-level ``args``, ``model``, ``model_r``, ``model_mlp``,
    ``corpus``, ``train_data``, ``optimizer``, ``criterion`` and ``epoch``.
    Each step draws a randomised BPTT length, rescales the learning rate by
    that length, and optimises the sum of:

    * the cross-entropy of the decoded RNN output (``raw_loss``),
    * activation regularisation (``args.alpha``) and temporal activation
      regularisation (``args.beta``) on the last RNN layer,
    * a span term weighted by ``args.theta``: a negative-sampling loss when
      ``args.ns`` is set, otherwise the mean squared difference between the
      context and span embeddings.

    Progress is printed every ``args.log_interval`` batches.
    """
    # Turn on training mode which enables dropout.
    if args.model == 'QRNN':
        model.reset()
    # Kept as a Python float: raw_loss is a 0-dim tensor, which cannot be
    # indexed with [0] on current PyTorch.
    total_loss = 0.0
    start_time = time.time()
    ntokens = len(corpus.dictionary)
    hidden = model.init_hidden(args.batch_size)
    batch, i = 0, 0
    while i < train_data.size(0) - 1 - 1:
        bptt = args.bptt if np.random.random() < 0.95 else args.bptt / 2.
        # Prevent excessively small or negative sequence lengths
        seq_len = max(5, int(np.random.normal(bptt, 5)))
        # There's a very small chance that it could select a very long
        # sequence length resulting in OOM
        seq_len = min(seq_len, args.bptt + 10)

        # Scale the learning rate with the sampled length; the original value
        # is restored after the optimizer step.
        lr2 = optimizer.param_groups[0]['lr']
        optimizer.param_groups[0]['lr'] = lr2 * seq_len / args.bptt
        model.train()
        model_r.train()
        model_mlp.train()
        data, targets, _ = get_batch(train_data, i, args, seq_len=seq_len)
        # NOTE(review): `data_long` is fetched with identical arguments and is
        # not used below; the call is kept in case get_batch has side effects.
        data_long, _, _ = get_batch(train_data, i, args, seq_len=seq_len)
        seq_len_data = data.size(0)

        # Starting each batch, we detach the hidden state from how it was
        # previously produced. If we didn't, the model would try
        # backpropagating all the way to start of the dataset.
        hidden = repackage_hidden(hidden)
        optimizer.zero_grad()

        output, hidden, rnn_hs, dropped_rnn_hs = model(data,
                                                       hidden,
                                                       return_h=True)
        output = model.decoder(output)

        # Span embedding: attention-weighted sum of the input embeddings,
        # projected by the MLP.
        input_emb = model.encoder(data)
        attention, seq_len_data, reg_len = model_r(input_emb, seq_len_data)
        span_emb = (input_emb.unsqueeze(0) * attention).sum(1)
        span_emb = model_mlp(span_emb)

        raw_loss = criterion(output.view(-1, ntokens), targets)
        loss = raw_loss
        # Activation Regularization
        loss = loss + sum(args.alpha * dropped_rnn_h.pow(2).mean()
                          for dropped_rnn_h in dropped_rnn_hs[-1:])
        # Temporal Activation Regularization (slowness)
        loss = loss + sum(args.beta * (rnn_h[1:] - rnn_h[:-1]).pow(2).mean()
                          for rnn_h in rnn_hs[-1:])

        # Context comes from the second-to-last (dropped) RNN layer, cut to
        # the length reported by model_r.
        context_emb = dropped_rnn_hs[-2][:seq_len_data]
        if args.ns:
            span_emb_t = span_emb.transpose(0, 1)
            pos_loss = (1 - (context_emb * span_emb).sum(2).sigmoid()).mean()

            # One negative sample: rotate the span embeddings along the batch
            # axis and then along the sequence axis.
            split_idx_batch = int(torch.randint(args.batch_size, []))
            least_ns_seq = 0
            if split_idx_batch == 0:
                # No batch rotation: force a minimum sequence rotation.
                least_ns_seq = 10 if data.size(0) > 15 else int(
                    data.size(0) / 2)
            split_idx_seq = int(torch.randint(least_ns_seq, data.size(0), []))
            span_emb_neg = torch.cat(
                [span_emb_t[split_idx_batch:], span_emb_t[:split_idx_batch]],
                0).transpose(0, 1)
            span_emb_neg = torch.cat(
                [span_emb_neg[split_idx_seq:], span_emb_neg[:split_idx_seq]],
                0)
            neg_loss = (context_emb * span_emb_neg).sum(2).sigmoid().mean()

            loss = loss + args.theta * (pos_loss + neg_loss)  # + 1e-6 * reg_len
        else:
            loss = loss + args.theta * (context_emb - span_emb).pow(2).mean()
        loss.backward()

        # `clip_grad_norm_` helps prevent the exploding gradient problem in
        # RNNs / LSTMs (the un-suffixed name is deprecated).
        torch.nn.utils.clip_grad_norm_(model.parameters(), args.clip)
        optimizer.step()

        total_loss += raw_loss.item()
        optimizer.param_groups[0]['lr'] = lr2
        if batch % args.log_interval == 0 and batch > 0:
            cur_loss = total_loss / args.log_interval
            elapsed = time.time() - start_time
            print(
                '| epoch {:3d} | {:5d}/{:5d} batches | lr {:02.2f} | ms/batch {:5.2f} | '
                'loss {:5.2f} | ppl {:8.2f}'.format(
                    epoch, batch,
                    len(train_data) // args.bptt,
                    optimizer.param_groups[0]['lr'],
                    elapsed * 1000 / args.log_interval, cur_loss,
                    math.exp(cur_loss)))
            total_loss = 0.0
            start_time = time.time()
        ###
        batch += 1
        i += seq_len
def create_generation_batch(model, num_words, random_choice_frequency,
                            trunc_size, bs, bptt, prompts, params, TEXT):
    """
    Generate a batch of musical samples, primed with human-composed prompts.

    The model is first fed the tokenized prompts one timestep at a time, then
    run for ``num_words`` further steps; at each step the single most likely
    token is taken for every sample and fed back in as the next input.

    Input:
      model - pretrained generator model
      num_words - number of steps to generate
      random_choice_frequency - intended rate of random (non-top) choices,
        range 0 to 1. NOTE: currently unused, the sampling code is disabled
        and the top choice is always taken.
      trunc_size - intended cut-off for the random choice (range 1 to
        vocab_size). NOTE: currently unused, see above.
      bs - batch size - number of samples to generate
      bptt - back prop through time - size of prompt (in tokens)
      prompts - a list of training or test folder texts
      params - parameters of the generator model; ``params["n_tok"]`` (the
        vocab size) is the ``k`` passed to ``topk``
      TEXT - holds vocab word to index dictionary

    Output:
      musical_prompts - the randomly selected prompts that were used to prime
        the model (these are human-composed samples)
      results - the generated samples, one string per batch entry; every
        token is followed by a single space
    """
    with torch.no_grad():
        hidden = model.init_hidden(bs)
        musical_prompts = generate_musical_prompts(prompts, bptt, bs)
        # One list of text pieces per sample; joined into strings at the end.
        pieces = [[] for _ in range(bs)]
        model.eval()

        # Tokenize prompts and translate them to indices for input into model
        tokenized = [
            music_tokenizer(prompt)[:bptt] for prompt in musical_prompts
        ]
        prompt_indices = TEXT.numericalize(tokenized)

        print("Prompting network")
        # Feed the prompt one timestep at a time (each `step_indices` holds
        # the indices of all prompts at that timestep)
        for step_indices in prompt_indices:
            res, hidden = model(step_indices.unsqueeze(0).cuda(), hidden)

        print("Generating new sample")
        for _ in range(num_words):
            res = model.decoder(res)
            # Rank the whole vocabulary for every sample and keep the best
            # index (column 0) as the next token.
            _, ranked = res.topk(params["n_tok"])
            w = ranked[:, 0]

            # Translate each chosen index back to a word (itos is index to
            # string) and append it to that sample's output.
            for j in range(bs):
                pieces[j].append(TEXT.vocab.itos[w[j].item()] + " ")

            # Feed all the predicted words from this timestep into the
            # model, in order to get predictions for the next step
            res, hidden = model(w.unsqueeze(0).cuda(), hidden)

        results = ["".join(sample_pieces) for sample_pieces in pieces]
        return musical_prompts, results
# ----------------- calculate the number of batches per epoch -------------------- batch_per_ep = input_file.shape[ 0] // batch_size # batch per epoch will be 40 [input total= 400 / 10 ] ae_inputs = tf.placeholder(tf.float32, (None, 32, 32, 32, 1), name="encoder_input") # input to the network #dicForShape = tf.placeholder(tf.string, shape=None, name="volume_name") # ---------for variational auto encoder(this has to be commented when simple auto encoder model is used) -------------- #z_mean, z_std, l_space = md.encoder(ae_inputs) # ---------for simple auto encoder(this has to be commented when variational model is used) -------------- l_space = md.encoder(ae_inputs, dim_of_z) # --------- Output from decoder --------------------- ae_outputs = md.decoder(l_space) # ----------------- calculate the loss and optimize variational auto encoder network ------------------------ #generation_loss = -tf.reduce_sum(ae_inputs * tf.log(1e-8 + ae_outputs) + (1-ae_inputs) * tf.log(1e-8 + 1 - ae_outputs), 1) #latent_loss = 0.5 * tf.reduce_sum(tf.square(z_mean) + tf.square(z_std) - tf.log(tf.square(z_std)) - 1,1) # Voxel-Wise Reconstruction Loss # Note that the output values are clipped to prevent the BCE from evaluating log(0). '''ae_outputs = tf.clip_by_value(ae_outputs, 1e-8, 1 - 1e-8) bce_loss = tf.reduce_sum(weighted_binary_crossentropy(ae_outputs, ae_inputs), [1,2]) bce_loss = tf.reduce_mean(bce_loss) # KL Divergence from isotropic gaussian prior kl_div = 0.5 * tf.reduce_sum(tf.square(z_mean) + tf.square(z_std) - tf.log(1e-8 + tf.square(z_std)) - 1, [1]) kl_div = tf.reduce_mean(kl_div)