def cal_metrics(self, fmt_str=False):
    """
    Calculate metrics
    :param fmt_str: whether to return a formatted string for logging
    """
    eval_samples = self.gen.sample(cfg.samples_num, 4 * cfg.batch_size)
    self.gen_data.reset(eval_samples)
    new_gen_tokens = tensor_to_tokens(eval_samples, self.index_word_dict)
    self.bleu.test_text = new_gen_tokens
    self.self_bleu.real_text = new_gen_tokens
    self.self_bleu.test_text = tensor_to_tokens(self.gen.sample(200, 200), self.index_word_dict)

    # BLEU-[2, 3, 4, 5]
    bleu_score = self.bleu.get_score(ignore=False)

    # Self-BLEU
    self_bleu_score = self.self_bleu.get_score(ignore=False)

    # NLL_gen
    gen_nll = self.eval_gen(self.gen, self.train_data.loader, self.mle_criterion)

    if fmt_str:
        return 'BLEU-%s = %s, gen_NLL = %.4f, self_bleu = %s' % (
            self.bleu.gram, bleu_score, gen_nll, self_bleu_score)
    return bleu_score, gen_nll, self_bleu_score
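# A minimal sketch of a helper like `tensor_to_tokens`, which the metric code
# above relies on: it maps a batch of word-index tensors to lists of token
# strings. This is an illustrative assumption, not the repository's actual
# implementation; the stop condition (here the padding index) and the
# dictionary's key type may differ in the real codebase.
def tensor_to_tokens_sketch(samples, dictionary, padding_idx=0):
    """Convert a (batch_size x seq_len) LongTensor of indices to token lists."""
    all_tokens = []
    for sent in samples:
        sent_tokens = []
        for idx in sent.tolist():
            if idx == padding_idx:  # assumed: stop at padding
                break
            sent_tokens.append(dictionary[str(idx)])  # assumed: str-keyed dict
        all_tokens.append(sent_tokens)
    return all_tokens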
def __init__(self, opt):
    super(RelGANInstructor, self).__init__(opt)

    # generator, discriminator
    self.gen = RelGAN_G(cfg.mem_slots, cfg.num_heads, cfg.head_size, cfg.gen_embed_dim, cfg.gen_hidden_dim,
                        cfg.vocab_size, cfg.max_seq_len, cfg.padding_idx, gpu=cfg.CUDA)
    self.dis = RelGAN_D(cfg.dis_embed_dim, cfg.max_seq_len, cfg.num_rep, cfg.vocab_size, cfg.padding_idx,
                        gpu=cfg.CUDA)
    self.init_model()

    # Optimizer
    self.gen_opt = optim.Adam(self.gen.parameters(), lr=cfg.gen_lr)
    self.gen_adv_opt = optim.Adam(self.gen.parameters(), lr=cfg.gen_adv_lr)
    self.dis_opt = optim.Adam(self.dis.parameters(), lr=cfg.dis_lr)

    # Criterion
    self.mle_criterion = nn.NLLLoss()
    self.adv_criterion = nn.BCEWithLogitsLoss()

    # DataLoader
    self.gen_data = GenDataIter(self.gen.sample(cfg.batch_size, cfg.batch_size))

    # Metrics
    self.bleu = BLEU(test_text=tensor_to_tokens(self.gen_data.target, self.index_word_dict),
                     real_text=tensor_to_tokens(self.test_data.target, self.test_data.index_word_dict),
                     gram=[2, 3, 4, 5])
    self.self_bleu = BLEU(test_text=tensor_to_tokens(self.gen_data.target, self.index_word_dict),
                          real_text=tensor_to_tokens(self.gen_data.target, self.index_word_dict),
                          gram=3)
def cal_metrics_with_label(self, label_i):
    assert type(label_i) == int, 'label index must be an int'

    with torch.no_grad():
        # Prepare data for evaluation
        eval_samples = self.gen.sample(cfg.samples_num, 8 * cfg.batch_size, label_i=label_i)
        gen_data = GenDataIter(eval_samples)
        gen_tokens = tensor_to_tokens(eval_samples, self.idx2word_dict)
        gen_tokens_s = tensor_to_tokens(self.gen.sample(200, 200, label_i=label_i), self.idx2word_dict)
        clas_data = CatClasDataIter([eval_samples], label_i)

        # Reset metrics
        self.bleu.reset(test_text=gen_tokens, real_text=self.test_data_list[label_i].tokens)
        self.nll_gen.reset(self.gen, self.train_data_list[label_i].loader, label_i)
        self.nll_div.reset(self.gen, gen_data.loader, label_i)
        self.self_bleu.reset(test_text=gen_tokens_s, real_text=gen_tokens)
        self.clas_acc.reset(self.clas, clas_data.loader)
        self.ppl.reset(gen_tokens)

    return [metric.get_score() for metric in self.all_metrics]
def cal_metrics(self, fmt_str=False):
    """
    Calculate metrics
    :param fmt_str: whether to return a formatted string for logging
    """
    with torch.no_grad():
        # Prepare data for evaluation
        eval_samples = self.gen.sample(cfg.samples_num, 4 * cfg.batch_size)
        gen_data = GenDataIter(eval_samples)
        gen_tokens = tensor_to_tokens(eval_samples, self.idx2word_dict)
        gen_tokens_s = tensor_to_tokens(self.gen.sample(200, 200), self.idx2word_dict)

        # Reset metrics
        self.bleu.reset(test_text=gen_tokens, real_text=self.test_data.tokens)
        self.nll_gen.reset(self.gen, self.train_data.loader)
        self.nll_div.reset(self.gen, gen_data.loader)
        self.self_bleu.reset(test_text=gen_tokens_s, real_text=gen_tokens)
        self.ppl.reset(gen_tokens)

    if fmt_str:
        return ', '.join(['%s = %s' % (metric.get_name(), metric.get_score())
                          for metric in self.all_metrics])
    return [metric.get_score() for metric in self.all_metrics]
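# The metric objects used above (bleu, nll_gen, nll_div, self_bleu, ppl) share a
# small interface: reset(...) re-binds the data to score, get_score() computes
# the value, and get_name() labels it for logging. A hedged sketch of that
# interface, inferred purely from the call sites in cal_metrics; the
# repository's real base class may differ.
class MetricSketch:
    def __init__(self, name):
        self.name = name

    def get_name(self):
        return self.name

    def reset(self, *args, **kwargs):
        # Re-bind the texts / model / loader to evaluate.
        raise NotImplementedError

    def get_score(self):
        # Compute and return the metric value.
        raise NotImplementedError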
def __init__(self, opt):
    super(LeakGANInstructor, self).__init__(opt)

    # generator, discriminator
    self.gen = LeakGAN_G(cfg.gen_embed_dim, cfg.gen_hidden_dim, cfg.vocab_size, cfg.max_seq_len,
                         cfg.padding_idx, cfg.goal_size, cfg.step_size, cfg.CUDA)
    self.dis = LeakGAN_D(cfg.dis_embed_dim, cfg.vocab_size, cfg.padding_idx, gpu=cfg.CUDA)
    self.init_model()

    # optimizer
    mana_params, work_params = self.gen.split_params()
    mana_opt = optim.Adam(mana_params, lr=cfg.gen_lr)
    work_opt = optim.Adam(work_params, lr=cfg.gen_lr)
    self.gen_opt = [mana_opt, work_opt]
    self.dis_opt = optim.Adam(self.dis.parameters(), lr=cfg.dis_lr)

    # Criterion
    self.mle_criterion = nn.NLLLoss()
    self.dis_criterion = nn.BCEWithLogitsLoss()

    # DataLoader
    self.gen_data = GenDataIter(self.gen.sample(cfg.batch_size, cfg.batch_size, self.dis))
    self.dis_data = DisDataIter(self.gen_data.random_batch()['target'],
                                self.train_data.random_batch()['target'])

    # Metrics
    self.bleu = BLEU(test_text=tensor_to_tokens(self.gen_data.target, self.index_word_dict),
                     real_text=tensor_to_tokens(self.test_data.target, self.test_data.index_word_dict),
                     gram=3)
    self.self_bleu = BLEU(test_text=tensor_to_tokens(self.gen_data.target, self.index_word_dict),
                          real_text=tensor_to_tokens(self.gen_data.target, self.index_word_dict),
                          gram=3)
def cal_metrics(self, fmt_str=False):
    with torch.no_grad():
        # Prepare data for evaluation
        eval_samples = self.gen.sample(cfg.samples_num, cfg.batch_size, self.dis)
        gen_data = GenDataIter(eval_samples)
        gen_tokens = tensor_to_tokens(eval_samples, self.idx2word_dict)
        gen_tokens_s = tensor_to_tokens(self.gen.sample(200, cfg.batch_size, self.dis), self.idx2word_dict)

        # Reset metrics
        self.bleu.reset(test_text=gen_tokens, real_text=self.test_data.tokens)
        self.nll_gen.reset(self.gen, self.train_data.loader, leak_dis=self.dis)
        self.nll_div.reset(self.gen, gen_data.loader, leak_dis=self.dis)
        self.self_bleu.reset(test_text=gen_tokens_s, real_text=gen_tokens)
        self.ppl.reset(gen_tokens)

    if fmt_str:
        return ', '.join(['%s = %s' % (metric.get_name(), metric.get_score())
                          for metric in self.all_metrics])
    return [metric.get_score() for metric in self.all_metrics]
def __init__(self, opt):
    super(SeqGANInstructor, self).__init__(opt)

    # generator, discriminator
    self.gen = SeqGAN_G(cfg.gen_embed_dim, cfg.gen_hidden_dim, cfg.vocab_size, cfg.max_seq_len,
                        cfg.padding_idx, cfg.temperature, gpu=cfg.CUDA)
    self.dis = SeqGAN_D(cfg.dis_embed_dim, cfg.vocab_size, cfg.padding_idx, gpu=cfg.CUDA)
    self.init_model()

    # Optimizer
    self.gen_opt = optim.Adam(self.gen.parameters(), lr=cfg.gen_lr)
    self.gen_adv_opt = optim.Adam(self.gen.parameters(), lr=cfg.gen_lr)
    self.dis_opt = optim.Adam(self.dis.parameters(), lr=cfg.dis_lr)

    # Criterion
    self.mle_criterion = nn.NLLLoss()
    self.dis_criterion = nn.CrossEntropyLoss()

    # DataLoader
    self.gen_data = GenDataIter(self.gen.sample(cfg.batch_size, cfg.batch_size))
    self.dis_data = DisDataIter(self.train_data.random_batch()['target'],
                                self.gen_data.random_batch()['target'])

    # Metrics
    self.bleu = BLEU(test_text=tensor_to_tokens(self.gen_data.target, self.index_word_dict),
                     real_text=tensor_to_tokens(self.test_data.target, self.test_data.index_word_dict),
                     gram=3)
    self.self_bleu = BLEU(test_text=tensor_to_tokens(self.gen_data.target, self.index_word_dict),
                          real_text=tensor_to_tokens(self.gen_data.target, self.index_word_dict),
                          gram=3)
def __init__(self, opt):
    super(RelbarGANInstructor, self).__init__(opt)

    # generator, discriminator
    self.gen = RelbarGAN_G(cfg.mem_slots, cfg.num_heads, cfg.head_size, cfg.gen_embed_dim, cfg.gen_hidden_dim,
                           cfg.vocab_size, cfg.max_seq_len, cfg.padding_idx, cfg.temperature, cfg.eta,
                           gpu=cfg.CUDA)
    self.dis = RelbarGAN_D(cfg.dis_embed_dim, cfg.max_seq_len, cfg.num_rep, cfg.vocab_size, cfg.padding_idx,
                           gpu=cfg.CUDA)
    self.init_model()

    # Optimizer: the adversarial optimizer also updates the temperature and eta parameters
    self.gen_opt = optim.Adam(self.gen.parameters(), lr=cfg.gen_lr)
    self.gen_adv_opt = optim.Adam(itertools.chain(self.gen.parameters(), [self.gen.temperature, self.gen.eta]),
                                  lr=cfg.gen_adv_lr)
    self.dis_opt = optim.Adam(self.dis.parameters(), lr=cfg.dis_lr)

    # Criterion
    self.mle_criterion = nn.NLLLoss()
    self.dis_pretrain_criterion = nn.BCEWithLogitsLoss()

    # DataLoader
    self.gen_data = GenDataIter(self.gen.sample(cfg.batch_size, cfg.batch_size))
    self.dis_data = DisDataIter(self.train_data.random_batch()['target'],
                                self.gen_data.random_batch()['target'])

    # Metrics: cap the n-gram orders when sequences are shorter than 5 tokens
    bleu_gram = list(range(2, cfg.max_seq_len + 1)) if cfg.max_seq_len < 5 else [2, 3, 4, 5]
    self.bleu = BLEU(test_text=tensor_to_tokens(self.gen_data.target, self.index_word_dict),
                     real_text=tensor_to_tokens(self.test_data.target, self.test_data.index_word_dict),
                     gram=bleu_gram)
    self.self_bleu = BLEU(test_text=tensor_to_tokens(self.gen_data.target, self.index_word_dict),
                          real_text=tensor_to_tokens(self.gen_data.target, self.index_word_dict),
                          gram=3)
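# Worked example of the bleu_gram selection above, with illustrative
# max_seq_len values: short sequences cap the n-gram order so BLEU never
# asks for n-grams longer than the sequences themselves.
for max_seq_len, expected in [(4, [2, 3, 4]), (20, [2, 3, 4, 5])]:
    bleu_gram = list(range(2, max_seq_len + 1)) if max_seq_len < 5 else [2, 3, 4, 5]
    assert bleu_gram == expected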
def cal_metrics(self, fmt_str=False):
    self.gen_data.reset(self.gen.sample(cfg.samples_num, cfg.batch_size, self.dis))
    self.bleu.test_text = tensor_to_tokens(self.gen_data.target, self.index_word_dict)
    bleu_score = self.bleu.get_score(ignore=False)

    with torch.no_grad():
        gen_nll = 0
        for data in self.train_data.loader:
            inp, target = data['input'], data['target']
            if cfg.CUDA:
                inp, target = inp.cuda(), target.cuda()
            loss = self.gen.batchNLLLoss(target, self.dis)
            gen_nll += loss.item()
        gen_nll /= len(self.train_data.loader)

    if fmt_str:
        return 'BLEU-3 = %.4f, gen_NLL = %.4f' % (bleu_score[0], gen_nll)
    return bleu_score, gen_nll
def _save(self, phase, epoch):
    """Save model state dict and generator's samples"""
    if phase != 'ADV':
        torch.save(self.gen.state_dict(), cfg.save_model_root + 'gen_{}_{:05d}.pt'.format(phase, epoch))
    save_sample_path = cfg.save_samples_root + 'samples_{}_{:05d}.txt'.format(phase, epoch)
    samples = self.gen.sample(10000, cfg.batch_size)
    write_tokens(save_sample_path, tensor_to_tokens(samples, self.idx2word_dict))
def __init__(self, opt):
    super(LeakGANInstructor, self).__init__(opt)

    # generator, discriminator
    self.gen = LeakGAN_G(cfg.gen_embed_dim, cfg.gen_hidden_dim, cfg.vocab_size, cfg.max_seq_len,
                         cfg.padding_idx, cfg.goal_size, cfg.step_size, cfg.CUDA)
    self.dis = LeakGAN_D(cfg.dis_embed_dim, cfg.vocab_size, cfg.padding_idx, gpu=cfg.CUDA)

    # LSTM language model
    self.corpus = dataa.Corpus('dataset/emnlp_news/')
    self.lstm = LSTM.RNNModel('LSTM', len(self.corpus.dictionary), 200, 600, 3, 0.2, False)

    if cfg.CUDA:
        self.dis.cuda()
        self.gen.cuda()
    self.init_model()

    # optimizer
    mana_params, work_params = self.gen.split_params()
    mana_opt = optim.Adam(mana_params, lr=cfg.gen_lr)
    work_opt = optim.Adam(work_params, lr=cfg.gen_lr)
    self.gen_opt = [mana_opt, work_opt]
    self.dis_opt = optim.Adam(self.dis.parameters(), lr=cfg.dis_lr)

    # Criterion
    self.mle_criterion = nn.NLLLoss()
    self.dis_criterion = nn.CrossEntropyLoss()

    # DataLoader
    self.gen_data = GenDataIter(self.gen.sample(cfg.batch_size, cfg.batch_size, self.dis))
    self.dis_data = DisDataIter(self.gen_data.random_batch()['target'],
                                self.oracle_data.random_batch()['target'])

    # Metrics
    self.bleu3 = BLEU(test_text=tensor_to_tokens(self.gen_data.target, self.index_word_dict),
                      real_text=tensor_to_tokens(self.test_data.target, self.index_word_dict),
                      gram=3)
def _save(self, phase, epoch):
    """Save model state dict and generator's samples"""
    for i in range(cfg.k_label):
        if phase != 'ADV':
            torch.save(self.gen_list[i].state_dict(),
                       cfg.save_model_root + 'gen{}_{}_{:05d}.pt'.format(i, phase, epoch))
        save_sample_path = cfg.save_samples_root + 'samples_d{}_{}_{:05d}.txt'.format(i, phase, epoch)
        samples = self.gen_list[i].sample(cfg.batch_size, cfg.batch_size)
        write_tokens(save_sample_path, tensor_to_tokens(samples, self.idx2word_dict))
def _save(self, phase, epoch):
    """Save model state dict and generator's samples"""
    torch.save(self.gen.state_dict(), cfg.save_model_root + 'gen_{}_{:05d}.pt'.format(phase, epoch))
    save_sample_path = cfg.save_samples_root + 'samples_{}_{:05d}.txt'.format(phase, epoch)
    samples = self.gen.sample(cfg.batch_size, cfg.batch_size, self.dis)
    write_tokens(save_sample_path, tensor_to_tokens(samples, self.idx2word_dict))
def _save(self, phase, epoch, label_i=None):
    """Save model state dict and generator's samples for one label"""
    assert type(label_i) == int, 'label index must be an int'
    torch.save(self.gen.state_dict(), cfg.save_model_root + 'gen_{}_{:05d}.pt'.format(phase, epoch))
    save_sample_path = cfg.save_samples_root + 'samples_c{}_{}_{:05d}.txt'.format(label_i, phase, epoch)
    samples = self.gen.sample(cfg.batch_size, cfg.batch_size, label_i=label_i)
    write_tokens(save_sample_path, tensor_to_tokens(samples, self.idx2word_dict))
def cal_metrics(self, fmt_str=False):
    self.gen_data.reset(self.gen.sample(cfg.samples_num, 4 * cfg.batch_size))
    self.bleu3.test_text = tensor_to_tokens(self.gen_data.target, self.index_word_dict)
    bleu3_score = self.bleu3.get_score(ignore=True)
    gen_nll = self.eval_gen(self.gen, self.oracle_data.loader, self.mle_criterion)

    if fmt_str:
        return 'BLEU-3 = %.4f, gen_NLL = %.4f' % (bleu3_score, gen_nll)
    return bleu3_score, gen_nll
def evaluation(self, eval_type):
    """Evaluate all children and update child scores. Note that the eval data should be the same."""
    eval_samples = self.gen.sample(cfg.eval_b_num * cfg.batch_size, cfg.max_bn * cfg.batch_size)
    gen_data = GenDataIter(eval_samples)

    # Fd: diversity term (NLL_div)
    if cfg.lambda_fd != 0:
        Fd = NLL.cal_nll(self.gen, gen_data.loader, self.mle_criterion)
    else:
        Fd = 0

    # Fq: quality term
    if eval_type == 'standard':
        Fq = self.eval_d_out_fake.mean().cpu().item()
    elif eval_type == 'rsgan':
        g_loss, d_loss = get_losses(self.eval_d_out_real, self.eval_d_out_fake, 'rsgan')
        Fq = d_loss.item()
    elif 'bleu' in eval_type:
        self.bleu.reset(test_text=tensor_to_tokens(eval_samples, self.idx2word_dict))
        if cfg.lambda_fq != 0:
            Fq = self.bleu.get_score(given_gram=int(eval_type[-1]))
        else:
            Fq = 0
    elif 'Ra' in eval_type:
        g_loss = torch.sigmoid(self.eval_d_out_fake - torch.mean(self.eval_d_out_real)).sum()
        Fq = g_loss.item()
    else:
        raise NotImplementedError("Evaluation '%s' is not implemented" % eval_type)

    score = cfg.lambda_fq * Fq + cfg.lambda_fd * Fd
    return Fq, Fd, score
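# Worked example of the score combination at the end of evaluation(), using
# illustrative weights (assumed values, not the repository's cfg defaults):
# a quality term Fq and a diversity term Fd blend linearly into one child score.
lambda_fq, lambda_fd = 1.0, 0.01  # assumed weights
Fq, Fd = 0.65, 1.40               # e.g. a BLEU-based quality and an NLL_div diversity
score = lambda_fq * Fq + lambda_fd * Fd  # = 0.664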
def leakGAN_layer(self, secret_file, final_file, bins_num):
    # Second layer: LeakGAN layer
    print('>>> Begin Second Layer...')
    data_root = './decode/'
    torch.nn.Module.dump_patches = True
    epoch_start_time = time.time()

    # Set the random seed manually for reproducibility.
    seed = 1111
    torch.manual_seed(seed)

    # Step 1: load the most accurate model
    with open("leakGAN_instructor/real_data/gen_ADV_00028.pt", 'rb') as f:
        self.gen.load_state_dict(torch.load(f))
    print("Finish Loading")
    self.gen.eval()

    # Step 2: get the intermediate text
    with open(data_root + secret_file, 'r') as sf:
        secret_data = sf.read().split()

    # Step 3: compress the secret words into a binary string
    # (the LSTM corpus provides the word-to-index mapping)
    bit_string = ''
    emnlp_data = 'dataset/emnlp_news/'
    corpus = dataa.Corpus(emnlp_data)
    for data in secret_data:
        print("Data: {}".format(data))
        idWord = corpus.dictionary.word2idx[data]
        bit_string += '{0:{fill}13b}'.format(int(idWord), fill='0')
    secret_text = [int(i, 2) for i in self.string2bins(bit_string, bins_num)]  # convert to bins

    corpus_leak = self.index_word_dict
    if bins_num >= 2:
        ntokens = len(corpus_leak)
        tokens = list(range(ntokens))  # * args.replication_factor
        random.shuffle(tokens)

        # Words in each bin; leftovers are distributed among the first bins
        words_in_bin = int(ntokens / bins_num)
        leftover = int(ntokens % bins_num)
        bins = [tokens[i:i + words_in_bin]
                for i in range(0, ntokens - leftover, words_in_bin)]  # words to keep in each bin
        for i in range(0, leftover):
            bins[i].append(tokens[i + words_in_bin * bins_num])

        # Save the bins as the LeakGAN key
        key2 = data_root + 'leakGAN_key.txt'
        with open(key2, "wb") as fp:  # pickling
            pickle.dump(bins, fp)

        # For each bin, the complementary indices that must be masked out
        zero = [list(set(tokens) - set(bin_)) for bin_ in bins]

    print('Finished Initializing Second LeakGAN Layer')
    print('time: {:5.2f}s'.format(time.time() - epoch_start_time))
    print('-' * 89)

    out_file = data_root + final_file
    w = 0
    i = 1
    bin_sequence_length = len(secret_text)
    print("bin sequence length", bin_sequence_length)

    batch_size = cfg.batch_size
    seq_len = cfg.max_seq_len
    feature_array = torch.zeros((batch_size, seq_len + 1, self.gen.goal_out_size))
    goal_array = torch.zeros((batch_size, seq_len + 1, self.gen.goal_out_size))
    leak_out_array = torch.zeros((batch_size, seq_len + 1, cfg.vocab_size))
    samples = torch.zeros(batch_size, seq_len + 1).long()
    work_hidden = self.gen.init_hidden(batch_size)
    mana_hidden = self.gen.init_hidden(batch_size)
    leak_inp = torch.LongTensor([cfg.start_letter] * batch_size)
    real_goal = self.gen.goal_init[:batch_size, :]

    if cfg.CUDA:
        feature_array = feature_array.cuda()
        goal_array = goal_array.cuda()
        leak_out_array = leak_out_array.cuda()

    goal_array[:, 0, :] = real_goal  # g0 = goal_init
    if_sample = True
    no_log = False
    index = cfg.start_letter

    while i <= seq_len:
        dis_inp = torch.zeros(batch_size, bin_sequence_length).long()
        if i > 1:
            dis_inp[:, :i - 1] = samples[:, :i - 1]  # cut sentences
            leak_inp = samples[:, i - 2]
        if torch.cuda.is_available():
            dis_inp = dis_inp.cuda()
            leak_inp = leak_inp.cuda()
        feature = self.dis.get_feature(dis_inp).unsqueeze(0)
        feature_array[:, i - 1, :] = feature.squeeze(0)
        out, cur_goal, work_hidden, mana_hidden = self.gen(index, leak_inp, work_hidden, mana_hidden,
                                                           feature, real_goal, no_log=no_log, train=False)
        leak_out_array[:, i - 1, :] = out
        goal_array[:, i, :] = cur_goal.squeeze(1)
        if i > 0 and i % self.gen.step_size == 0:
            real_goal = torch.sum(goal_array[:, i - 3:i + 1, :], dim=1)
            if i / self.gen.step_size == 1:
                real_goal += self.gen.goal_init[:batch_size, :]

        # Sample one token
        if not no_log:
            out = torch.exp(out)
        # Indices that must be zeroed out, as they are not in the current bin
        zero_index = torch.LongTensor(zero[secret_text[i - 1]])
        if cfg.CUDA:
            zero_index = zero_index.cuda()
        temperature = 1.5
        word_weights = out
        word_weights = word_weights.index_fill_(1, zero_index, 0)  # mask words outside the bin
        word_weights = torch.multinomial(word_weights, 1).view(-1)  # sample one word per sequence from the masked distribution
        samples[:, i] = word_weights
        leak_inp = word_weights

        i += 1
        w += 1

    leak_out_array = leak_out_array[:, :seq_len, :]
    tokens = []
    write_tokens(out_file, tensor_to_tokens(samples, self.index_word_dict))
    print("Generated final steganographic text")
    print("Final steganographic text saved in following file: {}".format(out_file))
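# `string2bins` is called above but not defined in this section. A minimal
# sketch of what it plausibly does, assuming bins_num is a power of two: split
# the bit string into fixed-width chunks of log2(bins_num) bits, so each chunk
# indexes one of the bins_num bins. The repository's actual helper may pad or
# truncate trailing bits differently.
import math

def string2bins_sketch(bit_string, bins_num):
    bits_per_chunk = int(math.log2(bins_num))
    return [bit_string[i:i + bits_per_chunk]
            for i in range(0, len(bit_string), bits_per_chunk)]

# Example: with bins_num = 4 (2 bits per chunk),
# string2bins_sketch('011011', 4) == ['01', '10', '11'].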