def __init__(self, opt):
    super(RelGANInstructor, self).__init__(opt)

    # generator, discriminator
    self.gen = RelGAN_G(cfg.mem_slots, cfg.num_heads, cfg.head_size, cfg.gen_embed_dim, cfg.gen_hidden_dim,
                        cfg.vocab_size, cfg.max_seq_len, cfg.padding_idx, gpu=cfg.CUDA)
    self.dis = RelGAN_D(cfg.dis_embed_dim, cfg.max_seq_len, cfg.num_rep, cfg.vocab_size, cfg.padding_idx,
                        gpu=cfg.CUDA)
    self.init_model()

    # Optimizer
    self.gen_opt = optim.Adam(self.gen.parameters(), lr=cfg.gen_lr)
    self.gen_adv_opt = optim.Adam(self.gen.parameters(), lr=cfg.gen_adv_lr)
    self.dis_opt = optim.Adam(self.dis.parameters(), lr=cfg.dis_lr)

    # Criterion
    self.mle_criterion = nn.NLLLoss()
    self.adv_criterion = nn.BCEWithLogitsLoss()

    # DataLoader
    self.gen_data = GenDataIter(self.gen.sample(cfg.batch_size, cfg.batch_size))

    # Metrics
    self.bleu = BLEU(test_text=tensor_to_tokens(self.gen_data.target, self.index_word_dict),
                     real_text=tensor_to_tokens(self.test_data.target, self.test_data.index_word_dict),
                     gram=[2, 3, 4, 5])
    self.self_bleu = BLEU(test_text=tensor_to_tokens(self.gen_data.target, self.index_word_dict),
                          real_text=tensor_to_tokens(self.gen_data.target, self.index_word_dict),
                          gram=3)
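# A minimal sketch (not the project's adv_train_generator, which is not shown here) of how the
# adv_criterion above could drive one adversarial generator step. It assumes
# gen.sample(..., one_hot=True) returns differentiable Gumbel-softmax outputs so gradients reach
# the generator; that keyword and the helper name are illustrative assumptions.
def relgan_g_step_sketch(gen, dis, adv_criterion, gen_adv_opt):
    gen_samples = gen.sample(cfg.batch_size, cfg.batch_size, one_hot=True)  # assumed signature
    d_out_fake = dis(gen_samples)
    # the generator is rewarded when its fake logits are classified as real (label 1)
    g_loss = adv_criterion(d_out_fake, torch.ones_like(d_out_fake))
    gen_adv_opt.zero_grad()
    g_loss.backward()
    gen_adv_opt.step()
    return g_loss.item()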
def LSTM():
    # Step 1: Read data from files and put them into lists
    test_sentences = list()
    with open(args.test_data, 'r') as f:
        for line in f:
            test_sentences.append(line.replace("<eos>", ""))
    # drop empty lines
    temp = list()
    for i in range(0, len(test_sentences)):
        if test_sentences[i] != '\n':
            temp.append(test_sentences[i])
    test_sentences = temp
    print(test_sentences)

    real_sentences = list()
    with codecs.open(args.real_data, 'r', encoding='utf8', errors='ignore') as f:
        for line in f:
            real_sentences.append(line)

    # Step 2: BLEU score
    print("LSTM - LSTM double layer encoding")
    for i in range(1, args.gram + 1):
        bleu = BLEU(test_sentences, real_sentences, i)
        bleu_score = bleu.get_score(ignore=False)
        print("BLEU{} score: {}".format(i, bleu_score))
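# As a sanity check on the custom BLEU class, a comparable corpus-level score can be computed with
# NLTK (an assumption: NLTK is not part of this project). Every hypothesis is scored against the
# full pool of real sentences as references, with uniform n-gram weights up to `gram`.
def nltk_bleu_sketch(test_sentences, real_sentences, gram):
    from nltk.translate.bleu_score import corpus_bleu, SmoothingFunction
    references = [s.split() for s in real_sentences]
    hypotheses = [s.split() for s in test_sentences]
    weights = tuple(1.0 / gram for _ in range(gram))
    # each hypothesis shares the same reference pool
    return corpus_bleu([references] * len(hypotheses), hypotheses,
                       weights=weights, smoothing_function=SmoothingFunction().method1)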
def __init__(self, opt):
    super(LeakGANInstructor, self).__init__(opt)

    # generator, discriminator
    self.gen = LeakGAN_G(cfg.gen_embed_dim, cfg.gen_hidden_dim, cfg.vocab_size, cfg.max_seq_len,
                         cfg.padding_idx, cfg.goal_size, cfg.step_size, cfg.CUDA)
    self.dis = LeakGAN_D(cfg.dis_embed_dim, cfg.vocab_size, cfg.padding_idx, gpu=cfg.CUDA)
    self.init_model()

    # optimizer
    mana_params, work_params = self.gen.split_params()
    mana_opt = optim.Adam(mana_params, lr=cfg.gen_lr)
    work_opt = optim.Adam(work_params, lr=cfg.gen_lr)
    self.gen_opt = [mana_opt, work_opt]
    self.dis_opt = optim.Adam(self.dis.parameters(), lr=cfg.dis_lr)

    # Criterion
    self.mle_criterion = nn.NLLLoss()
    self.dis_criterion = nn.BCEWithLogitsLoss()

    # DataLoader
    self.gen_data = GenDataIter(self.gen.sample(cfg.batch_size, cfg.batch_size, self.dis))
    self.dis_data = DisDataIter(self.gen_data.random_batch()['target'],
                                self.train_data.random_batch()['target'])

    # Metrics
    self.bleu = BLEU(test_text=tensor_to_tokens(self.gen_data.target, self.index_word_dict),
                     real_text=tensor_to_tokens(self.test_data.target, self.test_data.index_word_dict),
                     gram=3)
    self.self_bleu = BLEU(test_text=tensor_to_tokens(self.gen_data.target, self.index_word_dict),
                          real_text=tensor_to_tokens(self.gen_data.target, self.index_word_dict),
                          gram=3)
def __init__(self, opt):
    super(SeqGANInstructor, self).__init__(opt)

    # generator, discriminator
    self.gen = SeqGAN_G(cfg.gen_embed_dim, cfg.gen_hidden_dim, cfg.vocab_size, cfg.max_seq_len,
                        cfg.padding_idx, cfg.temperature, gpu=cfg.CUDA)
    self.dis = SeqGAN_D(cfg.dis_embed_dim, cfg.vocab_size, cfg.padding_idx, gpu=cfg.CUDA)
    self.init_model()

    # Optimizer
    self.gen_opt = optim.Adam(self.gen.parameters(), lr=cfg.gen_lr)
    self.gen_adv_opt = optim.Adam(self.gen.parameters(), lr=cfg.gen_lr)
    self.dis_opt = optim.Adam(self.dis.parameters(), lr=cfg.dis_lr)

    # Criterion
    self.mle_criterion = nn.NLLLoss()
    self.dis_criterion = nn.CrossEntropyLoss()

    # DataLoader
    self.gen_data = GenDataIter(self.gen.sample(cfg.batch_size, cfg.batch_size))
    self.dis_data = DisDataIter(self.train_data.random_batch()['target'],
                                self.gen_data.random_batch()['target'])

    # Metrics
    self.bleu = BLEU(test_text=tensor_to_tokens(self.gen_data.target, self.index_word_dict),
                     real_text=tensor_to_tokens(self.test_data.target, self.test_data.index_word_dict),
                     gram=3)
    self.self_bleu = BLEU(test_text=tensor_to_tokens(self.gen_data.target, self.index_word_dict),
                          real_text=tensor_to_tokens(self.gen_data.target, self.index_word_dict),
                          gram=3)
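# SeqGAN updates the generator with policy gradients: per-token rewards come from Monte-Carlo
# rollouts scored by the discriminator. A REINFORCE-style sketch of one adversarial step (the
# project's adv_train_generator is not shown here; get_reward and batchPGLoss are assumed helper
# names, and gen_data.prepare mirrors the prepare call used in the LeakGAN code below):
def seqgan_adv_step_sketch(gen, dis, gen_data, gen_adv_opt, rollout_func):
    with torch.no_grad():
        samples = gen.sample(cfg.batch_size, cfg.batch_size)
    inp, target = gen_data.prepare(samples, gpu=cfg.CUDA)
    rewards = rollout_func.get_reward(target, cfg.rollout_num, dis)  # MC-search rewards
    pg_loss = gen.batchPGLoss(inp, target, rewards)  # roughly -(log pi(token) * reward).sum()
    gen_adv_opt.zero_grad()
    pg_loss.backward()
    gen_adv_opt.step()
    return pg_loss.item()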
def __init__(self, opt):
    super(RelbarGANInstructor, self).__init__(opt)

    # generator, discriminator
    self.gen = RelbarGAN_G(cfg.mem_slots, cfg.num_heads, cfg.head_size, cfg.gen_embed_dim, cfg.gen_hidden_dim,
                           cfg.vocab_size, cfg.max_seq_len, cfg.padding_idx, cfg.temperature, cfg.eta,
                           gpu=cfg.CUDA)
    self.dis = RelbarGAN_D(cfg.dis_embed_dim, cfg.max_seq_len, cfg.num_rep, cfg.vocab_size, cfg.padding_idx,
                           gpu=cfg.CUDA)
    self.init_model()

    # Optimizer
    self.gen_opt = optim.Adam(self.gen.parameters(), lr=cfg.gen_lr)
    self.gen_adv_opt = optim.Adam(itertools.chain(self.gen.parameters(), [self.gen.temperature, self.gen.eta]),
                                  lr=cfg.gen_adv_lr)
    self.dis_opt = optim.Adam(self.dis.parameters(), lr=cfg.dis_lr)

    # Criterion
    self.mle_criterion = nn.NLLLoss()
    self.dis_pretrain_criterion = nn.BCEWithLogitsLoss()

    # DataLoader
    self.gen_data = GenDataIter(self.gen.sample(cfg.batch_size, cfg.batch_size))
    self.dis_data = DisDataIter(self.train_data.random_batch()['target'],
                                self.gen_data.random_batch()['target'])

    # Metrics
    bleu_gram = list(range(2, cfg.max_seq_len + 1)) if cfg.max_seq_len < 5 else [2, 3, 4, 5]
    self.bleu = BLEU(test_text=tensor_to_tokens(self.gen_data.target, self.index_word_dict),
                     real_text=tensor_to_tokens(self.test_data.target, self.test_data.index_word_dict),
                     gram=bleu_gram)
    self.self_bleu = BLEU(test_text=tensor_to_tokens(self.gen_data.target, self.index_word_dict),
                          real_text=tensor_to_tokens(self.gen_data.target, self.index_word_dict),
                          gram=3)
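# The adversarial optimizer above chains the generator weights with two extra learnable scalars
# (temperature and eta). A self-contained sketch of the same pattern, showing that any
# nn.Parameter can be handed to an optimizer alongside a module's parameters (the toy model and
# loss are illustrative only):
import itertools
import torch
import torch.nn as nn
import torch.optim as optim

model = nn.Linear(8, 8)
temperature = nn.Parameter(torch.tensor(1.0))
eta = nn.Parameter(torch.tensor(0.1))
opt = optim.Adam(itertools.chain(model.parameters(), [temperature, eta]), lr=1e-3)

loss = model(torch.randn(4, 8)).sum() * temperature + eta
opt.zero_grad()
loss.backward()
opt.step()  # updates the linear weights and both scalars together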
def leakGAN():
    # Step 1: Read data from files and put them into lists
    test_sentences = list()
    with codecs.open(args.test_data, 'r', encoding='utf8', errors='ignore') as f:
        for line in f:
            line = line.split(' ', 1)[1]  # remove initial EOS token
            test_sentences.append(line)

    real_sentences = list()
    with codecs.open(args.real_data, 'r', encoding='utf8', errors='ignore') as f:
        for line in f:
            real_sentences.append(line)

    # Step 2: BLEU score
    print("LSTM - LeakGAN double layer encoding")
    for i in range(1, args.gram + 1):
        bleu = BLEU(test_sentences, real_sentences, i)
        bleu_score = bleu.get_score(ignore=False)
        print("BLEU{} score: {}".format(i, bleu_score))
class BasicInstructor: def __init__(self, opt): self.log = create_logger(__name__, silent=False, to_disk=True, log_file=cfg.log_filename if cfg.if_test else [cfg.log_filename, cfg.save_root + 'log.txt']) self.sig = Signal(cfg.signal_file) self.opt = opt self.show_config() self.clas = None # load dictionary self.word2idx_dict, self.idx2word_dict = load_dict(cfg.dataset) # Dataloader try: self.train_data = GenDataIter(cfg.train_data) self.test_data = GenDataIter(cfg.test_data, if_test_data=True) except: pass try: self.train_data_list = [ GenDataIter(cfg.cat_train_data.format(i)) for i in range(cfg.k_label) ] self.test_data_list = [ GenDataIter(cfg.cat_test_data.format(i), if_test_data=True) for i in range(cfg.k_label) ] self.clas_data_list = [ GenDataIter(cfg.cat_test_data.format(str(i)), if_test_data=True) for i in range(cfg.k_label) ] self.train_samples_list = [ self.train_data_list[i].target for i in range(cfg.k_label) ] self.clas_samples_list = [ self.clas_data_list[i].target for i in range(cfg.k_label) ] except: pass # Criterion self.mle_criterion = nn.NLLLoss() self.dis_criterion = nn.CrossEntropyLoss() self.clas_criterion = nn.CrossEntropyLoss() # Optimizer self.clas_opt = None # Metrics self.bleu = BLEU('BLEU', gram=[2, 3, 4, 5], if_use=cfg.use_bleu) self.nll_gen = NLL('NLL_gen', if_use=cfg.use_nll_gen, gpu=cfg.CUDA) self.nll_div = NLL('NLL_div', if_use=cfg.use_nll_div, gpu=cfg.CUDA) self.self_bleu = BLEU('Self-BLEU', gram=[2, 3, 4], if_use=cfg.use_self_bleu) self.clas_acc = ACC(if_use=cfg.use_clas_acc) self.ppl = PPL(self.train_data, self.test_data, n_gram=5, if_use=cfg.use_ppl) self.all_metrics = [ self.bleu, self.nll_gen, self.nll_div, self.self_bleu, self.ppl ] def _run(self): print('Nothing to run in Basic Instructor!') pass def _test(self): pass def init_model(self): if cfg.dis_pretrain: self.log.info('Load pre-trained discriminator: {}'.format( cfg.pretrained_dis_path)) self.dis.load_state_dict(torch.load(cfg.pretrained_dis_path)) if cfg.gen_pretrain: self.log.info('Load MLE pre-trained generator: {}'.format( cfg.pretrained_gen_path)) self.gen.load_state_dict(torch.load(cfg.pretrained_gen_path)) if cfg.CUDA: self.gen = self.gen.cuda() self.dis = self.dis.cuda() def train_gen_epoch(self, model, data_loader, criterion, optimizer): total_loss = 0 for i, data in enumerate(data_loader): inp, target = data['input'], data['target'] if cfg.CUDA: inp, target = inp.cuda(), target.cuda() hidden = model.init_hidden(data_loader.batch_size) pred = model.forward(inp, hidden) loss = criterion(pred, target.view(-1)) self.optimize(optimizer, loss, model) total_loss += loss.item() return total_loss / len(data_loader) def train_dis_epoch(self, model, data_loader, criterion, optimizer): total_loss = 0 total_acc = 0 total_num = 0 for i, data in enumerate(data_loader): inp, target = data['input'], data['target'] if cfg.CUDA: inp, target = inp.cuda(), target.cuda() pred = model.forward(inp) loss = criterion(pred, target) self.optimize(optimizer, loss, model) total_loss += loss.item() total_acc += torch.sum((pred.argmax(dim=-1) == target)).item() total_num += inp.size(0) total_loss /= len(data_loader) total_acc /= total_num return total_loss, total_acc def train_classifier(self, epochs): """ Classifier for calculating the classification accuracy metric of category text generation. Note: the train and test data for the classifier is opposite to the generator. Because the classifier is to calculate the classification accuracy of the generated samples where are trained on self.train_samples_list. 
Since there's no test data in synthetic data (oracle data), the synthetic data experiments doesn't need a classifier. """ import copy # Prepare data for Classifier clas_data = CatClasDataIter(self.clas_samples_list) eval_clas_data = CatClasDataIter(self.train_samples_list) max_acc = 0 best_clas = None for epoch in range(epochs): c_loss, c_acc = self.train_dis_epoch(self.clas, clas_data.loader, self.clas_criterion, self.clas_opt) _, eval_acc = self.eval_dis(self.clas, eval_clas_data.loader, self.clas_criterion) if eval_acc > max_acc: best_clas = copy.deepcopy( self.clas.state_dict()) # save the best classifier max_acc = eval_acc self.log.info( '[PRE-CLAS] epoch %d: c_loss = %.4f, c_acc = %.4f, eval_acc = %.4f, max_eval_acc = %.4f', epoch, c_loss, c_acc, eval_acc, max_acc) self.clas.load_state_dict( copy.deepcopy(best_clas)) # Reload the best classifier @staticmethod def eval_dis(model, data_loader, criterion): total_loss = 0 total_acc = 0 total_num = 0 with torch.no_grad(): for i, data in enumerate(data_loader): inp, target = data['input'], data['target'] if cfg.CUDA: inp, target = inp.cuda(), target.cuda() pred = model.forward(inp) loss = criterion(pred, target) total_loss += loss.item() total_acc += torch.sum((pred.argmax(dim=-1) == target)).item() total_num += inp.size(0) total_loss /= len(data_loader) total_acc /= total_num return total_loss, total_acc @staticmethod def optimize_multi(opts, losses): for i, (opt, loss) in enumerate(zip(opts, losses)): opt.zero_grad() loss.backward(retain_graph=True if i < len(opts) - 1 else False) opt.step() @staticmethod def optimize(opt, loss, model=None, retain_graph=False): opt.zero_grad() loss.backward(retain_graph=retain_graph) if model is not None: torch.nn.utils.clip_grad_norm_(model.parameters(), cfg.clip_norm) opt.step() def show_config(self): self.log.info(100 * '=') self.log.info('> training arguments:') for arg in vars(self.opt): self.log.info('>>> {0}: {1}'.format(arg, getattr(self.opt, arg))) self.log.info(100 * '=') def cal_metrics(self, fmt_str=False): """ Calculate metrics :param fmt_str: if return format string for logging """ with torch.no_grad(): # Prepare data for evaluation eval_samples = self.gen.sample(cfg.samples_num, 4 * cfg.batch_size) gen_data = GenDataIter(eval_samples) gen_tokens = tensor_to_tokens(eval_samples, self.idx2word_dict) gen_tokens_s = tensor_to_tokens(self.gen.sample(200, 200), self.idx2word_dict) # Reset metrics self.bleu.reset(test_text=gen_tokens, real_text=self.test_data.tokens) self.nll_gen.reset(self.gen, self.train_data.loader) self.nll_div.reset(self.gen, gen_data.loader) self.self_bleu.reset(test_text=gen_tokens_s, real_text=gen_tokens) self.ppl.reset(gen_tokens) if fmt_str: return ', '.join([ '%s = %s' % (metric.get_name(), metric.get_score()) for metric in self.all_metrics ]) else: return [metric.get_score() for metric in self.all_metrics] def cal_metrics_with_label(self, label_i): assert type(label_i) == int, 'missing label' with torch.no_grad(): # Prepare data for evaluation eval_samples = self.gen.sample(cfg.samples_num, 8 * cfg.batch_size, label_i=label_i) gen_data = GenDataIter(eval_samples) gen_tokens = tensor_to_tokens(eval_samples, self.idx2word_dict) gen_tokens_s = tensor_to_tokens( self.gen.sample(200, 200, label_i=label_i), self.idx2word_dict) clas_data = CatClasDataIter([eval_samples], label_i) # Reset metrics self.bleu.reset(test_text=gen_tokens, real_text=self.test_data_list[label_i].tokens) self.nll_gen.reset(self.gen, self.train_data_list[label_i].loader, label_i) 
self.nll_div.reset(self.gen, gen_data.loader, label_i) self.self_bleu.reset(test_text=gen_tokens_s, real_text=gen_tokens) self.clas_acc.reset(self.clas, clas_data.loader) self.ppl.reset(gen_tokens) return [metric.get_score() for metric in self.all_metrics] def comb_metrics(self, fmt_str=False): all_scores = [ self.cal_metrics_with_label(label_i) for label_i in range(cfg.k_label) ] all_scores = np.array( all_scores).T.tolist() # each row for each metric if fmt_str: return ', '.join([ '%s = %s' % (metric.get_name(), score) for (metric, score) in zip(self.all_metrics, all_scores) ]) return all_scores def _save(self, phase, epoch): """Save model state dict and generator's samples""" if phase != 'ADV': torch.save( self.gen.state_dict(), cfg.save_model_root + 'gen_{}_{:05d}.pt'.format(phase, epoch)) save_sample_path = cfg.save_samples_root + 'samples_{}_{}_{:05d}.txt'.format( phase, cfg.samples_num, epoch) samples = self.gen.sample(5000, cfg.batch_size) write_tokens(save_sample_path, tensor_to_tokens(samples, self.idx2word_dict)) def update_temperature(self, i, N): self.gen.temperature.data = torch.Tensor( [get_fixed_temperature(cfg.temperature, i, N, cfg.temp_adpt)]) if cfg.CUDA: self.gen.temperature.data = self.gen.temperature.data.cuda()
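# update_temperature above relies on get_fixed_temperature to anneal the Gumbel-softmax
# temperature over training. A plausible sketch of such a schedule function; the project's actual
# schedule names and formulas may differ, so treat the `adapt` values here as assumptions:
import numpy as np

def get_fixed_temperature_sketch(temper, i, N, adapt):
    """Anneal from 1 toward `temper` over N adversarial epochs, at epoch i."""
    if adapt == 'no':
        return temper
    if adapt == 'lin':
        return 1 + i / (N - 1) * (temper - 1)
    if adapt == 'exp':
        return temper ** (i / N)
    if adapt == 'sigmoid':
        return (temper - 1) / (1 + np.exp(-20 * (i / N - 0.5))) + 1
    raise ValueError('Unknown temperature schedule: ' + adapt)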
def evaluate(self):
    '''
    Evaluates the generator using various metrics.
    '''
    bleu = BLEU(model=self.G)
    perplexity = Perplexity(model=self.G)
    return bleu, perplexity
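# A hedged usage sketch for evaluate(): `trainer` is a hypothetical instance of the class that
# defines it, and get_score() is assumed to mirror the BLEU accessor used elsewhere in this
# section; the Perplexity API is not shown in the source.
bleu_metric, ppl_metric = trainer.evaluate()
print('BLEU: {}'.format(bleu_metric.get_score()))
print('PPL: {}'.format(ppl_metric.get_score()))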
class LeakGANInstructor(BasicInstructor): def __init__(self, opt): super(LeakGANInstructor, self).__init__(opt) # generator, discriminator self.gen = LeakGAN_G(cfg.gen_embed_dim, cfg.gen_hidden_dim, cfg.vocab_size, cfg.max_seq_len, cfg.padding_idx, cfg.goal_size, cfg.step_size, cfg.CUDA) self.dis = LeakGAN_D(cfg.dis_embed_dim, cfg.vocab_size, cfg.padding_idx, gpu=cfg.CUDA) self.init_model() # optimizer mana_params, work_params = self.gen.split_params() mana_opt = optim.Adam(mana_params, lr=cfg.gen_lr) work_opt = optim.Adam(work_params, lr=cfg.gen_lr) self.gen_opt = [mana_opt, work_opt] self.dis_opt = optim.Adam(self.dis.parameters(), lr=cfg.dis_lr) # Criterion self.mle_criterion = nn.NLLLoss() self.dis_criterion = nn.CrossEntropyLoss() # DataLoader self.gen_data = GenDataIter( self.gen.sample(cfg.batch_size, cfg.batch_size, self.dis)) self.dis_data = DisDataIter(self.gen_data.random_batch()['target'], self.oracle_data.random_batch()['target']) # Metrics self.bleu3 = BLEU(test_text=tensor_to_tokens(self.gen_data.target, self.index_word_dict), real_text=tensor_to_tokens(self.test_data.target, self.index_word_dict), gram=3) def _run(self): for inter_num in range(cfg.inter_epoch): self.log.info('>>> Interleaved Round %d...' % inter_num) self.sig.update() # update signal if self.sig.pre_sig: # =====DISCRIMINATOR PRE-TRAINING===== if not cfg.dis_pretrain: self.log.info('Starting Discriminator Training...') self.train_discriminator(cfg.d_step, cfg.d_epoch) if cfg.if_save and not cfg.if_test: torch.save(self.dis.state_dict(), cfg.pretrained_dis_path) print('Save pre-trained discriminator: {}'.format( cfg.pretrained_dis_path)) # =====GENERATOR MLE TRAINING===== if not cfg.gen_pretrain: self.log.info('Starting Generator MLE Training...') self.pretrain_generator(cfg.MLE_train_epoch) if cfg.if_save and not cfg.if_test: torch.save(self.gen.state_dict(), cfg.pretrained_gen_path) print('Save pre-trained generator: {}'.format( cfg.pretrained_gen_path)) else: self.log.info( '>>> Stop by pre_signal! Skip to adversarial training...') break # =====ADVERSARIAL TRAINING===== self.log.info('Starting Adversarial Training...') self.log.info('Initial generator: %s' % (str(self.cal_metrics(fmt_str=True)))) for adv_epoch in range(cfg.ADV_train_epoch): self.log.info('-----\nADV EPOCH %d\n-----' % adv_epoch) self.sig.update() if self.sig.adv_sig: self.adv_train_generator(cfg.ADV_g_step) # Generator self.train_discriminator(cfg.ADV_d_step, cfg.ADV_d_epoch, 'ADV') # Discriminator if adv_epoch % cfg.adv_log_step == 0: if cfg.if_save and not cfg.if_test: self._save('ADV', adv_epoch) else: self.log.info( '>>> Stop by adv_signal! Finishing adversarial training...' 
) break def _test(self): print('>>> Begin test...') self._run() pass def pretrain_generator(self, epochs): """ Max Likelihood Pretraining for the gen - gen_opt: [mana_opt, work_opt] """ for epoch in range(epochs): self.sig.update() if self.sig.pre_sig: pre_mana_loss = 0 pre_work_loss = 0 # =====Train===== for i, data in enumerate(self.oracle_data.loader): inp, target = data['input'], data['target'] if cfg.CUDA: inp, target = inp.cuda(), target.cuda() mana_loss, work_loss = self.gen.pretrain_loss( target, self.dis) self.optimize_multi(self.gen_opt, [mana_loss, work_loss]) pre_mana_loss += mana_loss.data.item() pre_work_loss += work_loss.data.item() pre_mana_loss = pre_mana_loss / len(self.oracle_data.loader) pre_work_loss = pre_work_loss / len(self.oracle_data.loader) # =====Test===== if epoch % cfg.pre_log_step == 0: self.log.info( '[MLE-GEN] epoch %d : pre_mana_loss = %.4f, pre_work_loss = %.4f, %s' % (epoch, pre_mana_loss, pre_work_loss, self.cal_metrics(fmt_str=True))) if cfg.if_save and not cfg.if_test: self._save('MLE', epoch) else: self.log.info( '>>> Stop by pre signal, skip to adversarial training...') break def adv_train_generator(self, g_step, current_k=0): """ The gen is trained using policy gradients, using the reward from the discriminator. Training is done for num_batches batches. """ rollout_func = rollout.ROLLOUT(self.gen, cfg.CUDA) adv_mana_loss = 0 adv_work_loss = 0 for step in range(g_step): with torch.no_grad(): gen_samples = self.gen.sample( cfg.batch_size, cfg.batch_size, self.dis, train=True) # !!! train=True, the only place inp, target = self.gen_data.prepare(gen_samples, gpu=cfg.CUDA) # =====Train===== rewards = rollout_func.get_reward_leakgan( target, cfg.rollout_num, self.dis, current_k).cpu() # reward with MC search mana_loss, work_loss = self.gen.adversarial_loss( target, rewards, self.dis) # update parameters self.optimize_multi(self.gen_opt, [mana_loss, work_loss]) adv_mana_loss += mana_loss.data.item() adv_work_loss += work_loss.data.item() # =====Test===== self.log.info( '[ADV-GEN] adv_mana_loss = %.4f, adv_work_loss = %.4f, %s' % (adv_mana_loss / g_step, adv_work_loss / g_step, self.cal_metrics(fmt_str=True))) def train_discriminator(self, d_step, d_epoch, phrase='MLE'): """ Training the discriminator on real_data_samples (positive) and generated samples from gen (negative). Samples are drawn d_step times, and the discriminator is trained for d_epoch d_epoch. 
""" for step in range(d_step): # prepare loader for training pos_samples = self.oracle_data.target neg_samples = self.gen.sample(cfg.samples_num, cfg.batch_size, self.dis) self.dis_data.reset(pos_samples, neg_samples) for epoch in range(d_epoch): # =====Train===== d_loss, train_acc = self.train_dis_epoch( self.dis, self.dis_data.loader, self.dis_criterion, self.dis_opt) # =====Test===== self.log.info( '[%s-DIS] d_step %d: d_loss = %.4f, train_acc = %.4f,' % (phrase, step, d_loss, train_acc)) def cal_metrics(self, fmt_str=False): self.gen_data.reset( self.gen.sample(cfg.samples_num, cfg.batch_size, self.dis)) self.bleu3.test_text = tensor_to_tokens(self.gen_data.target, self.index_word_dict) bleu3_score = self.bleu3.get_score(ignore=False) with torch.no_grad(): gen_nll = 0 for data in self.oracle_data.loader: inp, target = data['input'], data['target'] if cfg.CUDA: inp, target = inp.cuda(), target.cuda() loss = self.gen.batchNLLLoss(target, self.dis) gen_nll += loss.item() gen_nll /= len(self.oracle_data.loader) if fmt_str: return 'BLEU-3 = %.4f, gen_NLL = %.4f,' % (bleu3_score, gen_nll) return bleu3_score, gen_nll def _save(self, phrase, epoch): torch.save( self.gen.state_dict(), cfg.save_model_root + 'gen_{}_{:05d}.pt'.format(phrase, epoch)) save_sample_path = cfg.save_samples_root + 'samples_{}_{:05d}.txt'.format( phrase, epoch) samples = self.gen.sample(cfg.batch_size, cfg.batch_size, self.dis) write_tokens(save_sample_path, tensor_to_tokens(samples, self.index_word_dict))
class Model(): def __init__(self, encoder_layer_num, decoder_layer_num, hidden_dim, batch_size, learning_rate, dropout, init_train = True): self.encoder_layer_num = encoder_layer_num self.decoder_layer_num = decoder_layer_num self.hidden_dim = hidden_dim self.batch_size = batch_size self.learning_rate = learning_rate self.dropout = dropout self.init_train = init_train #---------fix---------- self.vocab_size = cfg.vocab_size self.max_length = cfg.max_length self.embedding_matrix = make_embedding_matrix(cfg.all_captions) self.SOS_token = cfg.SOS_token self.EOS_token = cfg.EOS_token self.idx2word_dict = load_dict() #---------------------- self.bleu = BLEU('BLEU', gram=[2,3,4,5]) #self.bleu.reset(test_text = gen_tokens, real_text = self.test_data.tokens) if init_train: self._init_train() train_week_stock, train_month_stock, t_month_stock,train_input_cap_vector, train_output_cap_vector = load_training_data() self.train_data = batch_generator(train_week_stock, train_month_stock, t_month_stock,train_input_cap_vector, train_output_cap_vector, self.batch_size) self.total_iter = len(train_input_cap_vector) self._init_eval() val_week_stock, val_month_stock, val_t_month_stock,val_input_cap_vector, val_output_cap_vector = load_val_data() self.val_data = batch_generator(val_week_stock, val_month_stock, val_t_month_stock,val_input_cap_vector, val_output_cap_vector, self.batch_size) self.val_total_iter = len(val_input_cap_vector) # gpu 탄력적으로 사용. def gpu_session_config(self): config = tf.ConfigProto() config.gpu_options.allow_growth = True return config def _init_train(self): self.train_graph = tf.Graph() with self.train_graph.as_default(): with tf.variable_scope('encoder_input'): self.week_input = tf.placeholder(tf.float64, shape= [None, 7], name='week_input') self.month_input = tf.placeholder(tf.float64, shape=[None, 28], name='month_input') self.t_month_input = tf.placeholder(tf.float64, shape=[None, 84], name='t_month_input') with tf.variable_scope("decoder_input"): self.decoder_input = tf.placeholder(tf.int32, [None, self.max_length], name='input') self.decoder_target = tf.placeholder(tf.int32, [None, self.max_length], name='target') self.decoder_targets_length = tf.placeholder(tf.int32, shape = [self.batch_size, ], name = 'targets_length') encoded_output, encoded_state = encoder_module(self.week_input, self.month_input, self.t_month_input, self.encoder_layer_num, self.decoder_layer_num, self.hidden_dim) decoder_output, decoder_state = decoder_module(encoded_state, encoded_output, self.decoder_input, self.decoder_targets_length, self.embedding_matrix, self.decoder_layer_num, self.hidden_dim, self.max_length, self.vocab_size, self.batch_size, self.dropout, self.SOS_token, self.EOS_token, train = True) self.logits = decoder_output.rnn_output # traning output self.sample_id = decoder_output.sample_id self._init_optimizer() self.train_init = tf.global_variables_initializer() self.train_saver = tf.train.Saver() self.train_session = tf.Session(graph=self.train_graph, config = self.gpu_session_config()) def _init_optimizer(self): #loss mask mask = tf.cast(tf.sequence_mask(self.decoder_targets_length, self.max_length),tf.float64) self.loss = tf.contrib.seq2seq.sequence_loss(logits= self.logits, targets = self.decoder_target, weights = mask, average_across_timesteps = True, average_across_batch = True) #tf.summary.scalar('loss', self.loss) #self.summary_op = tf.summary.merge_all() params = tf.trainable_variables() gradients = tf.gradients(self.loss, params) clipped_gradients, _ = 
tf.clip_by_global_norm(gradients,5.0) self.optimizer = tf.train.AdamOptimizer(self.learning_rate).apply_gradients(zip(clipped_gradients, params)) # batch 단위로 계산 def cal_metrics(self, infer_text, real_text): self.bleu.reset(infer_text = infer_text, real_text = real_text) return self.bleu.get_score() # bleu, greedy/beam search init def _init_eval(self): self.eval_graph = tf.Graph() with self.eval_graph.as_default(): self.eval_week_input = tf.placeholder(tf.float64, shape= [None, 7]) self.eval_month_input = tf.placeholder(tf.float64, shape=[None, 28]) self.eval_t_month_input = tf.placeholder(tf.float64, shape=[None, 84]) self.eval_decoder_targets_length = tf.placeholder(tf.int32, shape = [self.batch_size, ]) eval_encoded_output, eval_encoded_state = encoder_module(self.eval_week_input, self.eval_month_input, self.eval_t_month_input, self.encoder_layer_num, self.decoder_layer_num, self.hidden_dim) self.eval_decoder_output, eval_decoder_state = decoder_module(eval_encoded_state, eval_encoded_output, None, self.eval_decoder_targets_length, self.embedding_matrix, self.decoder_layer_num, self.hidden_dim, self.max_length, self.vocab_size, self.batch_size, self.dropout, self.SOS_token, self.EOS_token, train = False) self.predicted_ids = tf.identity(self.eval_decoder_output.predicted_ids) self.eval_saver = tf.train.Saver() self.eval_session = tf.Session(graph=self.eval_graph,config=self.gpu_session_config()) def train_epoch(self, epochs): if not self.init_train: raise Exception('Train graph is not inited') with self.train_graph.as_default(): if os.path.isfile(cfg.save_path + '.meta'): print("##########################") print('# Model restore.. #') print("##########################") self.train_saver.restore(self.train_session, cfg.save_path) else: self.train_session.run(self.train_init) total_loss = 0 total_step = 0 start_time =time.time() for e in range(epochs): for step in range(self.total_iter// self.batch_size): data = next(self.train_data) week_stock = data['week_stock'] month_stock = data['month_stock'] t_month_stock = data['t_month_stock'] decoder_input = data['decoder_input'] decoder_target = data['decoder_target'] batch_seq = batch_seq_len(data['decoder_target']) _, loss, sample_id = self.train_session.run([self.optimizer, self.loss, self.sample_id], feed_dict = {self.week_input : week_stock, self.month_input : month_stock, self.t_month_input : t_month_stock, self.decoder_input : decoder_input, self.decoder_target : decoder_target, self.decoder_targets_length : batch_seq}) # total_loss += loss # total_step += self.total_iter # loss = total_loss/total_step end = time.time() print('epoch: {}|{} minibatch loss: {:.6f} Time: {:.1f} min'.format(e+1, epochs, loss, (end-start_time)/60 )) if e % 50 ==0: self.train_saver.save(self.train_session, cfg.save_path) #랜덤 sid 선택, training output_text sid = random.randint(0, self.batch_size-1) target_text = decode_text(decoder_target[sid],self.idx2word_dict) output_text = decode_text(sample_id[sid],self.idx2word_dict) print('============ training sample text =============') print('training_target :' + target_text) print('training_output :' + output_text) print('===============================================') self.eval() def eval(self): with self.eval_graph.as_default(): self.eval_saver.restore(self.eval_session, cfg.save_path) all_bleu = [0] * 4 eval_mask_weights = tf.ones(shape=[self.batch_size, self.max_length],dtype=tf.float64) for step in range(self.val_total_iter//self.batch_size): data = next(self.val_data) week_stock = data['week_stock'] month_stock 
= data['month_stock'] t_month_stock = data['t_month_stock'] batch_seq = batch_seq_len(data['decoder_target']) #beam search_output beam_output = self.eval_session.run([self.predicted_ids], feed_dict = {self.eval_week_input : week_stock, self.eval_month_input : month_stock, self.eval_t_month_input : t_month_stock, self.eval_decoder_targets_length : batch_seq }) target_text = idx_to_text(data['decoder_input'][:,1:],self.idx2word_dict) target_text = remove_sent_pad(target_text) beam_output = np.squeeze(np.array(beam_output),axis=0) output_text = idx_to_text(beam_output[:,:,0], self.idx2word_dict) bleu_score = self.cal_metrics(target_text, output_text) for idx,score in enumerate(bleu_score): all_bleu[idx] += score print('================ BLEU score ================') for idx, bleu in enumerate(bleu_score):#2,3,4,5 print('BLEU-{} : {}'.format(idx+2, bleu)) sid = random.randint(0, self.batch_size-1) target_text = decode_text(data['decoder_target'][sid],self.idx2word_dict) output_text = decode_text(beam_output[sid,:,0],self.idx2word_dict) print('============= Beam search text =============') print('infer_target : ' + target_text) print('beam_search : ' + output_text) print('============================================')
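# eval() above accumulates per-batch BLEU scores in all_bleu but prints the last batch's values;
# averaging over batches would look like this sketch (helper name is illustrative):
def report_avg_bleu(all_bleu, num_batches):
    for idx, total in enumerate(all_bleu):  # BLEU-2 .. BLEU-5
        print('Average BLEU-{} : {:.4f}'.format(idx + 2, total / num_batches))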
class LeakGANInstructor(BasicInstructor): def __init__(self, opt): super(LeakGANInstructor, self).__init__(opt) # generator, discriminator self.gen = LeakGAN_G(cfg.gen_embed_dim, cfg.gen_hidden_dim, cfg.vocab_size, cfg.max_seq_len, cfg.padding_idx, cfg.goal_size, cfg.step_size, cfg.CUDA) self.dis = LeakGAN_D(cfg.dis_embed_dim, cfg.vocab_size, cfg.padding_idx, gpu=cfg.CUDA) #LSTM self.corpus = dataa.Corpus('dataset/emnlp_news/') self.lstm = LSTM.RNNModel('LSTM', len(self.corpus.dictionary), 200, 600, 3, 0.2, False) if (cfg.CUDA): self.dis.cuda() self.gen.cuda() self.init_model() # optimizer mana_params, work_params = self.gen.split_params() mana_opt = optim.Adam(mana_params, lr=cfg.gen_lr) work_opt = optim.Adam(work_params, lr=cfg.gen_lr) self.gen_opt = [mana_opt, work_opt] self.dis_opt = optim.Adam(self.dis.parameters(), lr=cfg.dis_lr) # Criterion self.mle_criterion = nn.NLLLoss() self.dis_criterion = nn.CrossEntropyLoss() # DataLoader self.gen_data = GenDataIter( self.gen.sample(cfg.batch_size, cfg.batch_size, self.dis)) self.dis_data = DisDataIter(self.gen_data.random_batch()['target'], self.oracle_data.random_batch()['target']) # Metrics self.bleu3 = BLEU(test_text=tensor_to_tokens(self.gen_data.target, self.index_word_dict), real_text=tensor_to_tokens(self.test_data.target, self.index_word_dict), gram=3) def _run(self): for inter_num in range(cfg.inter_epoch): self.log.info('>>> Interleaved Round %d...' % inter_num) self.sig.update() # update signal if self.sig.pre_sig: # =====DISCRIMINATOR PRE-TRAINING===== if not cfg.dis_pretrain: self.log.info('Starting Discriminator Training...') self.train_discriminator(cfg.d_step, cfg.d_epoch) if cfg.if_save and not cfg.if_test: torch.save(self.dis.state_dict(), cfg.pretrained_dis_path) print('Save pre-trained discriminator: {}'.format( cfg.pretrained_dis_path)) # =====GENERATOR MLE TRAINING===== if not cfg.gen_pretrain: self.log.info('Starting Generator MLE Training...') self.pretrain_generator(cfg.MLE_train_epoch) if cfg.if_save and not cfg.if_test: torch.save(self.gen.state_dict(), cfg.pretrained_gen_path) print('Save pre-trained generator: {}'.format( cfg.pretrained_gen_path)) else: self.log.info( '>>> Stop by pre_signal! Skip to adversarial training...') break # =====ADVERSARIAL TRAINING===== self.log.info('Starting Adversarial Training...') self.log.info('Initial generator: %s' % (str(self.cal_metrics(fmt_str=True)))) for adv_epoch in range(cfg.ADV_train_epoch): self.log.info('-----\nADV EPOCH %d\n-----' % adv_epoch) self.sig.update() if self.sig.adv_sig: self.adv_train_generator(cfg.ADV_g_step) # Generator self.train_discriminator(cfg.ADV_d_step, cfg.ADV_d_epoch, 'ADV') # Discriminator if adv_epoch % cfg.adv_log_step == 0: if cfg.if_save and not cfg.if_test: self._save('ADV', adv_epoch) else: self.log.info( '>>> Stop by adv_signal! Finishing adversarial training...' 
) break def string2bins(self, bit_string, n_bins): n_bits = int(math.log(n_bins, 2)) return [ bit_string[i:i + n_bits] for i in range(0, len(bit_string), n_bits) ] def LSTM_layer_1(self, intermediate_file, bins_num): print('>>> Begin test...') print('Begin with LSTM Layer') #First layer- LSTM layer epoch_start_time = time.time() seed = 1111 data_root = './decode/' #Reproducibility torch.manual_seed(seed) if cfg.CUDA: torch.cuda.manual_seed(seed) with open("leakGAN_instructor/real_data/emnlp_news.pt", 'rb') as f: self.lstm = torch.load(f) if cfg.CUDA: self.lstm.cuda() emnlp_data = 'dataset/emnlp_news/' corpus = dataa.Corpus(emnlp_data) ntokens = len(corpus.dictionary) idx2word_file = data_root + "idx2word_1.txt" word2idx_file = data_root + "word2idx_1.txt" with open(idx2word_file, "wb") as fp: #Pickling pickle.dump(corpus.dictionary.idx2word, fp) with open(word2idx_file, "wb") as fp: #Pickling pickle.dump(corpus.dictionary.word2idx, fp) hidden = self.lstm.init_hidden(1) input = torch.randint(ntokens, (1, 1), dtype=torch.long) if cfg.CUDA: input.data = input.data.cuda() print("Finished Initializing LSTM Model") #Step 1: Get secret data secret_file = open("leakGAN_instructor/real_data/secret_file.txt", 'r') secret_data = secret_file.read() #Step 2: Compress string into binary string bit_string = ''.join( bin(ord(letter))[2:].zfill(8) for letter in secret_data) #print(bit_string) bit_string = '111011100101000111000011110111101111110111000110011010110110' #In the first step we will use 256 bins (8 bit representation each) to convert so that we can convert 64 bits into 8 word #Step 3: Divide into bins secret_text = [ int(i, 2) for i in self.string2bins(bit_string, bins_num) ] #convert to bins #Step 4: Divide vocabulary into bins, zero words not in the bin if bins_num >= 2: tokens = list(range(ntokens)) #indecies of words random.shuffle(tokens) #randomize #Words in each bin words_in_bin = int(ntokens / bins_num) #leftovers should be also included in the leftover = int(ntokens % bins_num) bins = [ tokens[i:i + words_in_bin] for i in range(0, ntokens - leftover, words_in_bin) ] # words to keep in each bin for i in range(len(bins)): if (i == leftover): break bins[i].append(tokens[i + words_in_bin * bins_num]) print("Len of bins in 1st layer: {}".format(len(bins))) #save bins into key 1 key1 = data_root + "lstm_key1.txt" with open(key1, "wb") as fp: #Pickling pickle.dump(bins, fp) zero = [list(set(tokens) - set(bin_)) for bin_ in bins] print('Finished Initializing First LSTM Layer') print('time: {:5.2f}s'.format(time.time() - epoch_start_time)) print('-' * 89) intermediate_file = data_root + intermediate_file with open(intermediate_file, 'w') as outf: w = 0 i = 1 temperature = 1.5 bin_sequence_length = len(secret_text[:]) # 85 print("bin sequence length", bin_sequence_length) #32 while i <= bin_sequence_length: epoch_start_time = time.time() output, hidden = self.lstm(input, hidden) zero_index = zero[secret_text[:][i - 1]] zero_index = torch.LongTensor(zero_index) word_weights = output.squeeze().data.div( temperature).exp().cpu() word_weights.index_fill_(0, zero_index, 0) word_idx = torch.multinomial(word_weights, 1)[0] input.data.fill_(word_idx) word = corpus.dictionary.idx2word[word_idx] i += 1 w += 1 word = word.encode('ascii', 'ignore').decode('ascii') outf.write(word + ' ') print("Generated intermediate short steganographic text") print("Intermediate text saved in following file: {}".format( intermediate_file)) def LSTM_layer_2(self, secret_file, final_file, bins_num): print('Final LSTM Layer') 
#First layer- LSTM layer data_root = './decode/' epoch_start_time = time.time() seed = 1111 #Reproducibility torch.manual_seed(seed) if cfg.CUDA: torch.cuda.manual_seed(seed) with open("leakGAN_instructor/real_data/emnlp_news.pt", 'rb') as f: self.lstm = torch.load(f) if cfg.CUDA: self.lstm.cuda() emnlp_data = 'dataset/emnlp_news/' corpus = dataa.Corpus(emnlp_data) #save dictionary idx2word_file = data_root + "idx2word_2.txt" word2idx_file = data_root + "word2idx_2.txt" with open(idx2word_file, "wb") as fp: #Pickling pickle.dump(corpus.dictionary.idx2word, fp) with open(word2idx_file, "wb") as fp: #Pickling pickle.dump(corpus.dictionary.word2idx, fp) ntokens = len(corpus.dictionary) hidden = self.lstm.init_hidden(1) input = torch.randint(ntokens, (1, 1), dtype=torch.long) if cfg.CUDA: input.data = input.data.cuda() print("Finished Initializing LSTM Model") #Step 1: Get secret data secret_file = open(data_root + secret_file, 'r') secret_data = secret_file.read().split() #Step 2: Compress string into binary string bit_string = '' for data in secret_data: print("Data: {}".format(data)) idWord = corpus.dictionary.word2idx[data] bit_string += '{0:{fill}13b}'.format(int(idWord), fill='0') #print(ntokens) print("Bit String: {}".format(bit_string)) print("Length of Bit String: {}".format(len(bit_string))) #print(bit_string) #bit_string = '111011100101000111000011110111101111110111000110011010110110' #In the first step we will use 256 bins (8 bit representation each) to convert so that we can convert 64 bits into 8 word #Step 3: Divide into bins secret_text = [ int(i, 2) for i in self.string2bins(bit_string, bins_num) ] #convert to bins #Step 4: Divide vocabulary into bins, zero words not in the bin if bins_num >= 2: tokens = list(range(ntokens)) #indecies of words random.shuffle(tokens) #randomize #Words in each bin words_in_bin = int(ntokens / bins_num) #leftovers should be also included in the leftover = int(ntokens % bins_num) bins = [ tokens[i:i + words_in_bin] for i in range(0, ntokens - leftover, words_in_bin) ] # words to keep in each bin for i in range(0, leftover): bins[i].append(tokens[i + words_in_bin * bins_num]) #save bins into key 1 key1 = data_root + "lstm_key2.txt" with open(key1, "wb") as fp: #Pickling pickle.dump(bins, fp) zero = [list(set(tokens) - set(bin_)) for bin_ in bins] print('Finished Initializing Second LSTM Layer') print('time: {:5.2f}s'.format(time.time() - epoch_start_time)) print('-' * 89) final_file = data_root + final_file with open(final_file, 'w') as outf: w = 0 i = 1 temperature = 1.5 bin_sequence_length = len(secret_text[:]) # 85 print("bin sequence length", bin_sequence_length) #32 while i <= bin_sequence_length: epoch_start_time = time.time() output, hidden = self.lstm(input, hidden) zero_index = zero[secret_text[:][i - 1]] zero_index = torch.LongTensor(zero_index) word_weights = output.squeeze().data.div( temperature).exp().cpu() word_weights.index_fill_(0, zero_index, 0) word_idx = torch.multinomial(word_weights, 1)[0] input.data.fill_(word_idx) word = corpus.dictionary.idx2word[word_idx] i += 1 w += 1 word = word.encode('ascii', 'ignore').decode('ascii') outf.write(word + ' ') print("Generated final steganographic text") print("Final text saved in following file: {}".format( str(data_root + final_file))) def leakGAN_layer(self, secret_file, final_file, bins_num): #Second Layer = LeakGAN layer print('>>> Begin Second Layer...') data_root = './decode/' torch.nn.Module.dump_patches = True epoch_start_time = time.time() # Set the random seed manually for 
reproducibility. seed = 1111 #Step 1: load the most accurate model with open("leakGAN_instructor/real_data/gen_ADV_00028.pt", 'rb') as f: self.gen.load_state_dict(torch.load(f)) print("Finish Loading") self.gen.eval() #Step 1: Get Intermediate text secret_file = data_root + secret_file secret_file = open(secret_file, 'r') secret_data = secret_file.read().split() #Step 2: Compress string into binary string bit_string = '' #You need LSTM Corpus for that emnlp_data = 'dataset/emnlp_news/' corpus = dataa.Corpus(emnlp_data) for data in secret_data: print("Data: {}".format(data)) idWord = corpus.dictionary.word2idx[data] bit_string += '{0:{fill}13b}'.format(int(idWord), fill='0') secret_text = [ int(i, 2) for i in self.string2bins(bit_string, bins_num) ] #convert to bins corpus_leak = self.index_word_dict if bins_num >= 2: ntokens = len(corpus_leak) tokens = list(range(ntokens)) # * args.replication_factor #print(ntokens) random.shuffle(tokens) #Words in each bin words_in_bin = int(ntokens / bins_num) #leftovers should be also included in the leftover = int(ntokens % bins_num) bins = [ tokens[i:i + words_in_bin] for i in range(0, ntokens - leftover, words_in_bin) ] # words to keep in each bin for i in range(0, leftover): bins[i].append(tokens[i + words_in_bin * bins_num]) #save bins into leakGAN key key2 = data_root + 'leakGAN_key.txt' with open(key2, "wb") as fp: #Pickling pickle.dump(bins, fp) zero = [list(set(tokens) - set(bin_)) for bin_ in bins] print('Finished Initializing Second LeakGAN Layer') print('time: {:5.2f}s'.format(time.time() - epoch_start_time)) print('-' * 89) out_file = data_root + final_file w = 0 i = 1 bin_sequence_length = len(secret_text[:]) print("bin sequence length", bin_sequence_length) batch_size = cfg.batch_size seq_len = cfg.max_seq_len feature_array = torch.zeros( (batch_size, seq_len + 1, self.gen.goal_out_size)) goal_array = torch.zeros( (batch_size, seq_len + 1, self.gen.goal_out_size)) leak_out_array = torch.zeros((batch_size, seq_len + 1, cfg.vocab_size)) samples = torch.zeros(batch_size, seq_len + 1).long() work_hidden = self.gen.init_hidden(batch_size) mana_hidden = self.gen.init_hidden(batch_size) leak_inp = torch.LongTensor([cfg.start_letter] * batch_size) real_goal = self.gen.goal_init[:batch_size, :] if cfg.CUDA: feature_array = feature_array.cuda() goal_array = goal_array.cuda() leak_out_array = leak_out_array.cuda() goal_array[:, 0, :] = real_goal # g0 = goal_init if_sample = True no_log = False index = cfg.start_letter while i <= seq_len: dis_inp = torch.zeros(batch_size, bin_sequence_length).long() if i > 1: dis_inp[:, :i - 1] = samples[:, :i - 1] # cut sentences leak_inp = samples[:, i - 2] if torch.cuda.is_available(): dis_inp = dis_inp.cuda() leak_inp = leak_inp.cuda() feature = self.dis.get_feature(dis_inp).unsqueeze(0) #print(feature) feature_array[:, i - 1, :] = feature.squeeze(0) out, cur_goal, work_hidden, mana_hidden = self.gen(index, leak_inp, work_hidden, mana_hidden, feature, real_goal, no_log=no_log, train=False) leak_out_array[:, i - 1, :] = out goal_array[:, i, :] = cur_goal.squeeze(1) if i > 0 and i % self.gen.step_size == 0: real_goal = torch.sum(goal_array[:, i - 3:i + 1, :], dim=1) if i / self.gen.step_size == 1: real_goal += self.gen.goal_init[:batch_size, :] # Sample one token if not no_log: out = torch.exp(out) zero_index = zero[secret_text[:][ i - 1]] #indecies that has to be zeroed, as they are not in the current bin #zero_index.append(0) zero_index = torch.LongTensor(zero_index) if cfg.CUDA: zero_index = zero_index.cuda() 
temperature = 1.5 word_weights = out word_weights = word_weights.index_fill_( 1, zero_index, 0) #make all the indecies zero if they are not in the bin word_weights = torch.multinomial(word_weights, 1).view( -1) #choose one word with highest probability for each sample #print("Out after: {}".format(word_weights)) samples[:, i] = word_weights leak_inp = word_weights i += 1 w += 1 leak_out_array = leak_out_array[:, :seq_len, :] tokens = [] write_tokens(out_file, tensor_to_tokens(samples, self.index_word_dict)) print("Generated final steganographic text") print("Final steganographic text saved in following file: {}".format( out_file)) def _test_2_layers(self): self.LSTM_layer_1("intermediate.txt", 4096) if cfg.leakGAN: self.leakGAN_layer("intermediate.txt", "final_leakgan.txt", 4) else: self.LSTM_layer_2("intermediate.txt", "final_lstm.txt", 4) def _test(self): print('>>> Begin test...') def pretrain_generator(self, epochs): """ Max Likelihood Pretraining for the gen - gen_opt: [mana_opt, work_opt] """ for epoch in range(epochs): self.sig.update() if self.sig.pre_sig: pre_mana_loss = 0 pre_work_loss = 0 # =====Train===== for i, data in enumerate(self.oracle_data.loader): inp, target = data['input'], data['target'] if cfg.CUDA: inp, target = inp.cuda(), target.cuda() mana_loss, work_loss = self.gen.pretrain_loss( target, self.dis) self.optimize_multi(self.gen_opt, [mana_loss, work_loss]) pre_mana_loss += mana_loss.data.item() pre_work_loss += work_loss.data.item() pre_mana_loss = pre_mana_loss / len(self.oracle_data.loader) pre_work_loss = pre_work_loss / len(self.oracle_data.loader) # =====Test===== if epoch % cfg.pre_log_step == 0: self.log.info( '[MLE-GEN] epoch %d : pre_mana_loss = %.4f, pre_work_loss = %.4f, %s' % (epoch, pre_mana_loss, pre_work_loss, self.cal_metrics(fmt_str=True))) if cfg.if_save and not cfg.if_test: self._save('MLE', epoch) else: self.log.info( '>>> Stop by pre signal, skip to adversarial training...') break def adv_train_generator(self, g_step, current_k=0): """ The gen is trained using policy gradients, using the reward from the discriminator. Training is done for num_batches batches. """ rollout_func = rollout.ROLLOUT(self.gen, cfg.CUDA) adv_mana_loss = 0 adv_work_loss = 0 for step in range(g_step): with torch.no_grad(): gen_samples = self.gen.sample( cfg.batch_size, cfg.batch_size, self.dis, train=True) # !!! train=True, the only place inp, target = self.gen_data.prepare(gen_samples, gpu=cfg.CUDA) # =====Train===== rewards = rollout_func.get_reward_leakgan( target, cfg.rollout_num, self.dis, current_k).cpu() # reward with MC search mana_loss, work_loss = self.gen.adversarial_loss( target, rewards, self.dis) # update parameters self.optimize_multi(self.gen_opt, [mana_loss, work_loss]) adv_mana_loss += mana_loss.data.item() adv_work_loss += work_loss.data.item() # =====Test===== self.log.info( '[ADV-GEN] adv_mana_loss = %.4f, adv_work_loss = %.4f, %s' % (adv_mana_loss / g_step, adv_work_loss / g_step, self.cal_metrics(fmt_str=True))) def train_discriminator(self, d_step, d_epoch, phrase='MLE'): """ Training the discriminator on real_data_samples (positive) and generated samples from gen (negative). Samples are drawn d_step times, and the discriminator is trained for d_epoch d_epoch. 
""" for step in range(d_step): # prepare loader for training pos_samples = self.oracle_data.target neg_samples = self.gen.sample(cfg.samples_num, cfg.batch_size, self.dis) self.dis_data.reset(pos_samples, neg_samples) for epoch in range(d_epoch): # =====Train===== d_loss, train_acc = self.train_dis_epoch( self.dis, self.dis_data.loader, self.dis_criterion, self.dis_opt) # =====Test===== self.log.info( '[%s-DIS] d_step %d: d_loss = %.4f, train_acc = %.4f,' % (phrase, step, d_loss, train_acc)) def cal_metrics(self, fmt_str=False): self.gen_data.reset( self.gen.sample(cfg.samples_num, cfg.batch_size, self.dis)) self.bleu3.test_text = tensor_to_tokens(self.gen_data.target, self.index_word_dict) bleu3_score = self.bleu3.get_score(ignore=False) with torch.no_grad(): gen_nll = 0 for data in self.oracle_data.loader: inp, target = data['input'], data['target'] if cfg.CUDA: inp, target = inp.cuda(), target.cuda() loss = self.gen.batchNLLLoss(target, self.dis) gen_nll += loss.item() gen_nll /= len(self.oracle_data.loader) if fmt_str: return 'BLEU-3 = %.4f, gen_NLL = %.4f,' % (bleu3_score, gen_nll) return bleu3_score, gen_nll def _save(self, phrase, epoch): torch.save( self.gen.state_dict(), cfg.save_model_root + 'gen_{}_{:05d}.pt'.format(phrase, epoch)) save_sample_path = cfg.save_samples_root + 'samples_{}_{:05d}.txt'.format( phrase, epoch) samples = self.gen.sample(cfg.batch_size, cfg.batch_size, self.dis) write_tokens(save_sample_path, tensor_to_tokens(samples, self.index_word_dict))