def learn_lm(weighted_strings, weighted_string_pairs):
    result = lm.LM()
    tmp = []
    for ((s1, w1), (s2, w2)) in weighted_string_pairs:
        tmp = tmp + [(s1, w1), (s2, w2)]
    result.learn_lm(weighted_strings + tmp)
    return result
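# NOTE (added sketch): a minimal usage example for learn_lm above, assuming
# weighted_strings is a list of (string, weight) tuples and
# weighted_string_pairs is a list of pairs of such tuples; the data below is
# hypothetical and only illustrates the expected shapes.
weighted_strings = [("hello world", 1.0), ("good morning", 0.5)]
weighted_string_pairs = [(("hello", 1.0), ("world", 1.0))]

model = learn_lm(weighted_strings, weighted_string_pairs)  # flattens the pairs and trains a single LM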
def setupLM(self):
    # language model
    self.lm = L.LM(
        len(self.ds.char2id) + 2, self.ds.id2char, charVecPath,
        uniProbDictPath)
    self.lm.bowIndice = len(self.ds.char2id)
    self.lm.eowIndice = self.lm.bowIndice + 1
def __init__(self):
    self.ds_train = dataset.Dataset(config.trainPath, config.dictPath)
    self.ds_valid = dataset.Dataset(config.validPath, config.dictPath)
    self.ds_test = dataset.Dataset(config.testPath, config.dictPath)
    vocSize = len(self.ds_train.word2id)
    maxSeqLen = max([len(idLine) for idLine in self.ds_train.idData])
    padId = self.ds_train.word2id['<PAD>']
    self.model = lm.LM(vocSize, maxSeqLen, padId)  # vocSize + 1 for the padding index
    self.loss_log = logger.Logger('../result/loss.log')
    self.eval_log = logger.Logger('../result/eval.log')
def main(lm_opt):
    model_prefix = ['best_lm']
    dataset = ['test']
    lm_data, lm_vocab = load_datasets(lm_opt, dataset=dataset)
    lm_opt.vocab_size = lm_vocab.vocab_size
    logger.info('Loading data completed')

    sess_config = common_utils.get_tf_sess_config(lm_opt)
    logger.info('Starting TF Session...')
    with tf.Session(config=sess_config) as sess:
        logger.info('Creating model...')
        init_scale = lm_opt.init_scale
        logger.debug('- Creating initializer ({} to {})'.format(
            -init_scale, init_scale))
        initializer = tf.random_uniform_initializer(-init_scale, init_scale)
        logger.debug('- Creating training LM...')
        with tf.variable_scope('LM', reuse=None, initializer=initializer):
            model = lm.LM(lm_opt, is_training=False)
        logger.debug('Trainable variables:')
        for v in tf.trainable_variables():
            logger.debug("- {} {} {}".format(v.name, v.get_shape(), v.device))
        logger.info('Initializing variables...')
        sess.run(tf.global_variables_initializer())
        saver = tf.train.Saver()
        states = {}
        for p in model_prefix:
            states[p] = common_utils.get_initial_training_state()
        _, success = resume_many_states(lm_opt.output_dir, sess, saver,
                                        states, model_prefix)
        if not success:
            logger.error('Failed to load the model. Testing aborted.')
            return
        logger.info('Testing...')
        token_loss = None
        if lm_opt.out_token_loss_file is not None:
            token_loss = []
        ppl, _ = run_epoch(sess, model, lm_data['test'], lm_opt,
                           token_loss=token_loss)
        logger.info('PPL = {}'.format(ppl))
        if token_loss is not None:
            logger.info('Writing token loss...')
            token_loss_path = os.path.join(lm_opt.output_dir,
                                           lm_opt.out_token_loss_file)
            with open(token_loss_path, 'w') as ofp:
                for p in token_loss:
                    ofp.write("{}\t{}\n".format(lm_vocab.i2w(p[0]), p[1]))
import torch
import torch.optim as optim

import lm as L    # our lm.py
import dataset    # our dataset.py

maxEpoch = 10     # maximum number of training epochs
dictPath = '../model/dicts.pickle'
modelPath = '../model/lm_%d.model'
trainDataPath = '../data/ptb.train.txt'

ds = dataset.Dataset(trainDataPath)
ds.save(dictPath)

lm = L.LM(vocSize=len(ds.word2id))
lm.train()
opt = optim.Adam(lm.parameters())

for ep in range(maxEpoch):
    accLoss = 0
    for idLine in ds.idData:
        opt.zero_grad()
        loss = lm.getLoss(idLine)
        loss.backward()
        opt.step()
        accLoss += loss.item()
    print('epoch:', ep)
    print('loss:', accLoss)
    torch.save(lm, modelPath % (ep + 1))
import torch

import lm as L
import dataset

epoch = 0
dictPath = '../model/dicts.pickle'
testDataPath = '../data/ptb.test.txt'
modelPath = '../model/lm_%d.model'

ds = dataset.Dataset()
ds.load(dictPath)
ds.setData(testDataPath)
ds.setIdData()

lm = L.LM(len(ds.word2id))
lm = torch.load(modelPath % epoch)
lm.eval()  # switch to evaluation mode

H = 0
W = 0
for idLine in ds.idData:
    H += lm.getSentenceLogProb(idLine)
    W += len(idLine) - 1
H /= W
print('entropy:', H)
print('PPL:', 2 ** H)
def main():
    # ======================
    # hyper-parameters
    # ======================
    CELL = "lstm"  # rnn, gru, lstm
    DATASET = 'movie'
    RATIO = 0.9
    WORD_DROP = 10
    MIN_LEN = 5
    MAX_LEN = 200
    BATCH_SIZE = 32
    SEQUENCE_LEN = 50
    EMBED_SIZE = 128
    HIDDEN_DIM = 256
    NUM_LAYERS = 2
    DROPOUT_RATE = 0.0
    EPOCH = 300
    LEARNING_RATE = 0.01
    MAX_GENERATE_LENGTH = 20
    GENERATE_EVERY = 5
    SEED = 100
    all_var = locals()
    print()
    for var in all_var:
        if var != "var_name":
            print("{0:15} ".format(var), all_var[var])
    print()

    # ======================
    # data
    # ======================
    data_path = '../../__data/ROCStories.txt'
    train_path = 'train_roc'
    test_path = 'test_roc'
    vocabulary = utils.Vocabulary(data_path, max_len=MAX_LEN, min_len=MIN_LEN,
                                  word_drop=WORD_DROP)
    utils.split_corpus(data_path, train_path, test_path, max_len=MAX_LEN,
                       min_len=MIN_LEN, ratio=RATIO, seed=SEED)
    train = utils.Corpus(train_path, vocabulary, max_len=MAX_LEN, min_len=MIN_LEN)
    test = utils.Corpus(test_path, vocabulary, max_len=MAX_LEN, min_len=MIN_LEN)
    train_generator = utils.Generator(train.corpus)
    test_generator = utils.Generator(test.corpus)

    # ======================
    # building model
    # ======================
    device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
    model = lm.LM(cell=CELL, vocab_size=vocabulary.vocab_size,
                  embed_size=EMBED_SIZE, hidden_dim=HIDDEN_DIM,
                  num_layers=NUM_LAYERS, dropout_rate=DROPOUT_RATE)
    model.to(device)
    summary(model, (20, ))
    criterion = nn.NLLLoss()
    optimizer = optim.SGD(model.parameters(), lr=LEARNING_RATE)
    # optimizer = torch.optim.Adam(textRNN.parameters(), lr=0.001, betas=(0.9, 0.999), eps=1e-08, weight_decay=0, amsgrad=False)
    print()

    # ======================
    # training and testing
    # ======================
    best_loss = 1000000
    for epoch in range(EPOCH):
        train_g = train_generator.build_generator(BATCH_SIZE, SEQUENCE_LEN)
        test_g = test_generator.build_generator(BATCH_SIZE, SEQUENCE_LEN)
        train_loss = []
        while True:
            try:
                text = train_g.__next__()
            except StopIteration:
                break
            optimizer.zero_grad()
            y = model(torch.from_numpy(text[:, :-1]).long().to(device))
            loss = criterion(
                y.reshape(-1, vocabulary.vocab_size),
                torch.from_numpy(text[:, 1:]).reshape(-1).long().to(device))
            loss.backward()
            optimizer.step()
            train_loss.append(loss.item())
        test_loss = []
        while True:
            with torch.no_grad():
                try:
                    text = test_g.__next__()
                except StopIteration:
                    break
                y = model(torch.from_numpy(text[:, :-1]).long().to(device))
                loss = criterion(
                    y.reshape(-1, vocabulary.vocab_size),
                    torch.from_numpy(text[:, 1:]).reshape(-1).long().to(device))
                test_loss.append(loss.item())
        print('epoch {:d} training loss {:.4f} test loss {:.4f}'.format(
            epoch + 1, np.mean(train_loss), np.mean(test_loss)))
        if np.mean(test_loss) < best_loss:
            best_loss = np.mean(test_loss)
            print('-----------------------------------------------------')
            print('saving parameters')
            os.makedirs('models', exist_ok=True)
            torch.save(model.state_dict(),
                       'models/' + DATASET + '-' + str(epoch) + '.pkl')
            print('-----------------------------------------------------')
        if (epoch + 1) % GENERATE_EVERY == 0:
            with torch.no_grad():
                # generate sample text
                x = torch.LongTensor([[vocabulary.w2i['_BOS']]] * 3).to(device)
                for i in range(MAX_GENERATE_LENGTH):
                    samp = model.sample(x)
                    x = torch.cat([x, samp], dim=1)
                x = x.cpu().numpy()
            print('-----------------------------------------------------')
            for i in range(x.shape[0]):
                print(' '.join([
                    vocabulary.i2w[_] for _ in list(x[i, :]) if _ not in [
                        vocabulary.w2i['_BOS'], vocabulary.w2i['_EOS'],
                        vocabulary.w2i['_PAD']
                    ]
                ]))
            print('-----------------------------------------------------')
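# NOTE (added sketch): the training loop above feeds text[:, :-1] as the model
# input and text[:, 1:] as the target, the standard one-step shift for
# next-token prediction. A minimal, self-contained illustration of that shift;
# the toy batch below is hypothetical and unrelated to the ROCStories data.
import numpy as np
import torch

text = np.array([[2, 5, 7, 9, 3],
                 [2, 4, 8, 6, 3]])                # token ids, shape (batch, seq_len)

inputs = torch.from_numpy(text[:, :-1]).long()    # the model sees tokens 0..n-2
targets = torch.from_numpy(text[:, 1:]).long()    # and must predict tokens 1..n-1

# For NLLLoss the log-probabilities are flattened to
# (batch * (seq_len - 1), vocab_size) and the targets to (batch * (seq_len - 1),).
print(inputs.shape, targets.reshape(-1).shape)    # torch.Size([2, 4]) torch.Size([8])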
# The stop condition for the optimization routine
maxIts = 1000
dcostTol = 1e-5
dxTol = 1e-5

def stop(x, fx, dfdx, cost, it, step, dcost, dx, scales, updated):
    if dcost is None or dx is None or np.isnan(dcost) or np.isinf(dcost):
        done = False
    else:
        done = it >= maxIts
        if not done and updated:
            done = np.abs(dcost) < dcostTol or np.linalg.norm(dx) < dxTol
    return done

# Optimize
tryLM = True
if tryLM:
    x, fx, dfdx, cost, it, step, dcost, dx, scales, updated = lm.LM(np).solve(
        f, df, x0, stop=stop, scaling=False)
    print("it", it)
    print("x", x)
    print("cost", cost)
    print("dcost", dcost)
    print("np.linalg.norm(dx)", np.linalg.norm(dx))
else:
    F = lambda x: f(x).sum()
    sol = scipy.optimize.minimize(F, x0, tol=1e-20)
    print(sol)
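# NOTE (added sketch): the snippet above assumes a residual function f, its
# Jacobian df, and an initial guess x0 are already defined. The definitions
# below are a hypothetical example of what they could look like for a tiny
# exponential curve fit, assuming the solver minimizes the sum of squared
# residuals; they are illustrations, not part of the original script.
import numpy as np

t = np.linspace(0.0, 1.0, 20)
y = 2.0 * np.exp(1.5 * t) + 0.01 * np.random.randn(t.size)   # noisy samples of y = a*exp(b*t)

def f(x):
    # Residual vector whose squared norm the solver drives toward zero.
    a, b = x
    return a * np.exp(b * t) - y

def df(x):
    # Jacobian of the residuals, shape (len(t), len(x)).
    a, b = x
    return np.stack([np.exp(b * t), a * t * np.exp(b * t)], axis=1)

x0 = np.array([1.0, 1.0])   # initial guess for (a, b)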
def main():
    # ======================
    # hyper-parameters
    # ======================
    CELL = "lstm"  # rnn, gru, lstm
    DATASET = 'tweet'  # movie, news, tweet
    RATIO = 0.9
    WORD_DROP = 10
    MIN_LEN = 5
    MAX_LEN = 200
    BATCH_SIZE = 32
    EMBED_SIZE = 350
    HIDDEN_DIM = 512
    NUM_LAYERS = 2
    DROPOUT_RATE = 0.0
    START_EPOCH = 0
    EPOCH = 30
    LEARNING_RATE = 0.001
    MAX_GENERATE_LENGTH = 20
    GENERATE_EVERY = 5
    PRINT_EVERY = 1
    SEED = 100
    all_var = locals()
    print()
    for var in all_var:
        if var != "var_name":
            print("{0:15} ".format(var), all_var[var])
    print()

    # ======================
    # data
    # ======================
    data_path = 'data/' + DATASET + '2020.txt'
    train_path = 'data/train_' + DATASET
    test_path = 'data/test_' + DATASET
    vocabulary = utils.Vocabulary(data_path, max_len=MAX_LEN, min_len=MIN_LEN,
                                  word_drop=WORD_DROP)
    utils.split_corpus(data_path, train_path, test_path, max_len=MAX_LEN,
                       min_len=MIN_LEN, ratio=RATIO, seed=SEED)
    train = utils.Corpus(train_path, vocabulary, max_len=MAX_LEN, min_len=MIN_LEN)
    test = utils.Corpus(test_path, vocabulary, max_len=MAX_LEN, min_len=MIN_LEN)
    train_generator = utils.Generator(train.corpus, vocabulary=vocabulary)
    test_generator = utils.Generator(test.corpus, vocabulary=vocabulary)

    # ======================
    # building model
    # ======================
    device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
    print(device)
    model = lm.LM(cell=CELL, vocab_size=vocabulary.vocab_size,
                  embed_size=EMBED_SIZE, hidden_dim=HIDDEN_DIM,
                  num_layers=NUM_LAYERS, dropout_rate=DROPOUT_RATE)
    model.to(device)
    total_params = sum(p.numel() for p in model.parameters())
    print("Total params: {:d}".format(total_params))
    total_trainable_params = sum(p.numel() for p in model.parameters()
                                 if p.requires_grad)
    print("Trainable params: {:d}".format(total_trainable_params))
    criterion = nn.NLLLoss(ignore_index=vocabulary.w2i["_PAD"])
    optimizer = optim.Adam(model.parameters(), lr=LEARNING_RATE,
                           betas=(0.9, 0.999), eps=1e-08, weight_decay=0,
                           amsgrad=False)
    print()

    # ======================
    # training and testing
    # ======================
    best_loss = 1000000
    step = 0
    if START_EPOCH > 0:
        model.load_state_dict(
            torch.load('models/' + DATASET + '-' + str(START_EPOCH) + '.pkl',
                       map_location=device))
    for epoch in range(START_EPOCH + 1, EPOCH + 1):
        train_g = train_generator.build_generator(BATCH_SIZE)
        test_g = test_generator.build_generator(BATCH_SIZE)
        train_loss = []
        model.train()
        while True:
            try:
                text = train_g.__next__()
            except StopIteration:
                break
            optimizer.zero_grad()
            text_in = text[:, :-1]
            text_target = text[:, 1:]
            y = model(torch.from_numpy(text_in).long().to(device))
            loss = criterion(
                y.reshape(-1, vocabulary.vocab_size),
                torch.from_numpy(text_target).reshape(-1).long().to(device))
            loss.backward()
            optimizer.step()
            train_loss.append(loss.item())
            step += 1
            torch.cuda.empty_cache()
            if step % PRINT_EVERY == 0:
                print('step {:d} training loss {:.4f}'.format(
                    step, loss.item()))
        test_loss = []
        model.eval()
        with torch.no_grad():
            while True:
                try:
                    text = test_g.__next__()
                except StopIteration:
                    break
                text_in = text[:, :-1]
                text_target = text[:, 1:]
                y = model(torch.from_numpy(text_in).long().to(device))
                loss = criterion(
                    y.reshape(-1, vocabulary.vocab_size),
                    torch.from_numpy(text_target).reshape(-1).long().to(
                        device))
                test_loss.append(loss.item())
                torch.cuda.empty_cache()
        print('epoch {:d} training loss {:.4f} test loss {:.4f}'.format(
            epoch, np.mean(train_loss), np.mean(test_loss)))
        if np.mean(test_loss) < best_loss:
            best_loss = np.mean(test_loss)
            print('-----------------------------------------------------')
            print('saving parameters')
            os.makedirs('models', exist_ok=True)
            torch.save(model.state_dict(),
                       'models/' + DATASET + '-' + str(epoch) + '.pkl')
            print('-----------------------------------------------------')
        if (epoch + 1) % GENERATE_EVERY == 0:
            model.eval()
            with torch.no_grad():
                # generating text
                x = torch.LongTensor([[vocabulary.w2i['_BOS']]] * 3).to(device)
                for i in range(MAX_GENERATE_LENGTH):
                    samp = model.sample(x)
                    x = torch.cat([x, samp], dim=1)
                x = x.cpu().numpy()
            print('-----------------------------------------------------')
            for i in range(x.shape[0]):
                print(' '.join([
                    vocabulary.i2w[_] for _ in list(x[i, :]) if _ not in [
                        vocabulary.w2i['_BOS'], vocabulary.w2i['_EOS'],
                        vocabulary.w2i['_PAD']
                    ]
                ]))
            print('-----------------------------------------------------')
def main(opt):
    prefix = ['latest_lm']
    dataset = ['train', 'valid']
    data, vocab = load_datasets(opt, dataset=dataset)
    logger.info('Loading data completed')
    opt.vocab_size = vocab.vocab_size
    init_scale = opt.init_scale
    logger.debug('Starting session...')
    sess_config = common_utils.get_tf_sess_config(opt)
    with tf.Session(config=sess_config) as sess:
        logger.debug('- Creating initializer ({} to {})'.format(
            -init_scale, init_scale))
        initializer = tf.random_uniform_initializer(-init_scale, init_scale)
        logger.debug('- Creating training model...')
        with tf.variable_scope('LM', reuse=None, initializer=initializer):
            model = lm.LM(opt)
            train_op, lr_var = lm.train_op(model, model.opt)
        logger.debug('- Creating validating model (reuse params)...')
        with tf.variable_scope('LM', reuse=True, initializer=initializer):
            vmodel = lm.LM(opt, is_training=False)
        logger.debug('Trainable variables:')
        for v in tf.trainable_variables():
            logger.debug("- {} {} {}".format(v.name, v.get_shape(), v.device))
        logger.info('Initializing variables...')
        sess.run(tf.global_variables_initializer())
        saver = tf.train.Saver()
        states = {}
        for p in prefix:
            states[p] = common_utils.get_initial_training_state()
        states, _ = resume_many_states(opt.output_dir, sess, saver,
                                       states, prefix)
        state = states[prefix[0]]
        state.learning_rate = opt.learning_rate
        logger.info('Start training loop:')
        logger.debug('\n' + common_utils.SUN_BRO())
        for epoch in range(state.epoch, opt.max_epochs):
            epoch_time = time.time()
            state.epoch = epoch
            logger.info("========= Start epoch {} =========".format(epoch + 1))
            sess.run(tf.assign(lr_var, state.learning_rate))
            logger.info("- Training LM with learning rate {}...".format(
                state.learning_rate))
            train_ppl, _ = run_epoch(sess, model, data['train'], opt,
                                     train_op=train_op)
            logger.info('- Validating LM...')
            valid_ppl, _ = run_epoch(sess, vmodel, data['valid'], opt)
            logger.info('----------------------------------')
            logger.info('LM post epoch routine...')
            done_training = run_post_epoch(train_ppl, valid_ppl, state, opt,
                                           sess=sess, saver=saver,
                                           best_prefix="best_lm",
                                           latest_prefix="latest_lm")
            logger.info('- Epoch time: {}s'.format(time.time() - epoch_time))
            if done_training:
                break
        logger.info('Done training at epoch {}'.format(state.epoch + 1))
def main(opt_lm, opt_dm):
    vocab_emb_path = opt_lm.shared_emb_vocab
    vocab_lm_path = os.path.join(opt_lm.data_dir, opt_lm.vocab_file)
    train_lm_path = os.path.join(opt_lm.data_dir, opt_lm.train_file)
    valid_lm_path = os.path.join(opt_lm.data_dir, opt_lm.valid_file)
    vocab_dm_path = os.path.join(opt_dm.data_dir, opt_dm.vocab_file)
    train_dm_path = os.path.join(opt_dm.data_dir, opt_dm.train_file)
    dm_emb_path = os.path.join(opt_dm.data_dir, 'emb.cpickle')
    logger.info('Loading data set...')
    logger.debug('- Loading vocab shared emb from {}'.format(vocab_emb_path))
    vocab_emb = data_utils.Vocabulary.from_vocab_file(vocab_emb_path)
    logger.debug('-- Shared emb vocab size: {}'.format(vocab_emb.vocab_size))
    logger.debug('- Loading vocab LM from {}'.format(vocab_lm_path))
    vocab_lm = data_utils.Vocabulary.from_vocab_file(vocab_lm_path)
    logger.debug('-- LM vocab size: {}'.format(vocab_lm.vocab_size))
    logger.debug('- Loading vocab DM from {}'.format(vocab_dm_path))
    vocab_dm = data_utils.Vocabulary.from_vocab_file(vocab_dm_path)
    logger.debug('-- DM vocab size: {}'.format(vocab_dm.vocab_size))
    if opt_lm.sen_independent:
        logger.debug('- Loading train LM data from {}'.format(train_lm_path))
        train_lm_iter = data_utils.SentenceIterator(vocab_lm, train_lm_path,
                                                    x_vocab=vocab_emb)
        logger.debug('- Loading valid LM data from {}'.format(valid_lm_path))
        valid_lm_iter = data_utils.SentenceIterator(vocab_lm, valid_lm_path,
                                                    x_vocab=vocab_emb)
    else:
        logger.debug('- Loading train LM data from {}'.format(train_lm_path))
        train_lm_iter = data_utils.DataIterator(vocab_lm, train_lm_path,
                                                x_vocab=vocab_emb)
        logger.debug('- Loading valid LM data from {}'.format(valid_lm_path))
        valid_lm_iter = data_utils.DataIterator(vocab_lm, valid_lm_path,
                                                x_vocab=vocab_emb)
    logger.debug('- Loading train DM data from {}'.format(train_dm_path))
    train_dm_iter = data_utils.SenLabelIterator(vocab_dm, train_dm_path,
                                                l_vocab=vocab_emb)
    opt_lm.vocab_size = vocab_lm.vocab_size
    opt_dm.vocab_size = vocab_dm.vocab_size
    opt_dm.num_steps = train_dm_iter._max_seq_len
    if opt_lm.shared_emb_lm_logit:
        logger.debug('-- Vocab mask detected, reloading LM data...')
        lm_vocab_mask = data_utils.Vocabulary.create_vocab_mask(
            vocab_lm, vocab_emb)
        if opt_lm.sen_independent:
            train_lm_iter = data_utils.SentenceIterator(
                vocab_emb, train_lm_path)
            valid_lm_iter = data_utils.SentenceIterator(
                vocab_emb, valid_lm_path)
        else:
            train_lm_iter = data_utils.DataIterator(vocab_emb, train_lm_path)
            valid_lm_iter = data_utils.DataIterator(vocab_emb, valid_lm_path)
        opt_lm.vocab_size = vocab_emb.vocab_size
        opt_lm.logit_mask = lm_vocab_mask
    logger.info('Loading data completed')

    init_scale = opt_lm.init_scale
    sess_config = tf.ConfigProto(log_device_placement=False)
    logger.info('Starting TF Session...')
    # with tf.device("/cpu:0"), tf.Session(config=sess_config) as sess:
    with tf.Session(config=sess_config) as sess:
        logger.debug('- Creating initializer ({} to {})'.format(
            -init_scale, init_scale))
        initializer = tf.random_uniform_initializer(-init_scale, init_scale)
        logger.debug('- Creating shared embedding variables...')
        with tf.variable_scope('shared_emb'):
            shared_emb_vars = lm.sharded_variable(
                'emb', [vocab_emb.vocab_size, opt_lm.emb_size],
                opt_lm.num_shards)
        logger.debug('- Loading embedding for DM...')
        with open(dm_emb_path) as ifp:
            emb_array = cPickle.load(ifp)
        dm_emb_init = tf.constant(emb_array, dtype=tf.float32)
        opt_lm.input_emb_vars = shared_emb_vars
        if opt_lm.shared_emb_lm_logit:
            opt_lm.softmax_w_vars = shared_emb_vars
        opt_dm.af_ex_emb_vars = shared_emb_vars
        opt_dm.input_emb_init = dm_emb_init
        opt_dm.input_emb_trainable = False
        logger.debug('- Creating training LM...')
        with tf.variable_scope('LM', reuse=None, initializer=initializer):
            train_lm = lm.LM(opt_lm, create_grads=False)
        logger.debug('- Creating validating LM (reuse params)...')
        with tf.variable_scope('LM', reuse=True, initializer=initializer):
            valid_lm = lm.LM(opt_lm, is_training=False)
        logger.debug('- Creating training DM...')
        with tf.variable_scope('DM', reuse=None, initializer=initializer):
            train_dm = lm.LMwAF(opt_dm, create_grads=False)
        logger.debug('- Creating training operation...')
        train_op, lr_var = get_joint_train_op(train_lm, train_dm, opt_lm,
                                              opt_dm)
        logger.debug('Trainable variables:')
        for v in tf.trainable_variables():
            logger.debug("- {} {} {}".format(v.name, v.get_shape(), v.device))
        logger.info('Initializing variables...')
        sess.run(tf.global_variables_initializer())
        saver = tf.train.Saver()
        state = common_utils.get_initial_training_state()
        state.learning_rate = opt_lm.learning_rate
        state, _ = resume_if_possible(opt_lm, sess, saver, state)
        logger.info('Start training loop:')
        logger.debug('\n' + common_utils.SUN_BRO())
        global_steps = 0
        train_dm_ppl = 0
        for epoch in range(state.epoch, opt_lm.max_epochs):
            epoch_time = time.time()
            logger.info("========= Start epoch {} =========".format(epoch + 1))
            sess.run(tf.assign(lr_var, state.learning_rate))
            logger.info("- Learning rate = {}".format(state.learning_rate))
            logger.info("Training...")
            train_lm_ppl, train_dm_ppl, steps = run_joint_epoch(
                sess, train_lm, train_dm, train_lm_iter, train_dm_iter,
                opt_lm, train_op)
            global_steps += steps
            logger.info("Validating LM...")
            valid_lm_ppl, vsteps = run_epoch(sess, valid_lm, valid_lm_iter,
                                             opt_lm)
            logger.info('DM PPL = {}, Train ppl = {}, Valid ppl = {}'.format(
                train_dm_ppl, train_lm_ppl, valid_lm_ppl))
            logger.info('----------------------------------')
            logger.info('Post epoch routine...')
            state.epoch = epoch + 1
            state.val_ppl = valid_lm_ppl
            if valid_lm_ppl < state.best_val_ppl:
                logger.info('- Best PPL: {} -> {}'.format(
                    state.best_val_ppl, valid_lm_ppl))
                logger.info('- Epoch: {} -> {}'.format(state.best_epoch + 1,
                                                       epoch + 1))
                state.best_val_ppl = valid_lm_ppl
                state.best_epoch = epoch
                ckpt_path = os.path.join(opt_lm.output_dir, "best_model.ckpt")
                state_path = os.path.join(opt_lm.output_dir, "best_state.json")
                logger.info('- Saving best model to {}'.format(ckpt_path))
                saver.save(sess, ckpt_path)
                with open(state_path, 'w') as ofp:
                    json.dump(vars(state), ofp)
            else:
                logger.info('- No improvement!')
            done_training = update_lr(opt_lm, state)
            ckpt_path = os.path.join(opt_lm.output_dir, "latest_model.ckpt")
            state_path = os.path.join(opt_lm.output_dir, "latest_state.json")
            logger.info('End of epoch {}: '.format(epoch + 1))
            logger.info('- Saving model to {}'.format(ckpt_path))
            logger.info('- Epoch time: {}s'.format(time.time() - epoch_time))
            saver.save(sess, ckpt_path)
            with open(state_path, 'w') as ofp:
                json.dump(vars(state), ofp)
            if done_training:
                break
            logger.debug('Updated state:\n{}'.format(state.__repr__()))
        logger.info('Done training at epoch {}'.format(state.epoch + 1))
def main(lm_opt, dm_opt):
    prefix = ['latest_lm', 'latest_dm']
    dataset = ['train', 'valid']
    shortlist_vocab_path = lm_opt.shortlist_path
    dm_emb_path = os.path.join(dm_opt.data_dir, 'emb.cpickle')
    logger.debug(
        '- Loading shortlist vocab from {}'.format(shortlist_vocab_path))
    short_vocab = data_utils.Vocabulary.from_vocab_file(shortlist_vocab_path)
    logger.debug('-- Shortlist vocab size: {}'.format(short_vocab.vocab_size))
    lm_data, lm_vocab = load_datasets(lm_opt, dataset=dataset,
                                      y_vocab=short_vocab)
    dm_data, dm_vocab = load_datasets(
        dm_opt, dataset=dataset, iterator_type=data_utils.SenLabelIterator,
        l_vocab=short_vocab)
    lm_opt.vocab_size = lm_vocab.vocab_size
    dm_opt.vocab_size = dm_vocab.vocab_size
    lm_vocab_mask = data_utils.Vocabulary.create_vocab_mask(lm_vocab,
                                                            short_vocab)
    lm_opt.logit_mask = lm_vocab_mask
    logger.info('Loading data completed')

    init_scale = lm_opt.init_scale
    sess_config = common_utils.get_tf_sess_config(lm_opt)
    logger.info('Starting TF Session...')
    with tf.Session(config=sess_config) as sess:
        logger.debug('- Creating initializer ({} to {})'.format(
            -init_scale, init_scale))
        initializer = tf.random_uniform_initializer(-init_scale, init_scale)
        logger.debug('- Creating shared embedding variables...')
        with tf.variable_scope('shared_emb'):
            shared_emb_vars = lm.sharded_variable(
                'emb', [short_vocab.vocab_size, lm_opt.emb_size],
                lm_opt.num_shards)
        logger.debug('- Loading embedding for DM...')
        with open(dm_emb_path) as ifp:
            emb_array = cPickle.load(ifp)
        dm_emb_init = tf.constant(emb_array, dtype=tf.float32)
        lm_opt.softmax_w_vars = shared_emb_vars
        dm_opt.af_ex_emb_vars = shared_emb_vars
        dm_opt.input_emb_init = dm_emb_init
        dm_opt.input_emb_trainable = False
        logger.debug('- Creating training LM...')
        with tf.variable_scope('LM', reuse=None, initializer=initializer):
            lm_train = lm.LM(lm_opt)
            lm_train_op, lm_lr_var = lm.train_op(lm_train, lm_opt)
        logger.debug('- Creating validating LM (reuse params)...')
        with tf.variable_scope('LM', reuse=True, initializer=initializer):
            lm_valid = lm.LM(lm_opt, is_training=False)
        logger.debug('- Creating training DM...')
        with tf.variable_scope('DM', reuse=None, initializer=initializer):
            dm_train = lm.LMwAF(dm_opt)
            dm_train_op, dm_lr_var = lm.train_op(dm_train, dm_opt)
        logger.debug('- Creating validating DM (reuse params)...')
        with tf.variable_scope('DM', reuse=True, initializer=initializer):
            dm_valid = lm.LMwAF(dm_opt, is_training=False)
        logger.debug('Trainable variables:')
        for v in tf.trainable_variables():
            logger.debug("- {} {} {}".format(v.name, v.get_shape(), v.device))
        logger.info('Initializing variables...')
        sess.run(tf.global_variables_initializer())
        saver = tf.train.Saver()
        states = {}
        for p in prefix:
            states[p] = common_utils.get_initial_training_state()
        states, _ = resume_many_states(lm_opt.output_dir, sess, saver,
                                       states, prefix)
        lm_state = states[prefix[0]]
        dm_state = states[prefix[1]]
        lm_state.learning_rate = lm_opt.learning_rate
        dm_state.learning_rate = dm_opt.learning_rate
        logger.info('Start training loop:')
        logger.debug('\n' + common_utils.SUN_BRO())
        for epoch in range(lm_state.epoch, lm_opt.max_epochs):
            epoch_time = time.time()
            logger.info("========= Start epoch {} =========".format(epoch + 1))
            sess.run(tf.assign(lm_lr_var, lm_state.learning_rate))
            sess.run(tf.assign(dm_lr_var, dm_state.learning_rate))
            logger.info("- Training DM with learning rate {}...".format(
                dm_state.learning_rate))
            dm_train_ppl, _ = run_epoch(sess, dm_train, dm_data['train'],
                                        dm_opt, train_op=dm_train_op)
            logger.info('- Validating DM...')
            dm_valid_ppl, _ = run_epoch(sess, dm_valid, dm_data['valid'],
                                        dm_opt)
            logger.info("- Training LM with learning rate {}...".format(
                lm_state.learning_rate))
            lm_train_ppl, _ = run_epoch(sess, lm_train, lm_data['train'],
                                        lm_opt, train_op=lm_train_op)
            logger.info('- Validating LM...')
            lm_valid_ppl, _ = run_epoch(sess, lm_valid, lm_data['valid'],
                                        lm_opt)
            logger.info('----------------------------------')
            logger.info('LM post epoch routine...')
            done_training = run_post_epoch(lm_train_ppl, lm_valid_ppl,
                                           lm_state, lm_opt, sess=sess,
                                           saver=saver,
                                           best_prefix="best_lm",
                                           latest_prefix="latest_lm")
            logger.info('----------------------------------')
            logger.info('DM post epoch routine...')
            run_post_epoch(dm_train_ppl, dm_valid_ppl, dm_state, dm_opt,
                           best_prefix="best_dm", latest_prefix="latest_dm")
            logger.info('- Epoch time: {}s'.format(time.time() - epoch_time))
            if done_training:
                break
        logger.info('Done training at epoch {}'.format(lm_state.epoch + 1))
def main():
    # ======================
    # hyper-parameters
    # ======================
    CELL = "lstm"  # rnn, gru, lstm
    DATASET = 'movie'  # movie, news, tweet
    WORD_DROP = 10
    MIN_LEN = 5
    MAX_LEN = 200
    EMBED_SIZE = 350
    HIDDEN_DIM = 512
    NUM_LAYERS = 2
    DROPOUT_RATE = 0.0
    MAX_GENERATE_LENGTH = 200
    GENERATE_NUM = 1000
    if DATASET == 'movie':
        LOAD_EPOCH = 6
    elif DATASET == 'news':
        LOAD_EPOCH = 10
    elif DATASET == 'tweet':
        LOAD_EPOCH = 6
    else:
        raise Exception
    all_var = locals()
    print()
    for var in all_var:
        if var != "var_name":
            print("{0:15} ".format(var), all_var[var])
    print()

    # ======================
    # data
    # ======================
    data_path = '../../data/' + DATASET + '2020.txt'
    vocabulary = utils.Vocabulary(
        data_path,
        max_len=MAX_LEN,
        min_len=MIN_LEN,
        word_drop=WORD_DROP
    )

    # ======================
    # building model
    # ======================
    device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
    print(device)
    model = lm.LM(
        cell=CELL,
        vocab_size=vocabulary.vocab_size,
        embed_size=EMBED_SIZE,
        hidden_dim=HIDDEN_DIM,
        num_layers=NUM_LAYERS,
        dropout_rate=DROPOUT_RATE
    )
    model.to(device)
    total_params = sum(p.numel() for p in model.parameters())
    print("Total params: {:d}".format(total_params))
    total_trainable_params = sum(p.numel() for p in model.parameters()
                                 if p.requires_grad)
    print("Trainable params: {:d}".format(total_trainable_params))
    model.load_state_dict(
        torch.load('../../models/' + DATASET + '-' + str(LOAD_EPOCH) + '.pkl',
                   map_location=device))
    print('checkpoint loaded')
    print()

    # ======================
    # start steganography
    # ======================
    # read bit streams
    with open('../../bit_stream/bit_stream.txt', 'r', encoding='utf8') as f:
        bit_stream = f.read().strip()
    bit_stream += bit_stream
    bit_index = int(torch.randint(0, high=1000, size=(1,)))
    model.eval()
    with torch.no_grad():
        all_kl = []
        stega_text = []
        stega_bits = []
        while len(stega_text) < 10:
            stega_sentence = []
            stega_bit = ''
            x = torch.LongTensor([[vocabulary.w2i['_BOS']]]).to(device)
            samp = model.sample(x)
            stega_sentence.append(vocabulary.i2w[samp.reshape(-1).cpu().numpy()[0]])
            x = torch.cat([x, samp], dim=1)

            for i in range(MAX_GENERATE_LENGTH - 1):
                print(len(stega_text), i)
                if '_EOS' in stega_sentence:
                    break
                # conditional probability distribution
                logits = model(x, logits=True)
                logits = logits[:, -1, :].reshape(-1)
                logits[1] = -1e20
                log_prob = F.log_softmax(logits, dim=-1)
                prob = F.softmax(logits, dim=-1)

                def split2(prob, indices, k):
                    prob = prob / prob.sum()
                    if prob[0] > 0.5:
                        return None
                    bit = 1
                    while (1 / 2 ** (bit + 1)) > prob[0]:
                        bit += 1
                    mean = 1 / 2 ** bit
                    # dp
                    prob = prob.tolist()
                    indices = indices.tolist()
                    result = []
                    for i in range(2 ** bit):
                        result.append([[], [], []])
                    for i in range(2 ** bit - 1):
                        result[i][0].append(prob[0])
                        result[i][1].append(indices[0])
                        result[i][2] = k + bit
                        del (prob[0])
                        del (indices[0])
                        while sum(result[i][0]) < mean:
                            delta = mean - sum(result[i][0])
                            index = near(prob, delta)
                            if prob[index] - delta < delta:
                                result[i][0].append(prob[index])
                                result[i][1].append(indices[index])
                                del (prob[index])
                                del (indices[index])
                            else:
                                break
                        mean = sum(prob) / (2 ** bit - i - 1)
                    result[2 ** bit - 1][0].extend(prob)
                    result[2 ** bit - 1][1].extend(indices)
                    result[2 ** bit - 1][2] = k + bit
                    return result

                queue = collections.deque()
                queue.append((torch.LongTensor(list(range(prob.shape[0]))), 0))
                f = []
                while len(queue) > 0:
                    indices, k = queue.popleft()
                    p = prob[indices]
                    p, ind = p.sort(descending=True)
                    indices = indices[ind]
                    result = split2(p, indices, k)
                    if result is None:
                        f.append((indices, k))
                    else:
                        for _ in range(len(result)):
                            queue.append((torch.LongTensor(result[_][1]),
                                          result[_][2]))
                q = prob.clone()
                q[:] = 0
                for ind, k in f:
                    q[ind] = prob[ind] / prob[ind].sum() * 0.5 ** k

                prob, indices = prob.sort(descending=True)
                # start recursion
                bit_tmp = 0
                while prob[0] <= 0.5:
                    # embedding bit
                    bit = 1
                    while (1 / 2 ** (bit + 1)) > prob[0]:
                        bit += 1
                    mean = 1 / 2 ** bit
                    # dp
                    prob = prob.tolist()
                    indices = indices.tolist()
                    result = []
                    for i in range(2 ** bit):
                        result.append([[], []])
                    for i in range(2 ** bit - 1):
                        result[i][0].append(prob[0])
                        result[i][1].append(indices[0])
                        del (prob[0])
                        del (indices[0])
                        while sum(result[i][0]) < mean:
                            delta = mean - sum(result[i][0])
                            index = near(prob, delta)
                            if prob[index] - delta < delta:
                                result[i][0].append(prob[index])
                                result[i][1].append(indices[index])
                                del (prob[index])
                                del (indices[index])
                            else:
                                break
                        mean = sum(prob) / (2 ** bit - i - 1)
                    result[2 ** bit - 1][0].extend(prob)
                    result[2 ** bit - 1][1].extend(indices)
                    # read secret message
                    bit_embed = [int(_) for _ in
                                 bit_stream[bit_index + bit_tmp:bit_index + bit_tmp + bit]]
                    int_embed = bits2int(bit_embed)
                    # updating
                    prob = torch.FloatTensor(result[int_embed][0]).to(device)
                    indices = torch.LongTensor(result[int_embed][1]).to(device)
                    prob = prob / prob.sum()
                    prob, _ = prob.sort(descending=True)
                    indices = indices[_]
                    bit_tmp += bit

                # terminate
                gen = int(indices[int(torch.multinomial(prob, 1))])
                stega_sentence += [vocabulary.i2w[gen]]
                if vocabulary.i2w[gen] == '_EOS':
                    break
                x = torch.cat([x, torch.LongTensor([[gen]]).to(device)],
                              dim=1).to(device)
                stega_bit += bit_stream[bit_index:bit_index + bit_tmp]
                bit_index += bit_tmp
                all_kl.append(kl(q, q.log(), log_prob))

            # check
            if '_EOS' in stega_sentence:
                stega_sentence.remove('_EOS')
            if (len(stega_sentence) <= MAX_LEN) and (len(stega_sentence) >= MIN_LEN):
                stega_text.append(stega_sentence)
                stega_bits.append(stega_bit)
        print(np.mean(all_kl))
def read_lm(src):
    return lm.LM(src)
def main():
    # ======================
    # hyper-parameters
    # ======================
    CELL = "lstm"  # rnn, gru, lstm
    DATASET = 'tweet'  # movie, news, tweet
    WORD_DROP = 10
    MIN_LEN = 5
    MAX_LEN = 200
    EMBED_SIZE = 350
    HIDDEN_DIM = 512
    NUM_LAYERS = 2
    DROPOUT_RATE = 0.0
    MAX_GENERATE_LENGTH = 200
    GENERATE_NUM = 1000
    if DATASET == 'movie':
        LOAD_EPOCH = 6
    elif DATASET == 'news':
        LOAD_EPOCH = 10
    elif DATASET == 'tweet':
        LOAD_EPOCH = 6
    else:
        raise Exception
    all_var = locals()
    print()
    for var in all_var:
        if var != "var_name":
            print("{0:15} ".format(var), all_var[var])
    print()

    # ======================
    # data
    # ======================
    data_path = 'data/' + DATASET + '2020.txt'
    vocabulary = utils.Vocabulary(data_path, max_len=MAX_LEN, min_len=MIN_LEN,
                                  word_drop=WORD_DROP)

    # ======================
    # building model
    # ======================
    device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
    print(device)
    model = lm.LM(cell=CELL, vocab_size=vocabulary.vocab_size,
                  embed_size=EMBED_SIZE, hidden_dim=HIDDEN_DIM,
                  num_layers=NUM_LAYERS, dropout_rate=DROPOUT_RATE)
    model.to(device)
    total_params = sum(p.numel() for p in model.parameters())
    print("Total params: {:d}".format(total_params))
    total_trainable_params = sum(p.numel() for p in model.parameters()
                                 if p.requires_grad)
    print("Trainable params: {:d}".format(total_trainable_params))
    model.load_state_dict(
        torch.load('models/' + DATASET + '-' + str(LOAD_EPOCH) + '.pkl',
                   map_location=device))
    print('checkpoint loaded')
    print()

    # ======================
    # starting steganography
    # ======================
    os.makedirs('stego/' + DATASET, exist_ok=True)
    # read bit streams
    with open('bit_stream/bit_stream.txt', 'r', encoding='utf8') as f:
        bit_stream = f.read().strip()
    bit_stream += bit_stream
    bit_index = int(torch.randint(0, high=1000, size=(1, )))
    model.eval()
    with torch.no_grad():
        stega_text = []
        stega_bits = []
        while len(stega_text) < GENERATE_NUM:
            print(len(stega_text))
            stega_sentence = []
            stega_bit = ''
            x = torch.LongTensor([[vocabulary.w2i['_BOS']]]).to(device)
            samp = model.sample(x)
            stega_sentence.append(
                vocabulary.i2w[samp.reshape(-1).cpu().numpy()[0]])
            x = torch.cat([x, samp], dim=1)

            for i in range(MAX_GENERATE_LENGTH - 1):
                if '_EOS' in stega_sentence:
                    break
                # conditional probability distribution
                log_prob = model(x)
                prob = torch.exp(log_prob)[:, -1, :].reshape(-1)
                prob[1] = 0  # set unk to zero
                prob = prob / prob.sum()
                prob, indices = prob.sort(descending=True)
                # start recursion
                bit_tmp = 0
                while prob[0] <= 0.5:
                    # embedding bit
                    bit = 1
                    while (1 / 2**(bit + 1)) > prob[0]:
                        bit += 1
                    mean = 1 / 2**bit
                    # dp
                    prob = prob.tolist()
                    indices = indices.tolist()
                    result = []
                    for i in range(2**bit):
                        result.append([[], []])
                    for i in range(2**bit - 1):
                        result[i][0].append(prob[0])
                        result[i][1].append(indices[0])
                        del (prob[0])
                        del (indices[0])
                        while sum(result[i][0]) < mean:
                            delta = mean - sum(result[i][0])
                            index = near(prob, delta)
                            if prob[index] - delta < delta:
                                result[i][0].append(prob[index])
                                result[i][1].append(indices[index])
                                del (prob[index])
                                del (indices[index])
                            else:
                                break
                        mean = sum(prob) / (2**bit - i - 1)
                    result[2**bit - 1][0].extend(prob)
                    result[2**bit - 1][1].extend(indices)
                    # read secret message
                    bit_embed = [
                        int(_) for _ in
                        bit_stream[bit_index + bit_tmp:bit_index + bit_tmp + bit]
                    ]
                    int_embed = bits2int(bit_embed)
                    # updating
                    prob = torch.FloatTensor(result[int_embed][0]).to(device)
                    indices = torch.LongTensor(result[int_embed][1]).to(device)
                    prob = prob / prob.sum()
                    prob, _ = prob.sort(descending=True)
                    indices = indices[_]
                    bit_tmp += bit

                # terminate
                gen = int(indices[int(torch.multinomial(prob, 1))])
                stega_sentence += [vocabulary.i2w[gen]]
                if vocabulary.i2w[gen] == '_EOS':
                    break
                x = torch.cat([x, torch.LongTensor([[gen]]).to(device)],
                              dim=1).to(device)
                stega_bit += bit_stream[bit_index:bit_index + bit_tmp]
                bit_index += bit_tmp

            # check
            if '_EOS' in stega_sentence:
                stega_sentence.remove('_EOS')
            if (len(stega_sentence) <= MAX_LEN) and (len(stega_sentence) >= MIN_LEN):
                stega_text.append(stega_sentence)
                stega_bits.append(stega_bit)

    # write files
    with open('stego/' + DATASET + '/adg.txt', 'w', encoding='utf8') as f:
        for sentence in stega_text:
            f.write(' '.join(sentence) + '\n')
    with open('stego/' + DATASET + '/adg.bit', 'w', encoding='utf8') as f:
        for bits in stega_bits:
            f.write(bits + '\n')
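# NOTE (added sketch): both steganographic generators above call two helpers,
# near() and bits2int(), that are not shown in these excerpts. The versions
# below are plausible stand-ins inferred from how they are used (a
# nearest-probability lookup and an MSB-first bit-to-integer conversion);
# they are assumptions, not the original implementations.
def near(prob_list, delta):
    # Return the index of the probability closest to the remaining mass delta.
    best_index, best_diff = 0, float('inf')
    for i, p in enumerate(prob_list):
        diff = abs(p - delta)
        if diff < best_diff:
            best_index, best_diff = i, diff
    return best_index

def bits2int(bits):
    # Interpret a list of 0/1 integers as an unsigned integer, MSB first.
    value = 0
    for b in bits:
        value = (value << 1) | b
    return value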
def load(self):
    self.ds = pickle.load(open(self.datasetPath, 'rb'))
    self.lm = L.LM(
        len(self.ds.char2id) + 2, self.ds.id2char, self.charVecPath,
        self.uniProbDictPath)
    serializers.load_npz(self.modelPath, self.lm)