Example #1
def learn_lm(weighted_strings, weighted_string_pairs):
    result = lm.LM()
    tmp = []
    for ((s1, w1), (s2, w2)) in weighted_string_pairs:
        tmp = tmp + [(s1, w1), (s2, w2)]
    result.learn_lm(weighted_strings + tmp)
    return result
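A hypothetical call, assuming lm.LM.learn_lm accepts a list of (string, weight) pairs; the strings and weights below are illustrative only:

model = learn_lm([('hello', 1.0)], [(('hi', 0.5), ('hey', 0.5))])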
Example #2
    def setupLM(self):
        # language model
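        # charVecPath and uniProbDictPath are assumed to be defined elsewhere (not shown in this snippet)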
        self.lm = L.LM(
            len(self.ds.char2id) + 2, self.ds.id2char, charVecPath,
            uniProbDictPath)

        self.lm.bowIndice = len(self.ds.char2id)
        self.lm.eowIndice = self.lm.bowIndice + 1
Example #3
    def __init__(self):
        self.ds_train = dataset.Dataset(config.trainPath, config.dictPath)
        self.ds_valid = dataset.Dataset(config.validPath, config.dictPath)
        self.ds_test = dataset.Dataset(config.testPath, config.dictPath)

        vocSize = len(self.ds_train.word2id)
        maxSeqLen = max([len(idLine) for idLine in self.ds_train.idData])
        padId = self.ds_train.word2id['<PAD>']
        self.model = lm.LM(vocSize, maxSeqLen, padId)
    # vocSize+1 for the padding index

        self.loss_log = logger.Logger('../result/loss.log')
        self.eval_log = logger.Logger('../result/eval.log')
Example #4
def main(lm_opt):
    model_prefix = ['best_lm']
    dataset = ['test']
    lm_data, lm_vocab = load_datasets(lm_opt, dataset=dataset)
    lm_opt.vocab_size = lm_vocab.vocab_size
    logger.info('Loading data completed')
    init_scale = lm_opt.init_scale
    sess_config = common_utils.get_tf_sess_config(lm_opt)
    logger.info('Starting TF Session...')
    with tf.Session(config=sess_config) as sess:
        logger.info('Creating model...')
        logger.debug('- Creating initializer ({} to {})'.format(
            -init_scale, init_scale))
        initializer = tf.random_uniform_initializer(-init_scale, init_scale)
        logger.debug('- Creating test LM...')
        with tf.variable_scope('LM', reuse=None, initializer=initializer):
            model = lm.LM(lm_opt, is_training=False)
        logger.debug('Trainable variables:')
        for v in tf.trainable_variables():
            logger.debug("- {} {} {}".format(v.name, v.get_shape(), v.device))
        logger.info('Initializing variables...')
        sess.run(tf.global_variables_initializer())
        saver = tf.train.Saver()
        states = {}
        for p in model_prefix:
            states[p] = common_utils.get_initial_training_state()
        _, success = resume_many_states(lm_opt.output_dir, sess, saver, states,
                                        model_prefix)
        if not success:
            logger.error('Failed to load the model. Testing aborted.')
            return
        logger.info('Testing...')
        token_loss = None
        if lm_opt.out_token_loss_file is not None:
            token_loss = []
        ppl, _ = run_epoch(sess,
                           model,
                           lm_data['test'],
                           lm_opt,
                           token_loss=token_loss)
        logger.info('PPL = {}'.format(ppl))
        if token_loss is not None:
            logger.info('Writing token loss...')
            token_loss_path = os.path.join(lm_opt.output_dir,
                                           lm_opt.out_token_loss_file)
            with open(token_loss_path, 'w') as ofp:
                for p in token_loss:
                    ofp.write("{}\t{}\n".format(lm_vocab.i2w(p[0]), p[1]))
Example #5
import torch
import torch.optim as optim
import lm as L  # the lm.py we wrote
import dataset  # the dataset.py we wrote
maxEpoch = 10  # maximum number of training epochs
dictPath = '../model/dicts.pickle'
modelPath = '../model/lm_%d.model'
trainDataPath = '../data/ptb.train.txt'

ds = dataset.Dataset(trainDataPath)
ds.save(dictPath)

lm = L.LM(vocSize=len(ds.word2id))
lm.train()

opt = optim.Adam(lm.parameters())

for ep in range(maxEpoch):
    accLoss = 0
    for idLine in ds.idData:
        opt.zero_grad()
        loss = lm.getLoss(idLine)
        loss.backward()
        opt.step()
        accLoss += loss.item()  # accumulate as a plain float

    print('epoch:', ep)
    print('loss:', accLoss)
    torch.save(lm, modelPath % (ep + 1))
Example #6
import torch
import lm as L
import dataset

epoch = 0

dictPath = '../model/dicts.pickle'
testDataPath = '../data/ptb.test.txt'
modelPath = '../model/lm_%d.model'

ds = dataset.Dataset()

ds.load(dictPath)
ds.setData(testDataPath)
ds.setIdData()

lm = L.LM(len(ds.word2id))
lm = torch.load(modelPath % epoch)
lm.eval()  # switch to evaluation mode

H = 0
W = 0
for idLine in ds.idData:
    H += lm.getSentenceLogProb(idLine)
    W += len(idLine) - 1
H /= W
print('entropy:', H)
print('PPL:', 2**H)
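The printed entropy and PPL are consistent only if getSentenceLogProb returns the sentence's negative base-2 log-probability (in bits); under that assumption, a minimal worked check of the same formula with made-up numbers:

# hypothetical totals: 20.0 bits of negative log2-probability over 10 predicted tokens
H = 20.0 / 10        # average bits per token
print('PPL:', 2**H)  # 4.0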
Example #7
def main():
    # ======================
    # hyperparameters
    # ======================
    CELL = "lstm"  # rnn, gru, lstm
    DATASET = 'movie'
    RATIO = 0.9
    WORD_DROP = 10
    MIN_LEN = 5
    MAX_LEN = 200
    BATCH_SIZE = 32
    SEQUENCE_LEN = 50
    EMBED_SIZE = 128
    HIDDEN_DIM = 256
    NUM_LAYERS = 2
    DROPOUT_RATE = 0.0
    EPOCH = 300
    LEARNING_RATE = 0.01
    MAX_GENERATE_LENGTH = 20
    GENERATE_EVERY = 5
    SEED = 100

    all_var = locals()
    print()
    for var in all_var:
        if var != "var_name":
            print("{0:15}   ".format(var), all_var[var])
    print()

    # ======================
    # data
    # ======================
    data_path = '../../__data/ROCStories.txt'
    train_path = 'train_roc'
    test_path = 'test_roc'
    vocabulary = utils.Vocabulary(data_path,
                                  max_len=MAX_LEN,
                                  min_len=MIN_LEN,
                                  word_drop=WORD_DROP)
    utils.split_corpus(data_path,
                       train_path,
                       test_path,
                       max_len=MAX_LEN,
                       min_len=MIN_LEN,
                       ratio=RATIO,
                       seed=SEED)
    train = utils.Corpus(train_path,
                         vocabulary,
                         max_len=MAX_LEN,
                         min_len=MIN_LEN)
    test = utils.Corpus(test_path,
                        vocabulary,
                        max_len=MAX_LEN,
                        min_len=MIN_LEN)
    train_generator = utils.Generator(train.corpus)
    test_generator = utils.Generator(test.corpus)

    # ======================
    # building model
    # ======================
    device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
    model = lm.LM(cell=CELL,
                  vocab_size=vocabulary.vocab_size,
                  embed_size=EMBED_SIZE,
                  hidden_dim=HIDDEN_DIM,
                  num_layers=NUM_LAYERS,
                  dropout_rate=DROPOUT_RATE)
    model.to(device)
    summary(model, (20, ))
    criterion = nn.NLLLoss()
    optimizer = optim.SGD(model.parameters(), lr=LEARNING_RATE)
    # optimizer = torch.optim.Adam(textRNN.parameters(), lr=0.001, betas=(0.9, 0.999), eps=1e-08, weight_decay=0, amsgrad=False)
    print()

    # ======================
    # training and testing
    # ======================
    best_loss = 1000000
    for epoch in range(EPOCH):
        train_g = train_generator.build_generator(BATCH_SIZE, SEQUENCE_LEN)
        test_g = test_generator.build_generator(BATCH_SIZE, SEQUENCE_LEN)
        train_loss = []
        while True:
            try:
                text = train_g.__next__()
            except StopIteration:  # end of the epoch's data
                break
            optimizer.zero_grad()
            y = model(torch.from_numpy(text[:, :-1]).long().to(device))
            loss = criterion(
                y.reshape(-1, vocabulary.vocab_size),
                torch.from_numpy(text[:, 1:]).reshape(-1).long().to(device))
            loss.backward()
            optimizer.step()
            train_loss.append(loss.item())

        test_loss = []
        while True:
            with torch.no_grad():
                try:
                    text = test_g.__next__()
                except StopIteration:  # end of the epoch's data
                    break
                y = model(torch.from_numpy(text[:, :-1]).long().to(device))
                loss = criterion(
                    y.reshape(-1, vocabulary.vocab_size),
                    torch.from_numpy(text[:,
                                          1:]).reshape(-1).long().to(device))
                test_loss.append(loss.item())

        print('epoch {:d}   training loss {:.4f}    test loss {:.4f}'.format(
            epoch + 1, np.mean(train_loss), np.mean(test_loss)))

        if np.mean(test_loss) < best_loss:
            best_loss = np.mean(test_loss)
            print('-----------------------------------------------------')
            print('saving parameters')
            os.makedirs('models', exist_ok=True)
            torch.save(model.state_dict(),
                       'models/' + DATASET + '-' + str(epoch) + '.pkl')
            print('-----------------------------------------------------')

        if (epoch + 1) % GENERATE_EVERY == 0:
            with torch.no_grad():
                # generate text
                x = torch.LongTensor([[vocabulary.w2i['_BOS']]] * 3).to(device)
                for i in range(MAX_GENERATE_LENGTH):
                    samp = model.sample(x)
                    x = torch.cat([x, samp], dim=1)
                x = x.cpu().numpy()
            print('-----------------------------------------------------')
            for i in range(x.shape[0]):
                print(' '.join([
                    vocabulary.i2w[_] for _ in list(x[i, :]) if _ not in [
                        vocabulary.w2i['_BOS'], vocabulary.w2i['_EOS'],
                        vocabulary.w2i['_PAD']
                    ]
                ]))
            print('-----------------------------------------------------')
Example #8
# The stop condition for the optimization routine
maxIts = 1000
dcostTol = 1e-5
dxTol = 1e-5


def stop(x, fx, dfdx, cost, it, step, dcost, dx, scales, updated):
    if dcost is None or dx is None or np.isnan(dcost) or np.isinf(dcost):
        done = False
    else:
        done = it >= maxIts
        if not done and updated:
            done = np.abs(dcost) < dcostTol or np.linalg.norm(dx) < dxTol
    return done


# Optimize
tryLM = True
if tryLM:
    x, fx, dfdx, cost, it, step, dcost, dx, scales, updated = lm.LM(np).solve(f, df, x0, stop=stop, scaling=False)
    print("it", it)
    print("x", x)
    print("cost", cost)
    print("dcost", dcost)
    print("np.linalg.norm(dx)", np.linalg.norm(dx))
else:
    F = lambda x: f(x).sum()
    sol = scipy.optimize.minimize(F, x0, tol=1e-20)
    print(sol)
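The snippet above assumes a residual function f, its Jacobian df, and an initial guess x0 defined earlier in the script; a minimal sketch of what they might look like, with a toy curve-fitting problem and illustrative values (it also assumes lm.LM(np).solve expects a residual vector and its Jacobian):

import numpy as np

# Toy least-squares problem: fit y ~ a * exp(b * t); the parameter vector is x = [a, b]
t = np.array([0.0, 1.0, 2.0])
y = np.array([1.0, 2.7, 7.4])

f = lambda x: x[0] * np.exp(x[1] * t) - y               # residual vector, shape (3,)
df = lambda x: np.stack([np.exp(x[1] * t),              # d residual / d a
                         x[0] * t * np.exp(x[1] * t)],  # d residual / d b
                        axis=1)                         # Jacobian, shape (3, 2)
x0 = np.array([1.0, 0.5])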
Example #9
def main():
    # ======================
    # hyper-parameters
    # ======================
    CELL = "lstm"  # rnn, gru, lstm
    DATASET = 'tweet'  # movie, news, tweet
    RATIO = 0.9
    WORD_DROP = 10
    MIN_LEN = 5
    MAX_LEN = 200
    BATCH_SIZE = 32
    EMBED_SIZE = 350
    HIDDEN_DIM = 512
    NUM_LAYERS = 2
    DROPOUT_RATE = 0.0
    START_EPOCH = 0
    EPOCH = 30
    LEARNING_RATE = 0.001
    MAX_GENERATE_LENGTH = 20
    GENERATE_EVERY = 5
    PRINT_EVERY = 1
    SEED = 100

    all_var = locals()
    print()
    for var in all_var:
        if var != "var_name":
            print("{0:15}   ".format(var), all_var[var])
    print()

    # ======================
    # data
    # ======================
    data_path = 'data/' + DATASET + '2020.txt'
    train_path = 'data/train_' + DATASET
    test_path = 'data/test_' + DATASET
    vocabulary = utils.Vocabulary(data_path,
                                  max_len=MAX_LEN,
                                  min_len=MIN_LEN,
                                  word_drop=WORD_DROP)
    utils.split_corpus(data_path,
                       train_path,
                       test_path,
                       max_len=MAX_LEN,
                       min_len=MIN_LEN,
                       ratio=RATIO,
                       seed=SEED)
    train = utils.Corpus(train_path,
                         vocabulary,
                         max_len=MAX_LEN,
                         min_len=MIN_LEN)
    test = utils.Corpus(test_path,
                        vocabulary,
                        max_len=MAX_LEN,
                        min_len=MIN_LEN)
    train_generator = utils.Generator(train.corpus, vocabulary=vocabulary)
    test_generator = utils.Generator(test.corpus, vocabulary=vocabulary)

    # ======================
    # building model
    # ======================
    device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
    print(device)
    model = lm.LM(cell=CELL,
                  vocab_size=vocabulary.vocab_size,
                  embed_size=EMBED_SIZE,
                  hidden_dim=HIDDEN_DIM,
                  num_layers=NUM_LAYERS,
                  dropout_rate=DROPOUT_RATE)
    model.to(device)
    total_params = sum(p.numel() for p in model.parameters())
    print("Total params: {:d}".format(total_params))
    total_trainable_params = sum(p.numel() for p in model.parameters()
                                 if p.requires_grad)
    print("Trainable params: {:d}".format(total_trainable_params))
    criterion = nn.NLLLoss(ignore_index=vocabulary.w2i["_PAD"])
    optimizer = optim.Adam(model.parameters(),
                           lr=LEARNING_RATE,
                           betas=(0.9, 0.999),
                           eps=1e-08,
                           weight_decay=0,
                           amsgrad=False)
    print()

    # ======================
    # training and testing
    # ======================
    best_loss = 1000000
    step = 0
    if START_EPOCH > 0:
        model.load_state_dict(
            torch.load('models/' + DATASET + '-' + str(START_EPOCH) + '.pkl',
                       map_location=device))
    for epoch in range(START_EPOCH + 1, EPOCH + 1):
        train_g = train_generator.build_generator(BATCH_SIZE)
        test_g = test_generator.build_generator(BATCH_SIZE)
        train_loss = []
        model.train()
        while True:
            try:
                text = train_g.__next__()
            except StopIteration:  # end of the epoch's data
                break
            optimizer.zero_grad()
            text_in = text[:, :-1]
            text_target = text[:, 1:]
            y = model(torch.from_numpy(text_in).long().to(device))
            loss = criterion(
                y.reshape(-1, vocabulary.vocab_size),
                torch.from_numpy(text_target).reshape(-1).long().to(device))
            loss.backward()
            optimizer.step()
            train_loss.append(loss.item())
            step += 1
            torch.cuda.empty_cache()

            if step % PRINT_EVERY == 0:
                print('step {:d} training loss {:.4f}'.format(
                    step, loss.item()))

        test_loss = []
        model.eval()
        with torch.no_grad():
            while True:
                try:
                    text = test_g.__next__()
                except StopIteration:  # end of the epoch's data
                    break
                text_in = text[:, :-1]
                text_target = text[:, 1:]
                y = model(torch.from_numpy(text_in).long().to(device))
                loss = criterion(
                    y.reshape(-1, vocabulary.vocab_size),
                    torch.from_numpy(text_target).reshape(-1).long().to(
                        device))
                test_loss.append(loss.item())
                torch.cuda.empty_cache()

        print('epoch {:d}   training loss {:.4f}    test loss {:.4f}'.format(
            epoch, np.mean(train_loss), np.mean(test_loss)))

        if np.mean(test_loss) < best_loss:
            best_loss = np.mean(test_loss)
            print('-----------------------------------------------------')
            print('saving parameters')
            os.makedirs('models', exist_ok=True)
            torch.save(model.state_dict(),
                       'models/' + DATASET + '-' + str(epoch) + '.pkl')
            print('-----------------------------------------------------')

        if (epoch + 1) % GENERATE_EVERY == 0:
            model.eval()
            with torch.no_grad():
                # generating text
                x = torch.LongTensor([[vocabulary.w2i['_BOS']]] * 3).to(device)
                for i in range(MAX_GENERATE_LENGTH):
                    samp = model.sample(x)
                    x = torch.cat([x, samp], dim=1)
                x = x.cpu().numpy()
            print('-----------------------------------------------------')
            for i in range(x.shape[0]):
                print(' '.join([
                    vocabulary.i2w[_] for _ in list(x[i, :]) if _ not in [
                        vocabulary.w2i['_BOS'], vocabulary.w2i['_EOS'],
                        vocabulary.w2i['_PAD']
                    ]
                ]))
            print('-----------------------------------------------------')
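For reference, this script relies on imports that the snippet does not show; a plausible set, assuming lm and utils are the project's own modules:

import os

import numpy as np
import torch
import torch.nn as nn
import torch.optim as optim

import lm
import utils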
Example #10
def main(opt):
    prefix = ['latest_lm']
    dataset = ['train', 'valid']
    data, vocab = load_datasets(opt, dataset=dataset)
    logger.info('Loading data completed')
    opt.vocab_size = vocab.vocab_size
    init_scale = opt.init_scale
    logger.debug('Starting session...')
    sess_config = common_utils.get_tf_sess_config(opt)
    with tf.Session(config=sess_config) as sess:
        logger.debug('- Creating initializer ({} to {})'.format(
            -init_scale, init_scale))
        initializer = tf.random_uniform_initializer(-init_scale, init_scale)
        logger.debug('- Creating training model...')
        with tf.variable_scope('LM', reuse=None, initializer=initializer):
            model = lm.LM(opt)
            train_op, lr_var = lm.train_op(model, model.opt)
        logger.debug('- Creating validating model (reuse params)...')
        with tf.variable_scope('LM', reuse=True, initializer=initializer):
            vmodel = lm.LM(opt, is_training=False)
        logger.debug('Trainable variables:')
        for v in tf.trainable_variables():
            logger.debug("- {} {} {}".format(v.name, v.get_shape(), v.device))
        logger.info('Initializing variables...')
        sess.run(tf.global_variables_initializer())
        saver = tf.train.Saver()
        states = {}
        for p in prefix:
            states[p] = common_utils.get_initial_training_state()
        states, _ = resume_many_states(opt.output_dir, sess, saver, states,
                                       prefix)
        state = states[prefix[0]]
        state.learning_rate = opt.learning_rate
        logger.info('Start training loop:')
        logger.debug('\n' + common_utils.SUN_BRO())
        for epoch in range(state.epoch, opt.max_epochs):
            epoch_time = time.time()
            state.epoch = epoch
            logger.info("========= Start epoch {} =========".format(epoch + 1))
            sess.run(tf.assign(lr_var, state.learning_rate))
            logger.info("- Traning LM with learning rate {}...".format(
                state.learning_rate))
            train_ppl, _ = run_epoch(sess,
                                     model,
                                     data['train'],
                                     opt,
                                     train_op=train_op)
            logger.info('- Validating LM...')
            valid_ppl, _ = run_epoch(sess, vmodel, data['valid'], opt)
            logger.info('----------------------------------')
            logger.info('LM post epoch routine...')
            done_training = run_post_epoch(train_ppl,
                                           valid_ppl,
                                           state,
                                           opt,
                                           sess=sess,
                                           saver=saver,
                                           best_prefix="best_lm",
                                           latest_prefix="latest_lm")
            logger.info('- Epoch time: {}s'.format(time.time() - epoch_time))
            if done_training:
                break
        logger.info('Done training at epoch {}'.format(state.epoch + 1))
Example #11
def main(opt_lm, opt_dm):
    vocab_emb_path = opt_lm.shared_emb_vocab
    vocab_lm_path = os.path.join(opt_lm.data_dir, opt_lm.vocab_file)
    train_lm_path = os.path.join(opt_lm.data_dir, opt_lm.train_file)
    valid_lm_path = os.path.join(opt_lm.data_dir, opt_lm.valid_file)
    vocab_dm_path = os.path.join(opt_dm.data_dir, opt_dm.vocab_file)
    train_dm_path = os.path.join(opt_dm.data_dir, opt_dm.train_file)
    dm_emb_path = os.path.join(opt_dm.data_dir, 'emb.cpickle')
    logger.info('Loading data set...')
    logger.debug('- Loading vocab shared emb from {}'.format(vocab_emb_path))
    vocab_emb = data_utils.Vocabulary.from_vocab_file(vocab_emb_path)
    logger.debug('-- Shared emb vocab size: {}'.format(vocab_emb.vocab_size))
    logger.debug('- Loading vocab LM from {}'.format(vocab_lm_path))
    vocab_lm = data_utils.Vocabulary.from_vocab_file(vocab_lm_path)
    logger.debug('-- LM vocab size: {}'.format(vocab_lm.vocab_size))
    logger.debug('- Loading vocab DM from {}'.format(vocab_dm_path))
    vocab_dm = data_utils.Vocabulary.from_vocab_file(vocab_dm_path)
    logger.debug('-- DM vocab size: {}'.format(vocab_dm.vocab_size))
    if opt_lm.sen_independent:
        logger.debug('- Loading train LM data from {}'.format(train_lm_path))
        train_lm_iter = data_utils.SentenceIterator(vocab_lm,
                                                    train_lm_path,
                                                    x_vocab=vocab_emb)
        logger.debug('- Loading valid LM data from {}'.format(valid_lm_path))
        valid_lm_iter = data_utils.SentenceIterator(vocab_lm,
                                                    valid_lm_path,
                                                    x_vocab=vocab_emb)
    else:
        logger.debug('- Loading train LM data from {}'.format(train_lm_path))
        train_lm_iter = data_utils.DataIterator(vocab_lm,
                                                train_lm_path,
                                                x_vocab=vocab_emb)
        logger.debug('- Loading valid LM data from {}'.format(valid_lm_path))
        valid_lm_iter = data_utils.DataIterator(vocab_lm,
                                                valid_lm_path,
                                                x_vocab=vocab_emb)
    logger.debug('- Loading train DM data from {}'.format(train_dm_path))
    train_dm_iter = data_utils.SenLabelIterator(vocab_dm,
                                                train_dm_path,
                                                l_vocab=vocab_emb)

    opt_lm.vocab_size = vocab_lm.vocab_size
    opt_dm.vocab_size = vocab_dm.vocab_size
    opt_dm.num_steps = train_dm_iter._max_seq_len

    if opt_lm.shared_emb_lm_logit:
        logger.debug('-- Vocab mask detected, reloading LM data...')
        lm_vocab_mask = data_utils.Vocabulary.create_vocab_mask(
            vocab_lm, vocab_emb)
        if opt_lm.sen_independent:
            train_lm_iter = data_utils.SentenceIterator(
                vocab_emb, train_lm_path)
            valid_lm_iter = data_utils.SentenceIterator(
                vocab_emb, valid_lm_path)
        else:
            train_lm_iter = data_utils.DataIterator(vocab_emb, train_lm_path)
            valid_lm_iter = data_utils.DataIterator(vocab_emb, valid_lm_path)
        opt_lm.vocab_size = vocab_emb.vocab_size
        opt_lm.logit_mask = lm_vocab_mask
    logger.info('Loading data completed')

    init_scale = opt_lm.init_scale
    sess_config = tf.ConfigProto(log_device_placement=False)
    logger.info('Starting TF Session...')
    # with tf.device("/cpu:0"), tf.Session(config=sess_config) as sess:
    with tf.Session(config=sess_config) as sess:
        logger.debug('- Creating initializer ({} to {})'.format(
            -init_scale, init_scale))
        initializer = tf.random_uniform_initializer(-init_scale, init_scale)
        logger.debug('- Creating shared embedding variables...')
        with tf.variable_scope('shared_emb'):
            shared_emb_vars = lm.sharded_variable(
                'emb', [vocab_emb.vocab_size, opt_lm.emb_size],
                opt_lm.num_shards)
        logger.debug('- Loading embedding for DM...')
        with open(dm_emb_path) as ifp:
            emb_array = cPickle.load(ifp)
            dm_emb_init = tf.constant(emb_array, dtype=tf.float32)
        opt_lm.input_emb_vars = shared_emb_vars
        if opt_lm.shared_emb_lm_logit:
            opt_lm.softmax_w_vars = shared_emb_vars
        opt_dm.af_ex_emb_vars = shared_emb_vars
        opt_dm.input_emb_init = dm_emb_init
        opt_dm.input_emb_trainable = False
        logger.debug('- Creating training LM...')
        with tf.variable_scope('LM', reuse=None, initializer=initializer):
            train_lm = lm.LM(opt_lm, create_grads=False)
        logger.debug('- Creating validating LM (reuse params)...')
        with tf.variable_scope('LM', reuse=True, initializer=initializer):
            valid_lm = lm.LM(opt_lm, is_training=False)
        logger.debug('- Creating training DM...')
        with tf.variable_scope('DM', reuse=None, initializer=initializer):
            train_dm = lm.LMwAF(opt_dm, create_grads=False)
        logger.debug('- Creating training operation...')
        train_op, lr_var = get_joint_train_op(train_lm, train_dm, opt_lm,
                                              opt_dm)
        logger.debug('Trainable variables:')
        for v in tf.trainable_variables():
            logger.debug("- {} {} {}".format(v.name, v.get_shape(), v.device))

        logger.info('Initializing variables...')
        sess.run(tf.global_variables_initializer())
        saver = tf.train.Saver()
        state = common_utils.get_initial_training_state()
        state.learning_rate = opt_lm.learning_rate
        state, _ = resume_if_possible(opt_lm, sess, saver, state)
        logger.info('Start training loop:')
        logger.debug('\n' + common_utils.SUN_BRO())
        global_steps = 0
        train_dm_ppl = 0
        for epoch in range(state.epoch, opt_lm.max_epochs):
            epoch_time = time.time()
            logger.info("========= Start epoch {} =========".format(epoch + 1))
            sess.run(tf.assign(lr_var, state.learning_rate))
            logger.info("- Learning rate = {}".format(state.learning_rate))
            logger.info("Traning...")
            train_lm_ppl, train_dm_ppl, steps = run_joint_epoch(
                sess, train_lm, train_dm, train_lm_iter, train_dm_iter, opt_lm,
                train_op)
            global_steps += steps
            logger.info("Validating LM...")
            valid_lm_ppl, vsteps = run_epoch(sess, valid_lm, valid_lm_iter,
                                             opt_lm)
            logger.info('DM PPL = {}, Train ppl = {}, Valid ppl = {}'.format(
                train_dm_ppl, train_lm_ppl, valid_lm_ppl))
            logger.info('----------------------------------')
            logger.info('Post epoch routine...')
            state.epoch = epoch + 1
            state.val_ppl = valid_lm_ppl
            if valid_lm_ppl < state.best_val_ppl:
                logger.info('- Best PPL: {} -> {}'.format(
                    state.best_val_ppl, valid_lm_ppl))
                logger.info('- Epoch: {} -> {}'.format(state.best_epoch + 1,
                                                       epoch + 1))
                state.best_val_ppl = valid_lm_ppl
                state.best_epoch = epoch
                ckpt_path = os.path.join(opt_lm.output_dir, "best_model.ckpt")
                state_path = os.path.join(opt_lm.output_dir, "best_state.json")
                logger.info('- Saving best model to {}'.format(ckpt_path))
                saver.save(sess, ckpt_path)
                with open(state_path, 'w') as ofp:
                    json.dump(vars(state), ofp)
            else:
                logger.info('- No improvement!')
            done_training = update_lr(opt_lm, state)
            ckpt_path = os.path.join(opt_lm.output_dir, "latest_model.ckpt")
            state_path = os.path.join(opt_lm.output_dir, "latest_state.json")
            logger.info('End of epoch {}: '.format(epoch + 1))
            logger.info('- Saving model to {}'.format(ckpt_path))
            logger.info('- Epoch time: {}s'.format(time.time() - epoch_time))
            saver.save(sess, ckpt_path)
            with open(state_path, 'w') as ofp:
                json.dump(vars(state), ofp)
            if done_training:
                break
            logger.debug('Updated state:\n{}'.format(state.__repr__()))
        logger.info('Done training at epoch {}'.format(state.epoch + 1))
Example #12
def main(lm_opt, dm_opt):
    prefix = ['latest_lm', 'latest_dm']
    dataset = ['train', 'valid']
    shortlist_vocab_path = lm_opt.shortlist_path
    dm_emb_path = os.path.join(dm_opt.data_dir, 'emb.cpickle')
    logger.debug(
        '- Loading shortlist vocab from {}'.format(shortlist_vocab_path))
    short_vocab = data_utils.Vocabulary.from_vocab_file(shortlist_vocab_path)
    logger.debug('-- Shortlist vocab size: {}'.format(short_vocab.vocab_size))
    lm_data, lm_vocab = load_datasets(lm_opt,
                                      dataset=dataset,
                                      y_vocab=short_vocab)
    dm_data, dm_vocab = load_datasets(
        dm_opt,
        dataset=dataset,
        iterator_type=data_utils.SenLabelIterator,
        l_vocab=short_vocab)
    lm_opt.vocab_size = lm_vocab.vocab_size
    dm_opt.vocab_size = dm_vocab.vocab_size
    lm_vocab_mask = data_utils.Vocabulary.create_vocab_mask(
        lm_vocab, short_vocab)
    lm_opt.logit_mask = lm_vocab_mask

    logger.info('Loading data completed')

    init_scale = lm_opt.init_scale
    sess_config = common_utils.get_tf_sess_config(lm_opt)
    logger.info('Starting TF Session...')

    with tf.Session(config=sess_config) as sess:
        logger.debug('- Creating initializer ({} to {})'.format(
            -init_scale, init_scale))
        initializer = tf.random_uniform_initializer(-init_scale, init_scale)
        logger.debug('- Creating shared embedding variables...')
        with tf.variable_scope('shared_emb'):
            shared_emb_vars = lm.sharded_variable(
                'emb', [short_vocab.vocab_size, lm_opt.emb_size],
                lm_opt.num_shards)
        logger.debug('- Loading embedding for DM...')
        with open(dm_emb_path) as ifp:
            emb_array = cPickle.load(ifp)
            dm_emb_init = tf.constant(emb_array, dtype=tf.float32)
        lm_opt.softmax_w_vars = shared_emb_vars
        dm_opt.af_ex_emb_vars = shared_emb_vars
        dm_opt.input_emb_init = dm_emb_init
        dm_opt.input_emb_trainable = False
        logger.debug('- Creating training LM...')
        with tf.variable_scope('LM', reuse=None, initializer=initializer):
            lm_train = lm.LM(lm_opt)
            lm_train_op, lm_lr_var = lm.train_op(lm_train, lm_opt)
        logger.debug('- Creating validating LM (reuse params)...')
        with tf.variable_scope('LM', reuse=True, initializer=initializer):
            lm_valid = lm.LM(lm_opt, is_training=False)
        logger.debug('- Creating training DM...')
        with tf.variable_scope('DM', reuse=None, initializer=initializer):
            dm_train = lm.LMwAF(dm_opt)
            dm_train_op, dm_lr_var = lm.train_op(dm_train, dm_opt)
        logger.debug('- Creating validating DM (reuse params)...')
        with tf.variable_scope('DM', reuse=True, initializer=initializer):
            dm_valid = lm.LMwAF(dm_opt, is_training=False)
        logger.debug('Trainable variables:')
        for v in tf.trainable_variables():
            logger.debug("- {} {} {}".format(v.name, v.get_shape(), v.device))
        logger.info('Initializing variables...')
        sess.run(tf.global_variables_initializer())
        saver = tf.train.Saver()
        states = {}
        for p in prefix:
            states[p] = common_utils.get_initial_training_state()
        states, _ = resume_many_states(lm_opt.output_dir, sess, saver, states,
                                       prefix)
        lm_state = states[prefix[0]]
        dm_state = states[prefix[1]]
        lm_state.learning_rate = lm_opt.learning_rate
        dm_state.learning_rate = dm_opt.learning_rate

        logger.info('Start training loop:')
        logger.debug('\n' + common_utils.SUN_BRO())

        for epoch in range(lm_state.epoch, lm_opt.max_epochs):
            epoch_time = time.time()
            logger.info("========= Start epoch {} =========".format(epoch + 1))
            sess.run(tf.assign(lm_lr_var, lm_state.learning_rate))
            sess.run(tf.assign(dm_lr_var, dm_state.learning_rate))
            logger.info("- Traning DM with learning rate {}...".format(
                dm_state.learning_rate))
            dm_train_ppl, _ = run_epoch(sess,
                                        dm_train,
                                        dm_data['train'],
                                        dm_opt,
                                        train_op=dm_train_op)
            logger.info('- Validating DM...')
            dm_valid_ppl, _ = run_epoch(sess, dm_valid, dm_data['valid'],
                                        dm_opt)
            logger.info("- Traning LM with learning rate {}...".format(
                lm_state.learning_rate))
            lm_train_ppl, _ = run_epoch(sess,
                                        lm_train,
                                        lm_data['train'],
                                        lm_opt,
                                        train_op=lm_train_op)
            logger.info('- Validating LM...')
            lm_valid_ppl, _ = run_epoch(sess, lm_valid, lm_data['valid'],
                                        lm_opt)
            logger.info('----------------------------------')
            logger.info('LM post epoch routine...')
            done_training = run_post_epoch(lm_train_ppl,
                                           lm_valid_ppl,
                                           lm_state,
                                           lm_opt,
                                           sess=sess,
                                           saver=saver,
                                           best_prefix="best_lm",
                                           latest_prefix="latest_lm")
            logger.info('----------------------------------')
            logger.info('DM post epoch routine...')
            run_post_epoch(dm_train_ppl,
                           dm_valid_ppl,
                           dm_state,
                           dm_opt,
                           best_prefix="best_dm",
                           latest_prefix="latest_dm")
            logger.info('- Epoch time: {}s'.format(time.time() - epoch_time))
            if done_training:
                break
        logger.info('Done training at epoch {}'.format(lm_state.epoch + 1))
Example #13
def main():
    # ======================
    # hyperparameters
    # ======================
    CELL = "lstm"                   # rnn, gru, lstm
    DATASET = 'movie'               # movie, news, tweet
    WORD_DROP = 10
    MIN_LEN = 5
    MAX_LEN = 200
    EMBED_SIZE = 350
    HIDDEN_DIM = 512
    NUM_LAYERS = 2
    DROPOUT_RATE = 0.0
    MAX_GENERATE_LENGTH = 200
    GENERATE_NUM = 1000

    if DATASET == 'movie':
        LOAD_EPOCH = 6
    elif DATASET == 'news':
        LOAD_EPOCH = 10
    elif DATASET == 'tweet':
        LOAD_EPOCH = 6
    else:
        raise Exception

    all_var = locals()
    print()
    for var in all_var:
        if var != "var_name":
            print("{0:15}   ".format(var), all_var[var])
    print()

    # ======================
    # data
    # ======================
    data_path = '../../data/' + DATASET + '2020.txt'
    vocabulary = utils.Vocabulary(
        data_path,
        max_len=MAX_LEN,
        min_len=MIN_LEN,
        word_drop=WORD_DROP
    )

    # ======================
    # building model
    # ======================
    device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
    print(device)
    model = lm.LM(
        cell=CELL,
        vocab_size=vocabulary.vocab_size,
        embed_size=EMBED_SIZE,
        hidden_dim=HIDDEN_DIM,
        num_layers=NUM_LAYERS,
        dropout_rate=DROPOUT_RATE
    )
    model.to(device)
    total_params = sum(p.numel() for p in model.parameters())
    print("Total params: {:d}".format(total_params))
    total_trainable_params = sum(p.numel() for p in model.parameters() if p.requires_grad)
    print("Trainable params: {:d}".format(total_trainable_params))
    model.load_state_dict(torch.load('../../models/' + DATASET + '-' + str(LOAD_EPOCH) + '.pkl', map_location=device))
    print('checkpoint loaded')
    print()

    # ======================
    # start steganography
    # ======================
    # read bit streams
    with open('../../bit_stream/bit_stream.txt', 'r', encoding='utf8') as f:
        bit_stream = f.read().strip()
        bit_stream += bit_stream
    bit_index = int(torch.randint(0, high=1000, size=(1,)))

    model.eval()
    with torch.no_grad():
            all_kl = []
            stega_text = []
            stega_bits = []
            while len(stega_text) < 10:
                stega_sentence = []
                stega_bit = ''
                x = torch.LongTensor([[vocabulary.w2i['_BOS']]]).to(device)
                samp = model.sample(x)
                stega_sentence.append(vocabulary.i2w[samp.reshape(-1).cpu().numpy()[0]])
                x = torch.cat([x, samp], dim=1)

                for i in range(MAX_GENERATE_LENGTH - 1):
                    print(len(stega_text), i)
                    if '_EOS' in stega_sentence:
                        break
                    # conditional probability distribution
                    logits = model(x, logits=True)
                    logits = logits[:, -1, :].reshape(-1)
                    logits[1] = -1e20
                    log_prob = F.log_softmax(logits, dim=-1)
                    prob = F.softmax(logits, dim=-1)

                    def split2(prob, indices, k):
                        prob = prob / prob.sum()
                        if prob[0] > 0.5:
                            return None
                        bit = 1
                        while (1 / 2 ** (bit + 1)) > prob[0]:
                            bit += 1
                        mean = 1 / 2 ** bit
                        # dp
                        prob = prob.tolist()
                        indices = indices.tolist()
                        result = []
                        for i in range(2 ** bit):
                            result.append([[], [], []])
                        for i in range(2 ** bit - 1):
                            result[i][0].append(prob[0])
                            result[i][1].append(indices[0])
                            result[i][2] = k + bit
                            del (prob[0])
                            del (indices[0])
                            while sum(result[i][0]) < mean:
                                delta = mean - sum(result[i][0])
                                index = near(prob, delta)
                                if prob[index] - delta < delta:
                                    result[i][0].append(prob[index])
                                    result[i][1].append(indices[index])
                                    del (prob[index])
                                    del (indices[index])
                                else:
                                    break
                            mean = sum(prob) / (2 ** bit - i - 1)
                        result[2 ** bit - 1][0].extend(prob)
                        result[2 ** bit - 1][1].extend(indices)
                        result[2 ** bit - 1][2] = k + bit
                        return result

                    queue = collections.deque()
                    queue.append((torch.LongTensor(list(range(prob.shape[0]))), 0))
                    f = []
                    while len(queue) > 0:
                        indices, k = queue.popleft()
                        p = prob[indices]
                        p, ind = p.sort(descending=True)
                        indices = indices[ind]
                        result = split2(p, indices, k)
                        if result is None:
                            f.append((indices, k))
                        else:
                            for _ in range(len(result)):
                                queue.append((torch.LongTensor(result[_][1]), result[_][2]))

                    q = prob.clone()
                    q[:] = 0
                    for ind, k in f:
                        q[ind] = prob[ind] / prob[ind].sum() * 0.5**k

                    prob, indices = prob.sort(descending=True)
                    # start recursion
                    bit_tmp = 0
                    while prob[0] <= 0.5:
                        # embedding bit
                        bit = 1
                        while (1 / 2 ** (bit + 1)) > prob[0]:
                            bit += 1
                        mean = 1 / 2 ** bit
                        # dp
                        prob = prob.tolist()
                        indices = indices.tolist()
                        result = []
                        for i in range(2 ** bit):
                            result.append([[], []])
                        for i in range(2 ** bit - 1):
                            result[i][0].append(prob[0])
                            result[i][1].append(indices[0])
                            del (prob[0])
                            del (indices[0])
                            while sum(result[i][0]) < mean:
                                delta = mean - sum(result[i][0])
                                index = near(prob, delta)
                                if prob[index] - delta < delta:
                                    result[i][0].append(prob[index])
                                    result[i][1].append(indices[index])
                                    del (prob[index])
                                    del (indices[index])
                                else:
                                    break
                            mean = sum(prob) / (2 ** bit - i - 1)
                        result[2 ** bit - 1][0].extend(prob)
                        result[2 ** bit - 1][1].extend(indices)
                        # read secret message
                        bit_embed = [int(_) for _ in bit_stream[bit_index + bit_tmp:bit_index + bit_tmp + bit]]
                        int_embed = bits2int(bit_embed)
                        # updating
                        prob = torch.FloatTensor(result[int_embed][0]).to(device)
                        indices = torch.LongTensor(result[int_embed][1]).to(device)
                        prob = prob / prob.sum()
                        prob, _ = prob.sort(descending=True)
                        indices = indices[_]
                        bit_tmp += bit

                    # terminate
                    gen = int(indices[int(torch.multinomial(prob, 1))])
                    stega_sentence += [vocabulary.i2w[gen]]

                    if vocabulary.i2w[gen] == '_EOS':
                        break
                    x = torch.cat([x, torch.LongTensor([[gen]]).to(device)], dim=1).to(device)
                    stega_bit += bit_stream[bit_index:bit_index + bit_tmp]
                    bit_index += bit_tmp
                    all_kl.append(kl(q, q.log(), log_prob))

                # check
                if '_EOS' in stega_sentence:
                    stega_sentence.remove('_EOS')
                if (len(stega_sentence) <= MAX_LEN) and (len(stega_sentence) >= MIN_LEN):
                    stega_text.append(stega_sentence)
                    stega_bits.append(stega_bit)

            print(np.mean(all_kl))
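This example (and #15, which reuses near and bits2int) calls helper functions near(), bits2int() and kl() that the snippets do not include; a minimal sketch of plausible implementations, assuming near returns the index of the value closest to delta, bits2int reads the bit list most-significant bit first, and kl computes KL(q || p) over the support of q:

def near(prob, delta):
    # index of the probability closest to the remaining mass delta (assumption)
    return min(range(len(prob)), key=lambda i: abs(prob[i] - delta))

def bits2int(bits):
    # interpret a list of 0/1 ints as an unsigned integer, MSB first (assumption)
    out = 0
    for b in bits:
        out = (out << 1) | b
    return out

def kl(q, logq, logp):
    # KL(q || p), skipping zero-probability entries of q (assumption)
    mask = q > 0
    return (q[mask] * (logq[mask] - logp[mask])).sum().item()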
Example #14
def read_lm(src):
    return lm.LM(src)
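A hypothetical call, assuming lm.LM's constructor accepts whatever src points to (the path below is illustrative only):

model = read_lm('path/to/model')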
Example #15
def main():
    # ======================
    # hyper-parameters
    # ======================
    CELL = "lstm"  # rnn, gru, lstm
    DATASET = 'tweet'  # movie, news, tweet
    WORD_DROP = 10
    MIN_LEN = 5
    MAX_LEN = 200
    EMBED_SIZE = 350
    HIDDEN_DIM = 512
    NUM_LAYERS = 2
    DROPOUT_RATE = 0.0
    MAX_GENERATE_LENGTH = 200
    GENERATE_NUM = 1000

    if DATASET == 'movie':
        LOAD_EPOCH = 6
    elif DATASET == 'news':
        LOAD_EPOCH = 10
    elif DATASET == 'tweet':
        LOAD_EPOCH = 6
    else:
        raise Exception

    all_var = locals()
    print()
    for var in all_var:
        if var != "var_name":
            print("{0:15}   ".format(var), all_var[var])
    print()

    # ======================
    # data
    # ======================
    data_path = 'data/' + DATASET + '2020.txt'
    vocabulary = utils.Vocabulary(data_path,
                                  max_len=MAX_LEN,
                                  min_len=MIN_LEN,
                                  word_drop=WORD_DROP)

    # ======================
    # building model
    # ======================
    device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
    print(device)
    model = lm.LM(cell=CELL,
                  vocab_size=vocabulary.vocab_size,
                  embed_size=EMBED_SIZE,
                  hidden_dim=HIDDEN_DIM,
                  num_layers=NUM_LAYERS,
                  dropout_rate=DROPOUT_RATE)
    model.to(device)
    total_params = sum(p.numel() for p in model.parameters())
    print("Total params: {:d}".format(total_params))
    total_trainable_params = sum(p.numel() for p in model.parameters()
                                 if p.requires_grad)
    print("Trainable params: {:d}".format(total_trainable_params))
    model.load_state_dict(
        torch.load('models/' + DATASET + '-' + str(LOAD_EPOCH) + '.pkl',
                   map_location=device))
    print('checkpoint loaded')
    print()

    # ======================
    # starting steganography
    # ======================
    os.makedirs('stego/' + DATASET, exist_ok=True)
    # read bit streams
    with open('bit_stream/bit_stream.txt', 'r', encoding='utf8') as f:
        bit_stream = f.read().strip()
        bit_stream += bit_stream
    bit_index = int(torch.randint(0, high=1000, size=(1, )))

    model.eval()
    with torch.no_grad():
        stega_text = []
        stega_bits = []
        while len(stega_text) < GENERATE_NUM:
            print(len(stega_text))
            stega_sentence = []
            stega_bit = ''
            x = torch.LongTensor([[vocabulary.w2i['_BOS']]]).to(device)
            samp = model.sample(x)
            stega_sentence.append(
                vocabulary.i2w[samp.reshape(-1).cpu().numpy()[0]])
            x = torch.cat([x, samp], dim=1)

            for i in range(MAX_GENERATE_LENGTH - 1):
                if '_EOS' in stega_sentence:
                    break
                # conditional probability distribution
                log_prob = model(x)
                prob = torch.exp(log_prob)[:, -1, :].reshape(-1)
                prob[1] = 0  # set unk to zero
                prob = prob / prob.sum()
                prob, indices = prob.sort(descending=True)
                # start recursion
                bit_tmp = 0
                while prob[0] <= 0.5:
                    # embedding bit
                    bit = 1
                    while (1 / 2**(bit + 1)) > prob[0]:
                        bit += 1
                    mean = 1 / 2**bit
                    # dp
                    prob = prob.tolist()
                    indices = indices.tolist()
                    result = []
                    for i in range(2**bit):
                        result.append([[], []])
                    for i in range(2**bit - 1):
                        result[i][0].append(prob[0])
                        result[i][1].append(indices[0])
                        del (prob[0])
                        del (indices[0])
                        while sum(result[i][0]) < mean:
                            delta = mean - sum(result[i][0])
                            index = near(prob, delta)
                            if prob[index] - delta < delta:
                                result[i][0].append(prob[index])
                                result[i][1].append(indices[index])
                                del (prob[index])
                                del (indices[index])
                            else:
                                break
                        mean = sum(prob) / (2**bit - i - 1)
                    result[2**bit - 1][0].extend(prob)
                    result[2**bit - 1][1].extend(indices)
                    # read secret message
                    bit_embed = [
                        int(_)
                        for _ in bit_stream[bit_index + bit_tmp:bit_index +
                                            bit_tmp + bit]
                    ]
                    int_embed = bits2int(bit_embed)
                    # updating
                    prob = torch.FloatTensor(result[int_embed][0]).to(device)
                    indices = torch.LongTensor(result[int_embed][1]).to(device)
                    prob = prob / prob.sum()
                    prob, _ = prob.sort(descending=True)
                    indices = indices[_]
                    bit_tmp += bit

                # terminate
                gen = int(indices[int(torch.multinomial(prob, 1))])
                stega_sentence += [vocabulary.i2w[gen]]

                if vocabulary.i2w[gen] == '_EOS':
                    break
                x = torch.cat([x, torch.LongTensor([[gen]]).to(device)],
                              dim=1).to(device)
                stega_bit += bit_stream[bit_index:bit_index + bit_tmp]
                bit_index += bit_tmp

            # check
            if '_EOS' in stega_sentence:
                stega_sentence.remove('_EOS')
            if (len(stega_sentence) <= MAX_LEN) and (len(stega_sentence) >=
                                                     MIN_LEN):
                stega_text.append(stega_sentence)
                stega_bits.append(stega_bit)

        # write files
        with open('stego/' + DATASET + '/adg.txt', 'w', encoding='utf8') as f:
            for sentence in stega_text:
                f.write(' '.join(sentence) + '\n')
        with open('stego/' + DATASET + '/adg.bit', 'w', encoding='utf8') as f:
            for bits in stega_bits:
                f.write(bits + '\n')
Example #16
    def load(self):
        # pickle, L (the lm module) and serializers are assumed to be imported elsewhere
        self.ds = pickle.load(open(self.datasetPath, 'rb'))
        self.lm = L.LM(
            len(self.ds.char2id) + 2, self.ds.id2char, self.charVecPath,
            self.uniProbDictPath)
        serializers.load_npz(self.modelPath, self.lm)