def load_data(self):
    if len(self.data_dir) == 0:
        raise Exception(
            'Please specify path to data directory in gan_language.py!')

    # Load data
    self.lines, self.charmap, self.inv_charmap = language_helpers.load_dataset(
        max_length=self.seq_len,
        max_n_examples=self.max_n_examples,
        data_dir=self.data_dir)
def load_dataset(b_lines=True, b_charmap=True, b_inv_charmap=True,
                 seq_length=32, n_examples=10000000, tokenize=False,
                 pad=True, dataset='training'):
    LINES_FN = 'lines_%s_%s.pkl' % (seq_length, tokenize)
    if dataset != 'training':
        LINES_FN = dataset + '_' + LINES_FN
    LINES_PKL_PATH = PICKLE_PATH + '/' + LINES_FN

    if PICKLE_PATH is not None and PICKLE_LOAD is True and (
            b_lines is False or (b_lines and os.path.exists(LINES_PKL_PATH))) \
            and (b_charmap is False or (b_charmap and os.path.exists(CHARMAP_PKL_PATH))) and \
            (b_inv_charmap is False or (b_inv_charmap and os.path.exists(INV_CHARMAP_PKL_PATH))):
        print("Loading lines, charmap, inv_charmap from pickle files")
        lines, charmap, inv_charmap = load_dataset_from_pkl(
            b_lines=b_lines,
            b_charmap=b_charmap,
            b_inv_charmap=b_inv_charmap,
            lines_pkl_path=LINES_PKL_PATH)
    else:
        print(
            "Loading lines, charmap, inv_charmap from Dataset & Saving to pickle"
        )
        lines, charmap, inv_charmap = language_helpers.load_dataset(
            max_length=seq_length,
            max_n_examples=n_examples,
            data_dir=DATA_DIR,
            tokenize=tokenize,
            pad=pad,
            dataset=dataset)

        # save to pkl
        if not os.path.isdir(PICKLE_PATH):
            os.mkdir(PICKLE_PATH)
        if b_lines:
            save_picklized(lines, LINES_PKL_PATH)
        if b_charmap:
            save_picklized(charmap, CHARMAP_PKL_PATH)
        if b_inv_charmap:
            save_picklized(inv_charmap, INV_CHARMAP_PKL_PATH)

    return lines, charmap, inv_charmap
def main():
    data_dir = './data/1-billion-word-language-modeling-benchmark-r13output/'
    batch_size = 64
    iters = 100000
    seq_len = 32
    dim = 512
    critic_iters = 5
    gp_scale = 10
    max_n_examples = 10000000
    sn = False

    lines, charmap, inv_charmap = language_helpers.load_dataset(
        max_length=seq_len, max_n_examples=max_n_examples, data_dir=data_dir)
    dgen = inf_train_gen(lines, charmap, batch_size)

    G = networks.TextGenerator(len(charmap))
    D = networks.TextDiscriminator(len(charmap), sn=sn)

    ngrams = [4]
    true_char_ngram_lms = [
        language_helpers.NgramLanguageModel(n, lines[10 * batch_size:],
                                            tokenize=False) for n in ngrams
    ]
    validation_char_ngram_lms = [
        language_helpers.NgramLanguageModel(n, lines[:10 * batch_size],
                                            tokenize=False) for n in ngrams
    ]
    for i, n in enumerate(ngrams):
        print("validation set JSD for n={}: {:.4f}".format(
            n, true_char_ngram_lms[i].js_with(validation_char_ngram_lms[i])))
    true_char_ngram_lms = [
        language_helpers.NgramLanguageModel(n, lines, tokenize=False)
        for n in ngrams
    ]

    train(G, D, dgen, iters, seq_len, batch_size, critic_iters, len(charmap),
          inv_charmap, ngrams, true_char_ngram_lms)
SEQ_LEN = 50  # Sequence length in characters
DIM = 512  # Model dimensionality. This is fairly slow and overfits, even on
           # Billion Word. Consider decreasing for smaller datasets.
CRITIC_ITERS = 5  # How many critic iterations per generator iteration. We
                  # use 10 for the results in the paper, but 5 should work fine
                  # as well.
LAMBDA = 10  # Gradient penalty lambda hyperparameter.
MAX_N_EXAMPLES = 14098  # Max number of data examples to load. If data loading
                        # is too slow or takes too much RAM, you can decrease
                        # this (at the expense of having less training data).

lib.print_model_settings(locals().copy())

lines, charmap, inv_charmap = language_helpers.load_dataset(
    max_length=SEQ_LEN,
    max_n_examples=MAX_N_EXAMPLES,
    data_dir=DATA_DIR
)

def softmax(logits):
    return tf.reshape(
        tf.nn.softmax(
            tf.reshape(logits, [-1, len(charmap)])
        ),
        tf.shape(logits)
    )

def make_noise(shape):
    return tf.random_normal(shape)

def ResBlock(name, inputs):
DIM = 512  # Model dimensionality. This is fairly slow and overfits, even on
           # Billion Word. Consider decreasing for smaller datasets.
CRITIC_ITERS = 10  # How many critic iterations per generator iteration. We
                   # use 10 for the results in the paper, but 5 should work fine
                   # as well.
LAMBDA = 10  # Gradient penalty lambda hyperparameter.
MAX_N_EXAMPLES = 10000000  # Max number of data examples to load. If data loading
                           # is too slow or takes too much RAM, you can decrease
                           # this (at the expense of having less training data).

lib.print_model_settings(locals().copy())

lines, charmap, inv_charmap = language_helpers.load_dataset(
    max_length=SEQ_LEN,
    max_n_examples=MAX_N_EXAMPLES,
    data_dir=DATA_DIR
)

table = np.arange(len(charmap)).reshape(-1, 1)
one_hot = OneHotEncoder()
one_hot.fit(table)

# ==================Definition Start======================

def make_noise(shape, volatile=False):
    tensor = torch.randn(shape).cuda(gpu) if use_cuda else torch.randn(shape)
    return autograd.Variable(tensor, volatile)

class ResBlock(nn.Module):
#SEQ_LEN = 32 # Sequence length in characters
SEQ_LEN = 44  # Sequence length in characters
DIM = 512  # Model dimensionality. This is fairly slow and overfits, even on
           # Billion Word. Consider decreasing for smaller datasets.
CRITIC_ITERS = 10  # How many critic iterations per generator iteration. We
                   # use 10 for the results in the paper, but 5 should work fine
                   # as well.
LAMBDA = 10  # Gradient penalty lambda hyperparameter.
MAX_N_EXAMPLES = 10000000  # Max number of data examples to load. If data loading
                           # is too slow or takes too much RAM, you can decrease
                           # this (at the expense of having less training data).

lib.print_model_settings(locals().copy())

lines, charmap, inv_charmap = language_helpers.load_dataset(
    max_length=SEQ_LEN,
    max_n_examples=MAX_N_EXAMPLES,
    data_file=args.input_text)

def softmax(logits):
    return tf.reshape(tf.nn.softmax(tf.reshape(logits, [-1, len(charmap)])),
                      tf.shape(logits))

def make_noise(shape):
    return tf.random_normal(shape)

def ResBlock(name, inputs):
    output = inputs
    output = tf.nn.relu(output)
def load_dataset(b_lines=True, b_charmap=True, b_inv_charmap=True,
                 seq_length=32, n_examples=10000000, tokenize=False,
                 pad=True, dataset='training'):
    LINES_FN = 'lines_%s_%s.pkl' % (seq_length, tokenize)
    if dataset != 'training':
        LINES_FN = dataset + '_' + LINES_FN
    LINES_PKL_PATH = PICKLE_PATH + '/' + LINES_FN

    if PICKLE_PATH is not None and PICKLE_LOAD is True and (
            b_lines is False or (b_lines and os.path.exists(LINES_PKL_PATH))) \
            and (b_charmap is False or (b_charmap and os.path.exists(CHARMAP_PKL_PATH))) and \
            (b_inv_charmap is False or (b_inv_charmap and os.path.exists(INV_CHARMAP_PKL_PATH))):
        print("Loading lines, charmap, inv_charmap from pickle files")
        lines, charmap, inv_charmap = load_dataset_from_pkl(
            b_lines=b_lines,
            b_charmap=b_charmap,
            b_inv_charmap=b_inv_charmap,
            lines_pkl_path=LINES_PKL_PATH)
    else:
        print(
            "Loading lines, charmap, inv_charmap from Dataset & Saving to pickle"
        )
        dataset_name = os.path.basename(DATA_DIR)
        if dataset_name.startswith(
                '1-billion-word-language-modeling-benchmark'):
            lines, charmap, inv_charmap = language_helpers.load_dataset(
                max_length=seq_length,
                max_n_examples=n_examples,
                data_dir=DATA_DIR,
                tokenize=tokenize,
                pad=pad,
                dataset=dataset)
        elif dataset_name.startswith('text8'):
            lines, charmap, inv_charmap = language_helpers.load_dataset_text8(
                max_length=seq_length,
                max_n_examples=n_examples,
                data_dir=DATA_DIR,
                tokenize=tokenize,
                pad=pad,
                dataset=dataset)
        else:
            raise TypeError(
                "currently supporting {1-billion-word-language-modeling-benchmark,text8}"
            )

        # save to pkl
        if not os.path.isdir(PICKLE_PATH):
            os.mkdir(PICKLE_PATH)
        if b_lines:
            save_picklized(lines, LINES_PKL_PATH)
        if b_charmap:
            save_picklized(charmap, CHARMAP_PKL_PATH)
        if b_inv_charmap:
            save_picklized(inv_charmap, INV_CHARMAP_PKL_PATH)

    return lines, charmap, inv_charmap
                    action='store_true',
                    help='decay the learning rate if no improvement seen.')
opt = parser.parse_args()

DIM = opt.dim if opt.dim else 64  # Model dimensionality. This is fairly slow and overfits,
                                  # even on Billion Word. Consider decreasing for smaller datasets.
BATCH_SIZE = opt.batch_size if opt.batch_size else 64  # Batch size
SEQ_LEN = opt.seq_len if opt.seq_len else 20  # Sequence length in characters
VOCAB_SIZE = opt.vocab_size if opt.vocab_size else 4096  # Vocabulary size
NUM_TONES = opt.num_tones if opt.num_tones else 5

lib.print_model_settings(locals().copy())

lines, charmap, inv_charmap = language_helpers.load_dataset(
    max_length=SEQ_LEN,
    max_n_examples=MAX_N_EXAMPLES,
    max_vocab_size=VOCAB_SIZE,
    data_dir=opt.data_dir)
tones, tonemap, inv_tonemap = language_helpers.load_tones(
    max_length=SEQ_LEN,
    max_n_examples=MAX_N_EXAMPLES,
    max_vocab_size=NUM_TONES,
    data_dir=opt.tone_dir)

with open(opt.char_info) as f:
    char2tone = json.load(f)
char_tone_map = language_helpers.get_mask(charmap, tonemap, char2tone)
#OUTPUT_SIZE = len(tonemap) + len(charmap)
def run(dim=512, critic_iters=10, seq_len=32, batch_size=64, iters=200000,
        penalty_weight=10, one_sided=True, max_n_examples=10000000,
        data_dir=''):
    # Download Google Billion Word at http://www.statmt.org/lm-benchmark/ and
    # fill in the path to the extracted files here!
    DATA_DIR = data_dir
    if len(DATA_DIR) == 0:
        raise Exception(
            'Please specify path to data directory in gan_language.py!')

    BATCH_SIZE = batch_size  # Batch size
    ITERS = iters  # How many iterations to train for
    SEQ_LEN = seq_len  # Sequence length in characters
    DIM = dim  # Model dimensionality. This is fairly slow and overfits, even on
               # Billion Word. Consider decreasing for smaller datasets.
    CRITIC_ITERS = critic_iters  # How many critic iterations per generator iteration. We
                                 # use 10 for the results in the paper, but 5 should work fine
                                 # as well.
    LAMBDA = penalty_weight  # Gradient penalty lambda hyperparameter.
    MAX_N_EXAMPLES = max_n_examples  # Max number of data examples to load. If data loading
                                     # is too slow or takes too much RAM, you can decrease
                                     # this (at the expense of having less training data).
    ONE_SIDED = one_sided

    lib.print_model_settings(locals().copy())

    lines, charmap, inv_charmap = language_helpers.load_dataset(
        max_length=SEQ_LEN, max_n_examples=MAX_N_EXAMPLES, data_dir=DATA_DIR)

    def softmax(logits):
        return tf.reshape(
            tf.nn.softmax(tf.reshape(logits, [-1, len(charmap)])),
            tf.shape(logits))

    def make_noise(shape):
        return tf.random_normal(shape)

    def ResBlock(name, inputs):
        output = inputs
        output = tf.nn.relu(output)
        output = lib.ops.conv1d.Conv1D(name + '.1', DIM, DIM, 5, output)
        output = tf.nn.relu(output)
        output = lib.ops.conv1d.Conv1D(name + '.2', DIM, DIM, 5, output)
        return inputs + (0.3 * output)

    def Generator(n_samples, prev_outputs=None):
        output = make_noise(shape=[n_samples, 128])
        output = lib.ops.linear.Linear('Generator.Input', 128, SEQ_LEN * DIM,
                                       output)
        output = tf.reshape(output, [-1, DIM, SEQ_LEN])
        output = ResBlock('Generator.1', output)
        output = ResBlock('Generator.2', output)
        output = ResBlock('Generator.3', output)
        output = ResBlock('Generator.4', output)
        output = ResBlock('Generator.5', output)
        output = lib.ops.conv1d.Conv1D('Generator.Output', DIM, len(charmap),
                                       1, output)
        output = tf.transpose(output, [0, 2, 1])
        output = softmax(output)
        return output

    def Discriminator(inputs):
        output = tf.transpose(inputs, [0, 2, 1])
        output = lib.ops.conv1d.Conv1D('Discriminator.Input', len(charmap),
                                       DIM, 1, output)
        output = ResBlock('Discriminator.1', output)
        output = ResBlock('Discriminator.2', output)
        output = ResBlock('Discriminator.3', output)
        output = ResBlock('Discriminator.4', output)
        output = ResBlock('Discriminator.5', output)
        output = tf.reshape(output, [-1, SEQ_LEN * DIM])
        output = lib.ops.linear.Linear('Discriminator.Output', SEQ_LEN * DIM,
                                       1, output)
        return output

    real_inputs_discrete = tf.placeholder(tf.int32,
                                          shape=[BATCH_SIZE, SEQ_LEN])
    real_inputs = tf.one_hot(real_inputs_discrete, len(charmap))
    fake_inputs = Generator(BATCH_SIZE)
    fake_inputs_discrete = tf.argmax(fake_inputs,
                                     fake_inputs.get_shape().ndims - 1)

    disc_real = Discriminator(real_inputs)
    disc_fake = Discriminator(fake_inputs)

    disc_cost = tf.reduce_mean(disc_fake) - tf.reduce_mean(disc_real)
    gen_cost = -tf.reduce_mean(disc_fake)

    # WGAN lipschitz-penalty
    alpha = tf.random_uniform(shape=[BATCH_SIZE, 1, 1], minval=0., maxval=1.)
    differences = fake_inputs - real_inputs
    interpolates = real_inputs + (alpha * differences)
    gradients = tf.gradients(Discriminator(interpolates), [interpolates])[0]
    slopes = tf.sqrt(
        tf.reduce_sum(tf.square(gradients), reduction_indices=[1, 2]))
    if not ONE_SIDED:
        gradient_penalty = tf.reduce_mean((slopes - 1.)**2)
    else:
        gradient_penalty = tf.reduce_mean(
            tf.clip_by_value(slopes - 1., 0., np.infty)**2)
    disc_cost += LAMBDA * gradient_penalty

    gen_params = lib.params_with_name('Generator')
    disc_params = lib.params_with_name('Discriminator')

    gen_train_op = tf.train.AdamOptimizer(learning_rate=1e-4,
                                          beta1=0.5,
                                          beta2=0.9).minimize(
                                              gen_cost, var_list=gen_params)
    disc_train_op = tf.train.AdamOptimizer(learning_rate=1e-4,
                                           beta1=0.5,
                                           beta2=0.9).minimize(
                                               disc_cost, var_list=disc_params)

    # Dataset iterator
    def inf_train_gen():
        while True:
            np.random.shuffle(lines)
            for i in xrange(0, len(lines) - BATCH_SIZE + 1, BATCH_SIZE):
                yield np.array([[charmap[c] for c in l]
                                for l in lines[i:i + BATCH_SIZE]],
                               dtype='int32')

    # During training we monitor JS divergence between the true & generated ngram
    # distributions for n=1,2,3,4. To get an idea of the optimal values, we
    # evaluate these statistics on a held-out set first.
    true_char_ngram_lms = [
        language_helpers.NgramLanguageModel(i + 1,
                                            lines[10 * BATCH_SIZE:],
                                            tokenize=False) for i in xrange(4)
    ]
    validation_char_ngram_lms = [
        language_helpers.NgramLanguageModel(i + 1,
                                            lines[:10 * BATCH_SIZE],
                                            tokenize=False) for i in xrange(4)
    ]
    for i in xrange(4):
        print "validation set JSD for n={}: {}".format(
            i + 1, true_char_ngram_lms[i].js_with(validation_char_ngram_lms[i]))
    true_char_ngram_lms = [
        language_helpers.NgramLanguageModel(i + 1, lines, tokenize=False)
        for i in xrange(4)
    ]

    with tf.Session() as session:

        session.run(tf.initialize_all_variables())

        def generate_samples():
            samples = session.run(fake_inputs)
            samples = np.argmax(samples, axis=2)
            decoded_samples = []
            for i in xrange(len(samples)):
                decoded = []
                for j in xrange(len(samples[i])):
                    decoded.append(inv_charmap[samples[i][j]])
                decoded_samples.append(tuple(decoded))
            return decoded_samples

        gen = inf_train_gen()

        for iteration in xrange(ITERS):
            start_time = time.time()

            # Train generator
            if iteration > 0:
                _ = session.run(gen_train_op)

            # Train critic
            for i in xrange(CRITIC_ITERS):
                _data = gen.next()
                _disc_cost, _ = session.run(
                    [disc_cost, disc_train_op],
                    feed_dict={real_inputs_discrete: _data})

            lib.plot.plot('time', time.time() - start_time)
            lib.plot.plot('train disc cost', _disc_cost)

            if iteration % 100 == 99:
                samples = []
                for i in xrange(10):
                    samples.extend(generate_samples())

                for i in xrange(4):
                    lm = language_helpers.NgramLanguageModel(i + 1,
                                                             samples,
                                                             tokenize=False)
                    lib.plot.plot('js{}'.format(i + 1),
                                  lm.js_with(true_char_ngram_lms[i]))

                with open('samples_{}.txt'.format(iteration), 'w') as f:
                    for s in samples:
                        s = "".join(s)
                        f.write(s + "\n")

            if iteration % 100 == 99:
                lib.plot.flush()

            lib.plot.tick()