def load_data(self):
        if len(self.data_dir) == 0:
            raise Exception(
                'Please specify path to data directory in gan_language.py!')

        # Load data
        self.lines, self.charmap, self.inv_charmap = language_helpers.load_dataset(
            max_length=self.seq_len,
            max_n_examples=self.max_n_examples,
            data_dir=self.data_dir)
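# For orientation, a minimal sketch (an assumption, not the library code) of how the
# three return values are shaped, inferred from how later examples index them:
# charmap maps a character to an integer id, inv_charmap maps the id back, and each
# line is a fixed-length tuple of characters.
def encode_line(line, charmap):
    # Map characters to integer ids; fall back to the 'unk' id for unseen characters
    # (the 'unk' entry is an assumption about the helper's character map).
    return [charmap.get(c, charmap.get('unk', 0)) for c in line]

def decode_ids(ids, inv_charmap):
    # Map integer ids back to characters and join them into a string.
    return ''.join(inv_charmap[i] for i in ids)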
Example #2
def load_dataset(b_lines=True,
                 b_charmap=True,
                 b_inv_charmap=True,
                 seq_length=32,
                 n_examples=10000000,
                 tokenize=False,
                 pad=True,
                 dataset='training'):
    LINES_FN = 'lines_%s_%s.pkl' % (seq_length, tokenize)
    if dataset != 'training':
        LINES_FN = dataset + '_' + LINES_FN
    LINES_PKL_PATH = PICKLE_PATH + '/' + LINES_FN

    if (PICKLE_PATH is not None and PICKLE_LOAD
            and (not b_lines or os.path.exists(LINES_PKL_PATH))
            and (not b_charmap or os.path.exists(CHARMAP_PKL_PATH))
            and (not b_inv_charmap or os.path.exists(INV_CHARMAP_PKL_PATH))):

        print("Loading lines, charmap, inv_charmap from pickle files")
        lines, charmap, inv_charmap = load_dataset_from_pkl(
            b_lines=b_lines,
            b_charmap=b_charmap,
            b_inv_charmap=b_inv_charmap,
            lines_pkl_path=LINES_PKL_PATH)

    else:
        print(
            "Loading lines, charmap, inv_charmap from Dataset & Saving to pickle"
        )
        lines, charmap, inv_charmap = language_helpers.load_dataset(
            max_length=seq_length,
            max_n_examples=n_examples,
            data_dir=DATA_DIR,
            tokenize=tokenize,
            pad=pad,
            dataset=dataset)

        # save to pkl
        if not os.path.isdir(PICKLE_PATH):
            os.mkdir(PICKLE_PATH)

        if b_lines:
            save_picklized(lines, LINES_PKL_PATH)
        if b_charmap:
            save_picklized(charmap, CHARMAP_PKL_PATH)
        if b_inv_charmap:
            save_picklized(inv_charmap, INV_CHARMAP_PKL_PATH)

    return lines, charmap, inv_charmap
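# The snippet above relies on save_picklized and load_dataset_from_pkl, which are not
# shown. A minimal sketch of what such helpers could look like, assuming they simply
# pickle/unpickle the three objects to the module-level paths used above; the real
# implementations may differ.
import pickle

def save_picklized(obj, path):
    # Serialize one object (lines, charmap or inv_charmap) to the given path.
    with open(path, 'wb') as f:
        pickle.dump(obj, f)

def load_dataset_from_pkl(b_lines=True, b_charmap=True, b_inv_charmap=True,
                          lines_pkl_path=None):
    # Load whichever cached objects were requested; return None for the rest.
    # CHARMAP_PKL_PATH and INV_CHARMAP_PKL_PATH are the module-level paths
    # referenced by the snippet above.
    def _load(path):
        with open(path, 'rb') as f:
            return pickle.load(f)
    lines = _load(lines_pkl_path) if b_lines else None
    charmap = _load(CHARMAP_PKL_PATH) if b_charmap else None
    inv_charmap = _load(INV_CHARMAP_PKL_PATH) if b_inv_charmap else None
    return lines, charmap, inv_charmap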
Example #3
def main():
    data_dir = './data/1-billion-word-language-modeling-benchmark-r13output/'
    batch_size = 64
    iters = 100000
    seq_len = 32
    dim = 512
    critic_iters = 5
    gp_scale = 10
    max_n_examples = 10000000
    sn = False

    lines, charmap, inv_charmap = language_helpers.load_dataset(
        max_length=seq_len, max_n_examples=max_n_examples, data_dir=data_dir)
    dgen = inf_train_gen(lines, charmap, batch_size)
    G = networks.TextGenerator(len(charmap))
    D = networks.TextDiscriminator(len(charmap), sn=sn)
    ngrams = [4]
    true_char_ngram_lms = [
        language_helpers.NgramLanguageModel(n,
                                            lines[10 * batch_size:],
                                            tokenize=False) for n in ngrams
    ]
    validation_char_ngram_lms = [
        language_helpers.NgramLanguageModel(n,
                                            lines[:10 * batch_size],
                                            tokenize=False) for n in ngrams
    ]
    for i, n in enumerate(ngrams):
        print("validation set JSD for n={}: {:.4f}".format(
            n, true_char_ngram_lms[i].js_with(validation_char_ngram_lms[i])))
    true_char_ngram_lms = [
        language_helpers.NgramLanguageModel(n, lines, tokenize=False)
        for n in ngrams
    ]
    train(G, D, dgen, iters, seq_len, batch_size, critic_iters, len(charmap),
          inv_charmap, ngrams, true_char_ngram_lms)
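# main() above calls inf_train_gen(lines, charmap, batch_size), which is not shown in
# this example. A standalone sketch, mirroring the generator closure defined inside
# run() in the last example below; the helper actually used here may differ.
import numpy as np

def inf_train_gen(lines, charmap, batch_size):
    # Endlessly yield (batch_size, seq_len) int32 batches of character ids,
    # reshuffling the dataset at the start of every pass.
    while True:
        np.random.shuffle(lines)
        for i in range(0, len(lines) - batch_size + 1, batch_size):
            yield np.array(
                [[charmap[c] for c in l] for l in lines[i:i + batch_size]],
                dtype='int32')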
Example #4
SEQ_LEN = 50 # Sequence length in characters
DIM = 512 # Model dimensionality. This is fairly slow and overfits, even on
          # Billion Word. Consider decreasing for smaller datasets.
CRITIC_ITERS = 5 # How many critic iterations per generator iteration. We
                  # use 10 for the results in the paper, but 5 should work fine
                  # as well.
LAMBDA = 10 # Gradient penalty lambda hyperparameter.
MAX_N_EXAMPLES = 14098 # Max number of data examples to load. If data loading
                          # is too slow or takes too much RAM, you can decrease
                          # this (at the expense of having less training data).

lib.print_model_settings(locals().copy())

lines, charmap, inv_charmap = language_helpers.load_dataset(
    max_length=SEQ_LEN,
    max_n_examples=MAX_N_EXAMPLES,
    data_dir=DATA_DIR
)

def softmax(logits):
    return tf.reshape(
        tf.nn.softmax(
            tf.reshape(logits, [-1, len(charmap)])
        ),
        tf.shape(logits)
    )

def make_noise(shape):
    return tf.random_normal(shape)

def ResBlock(name, inputs):
    # Residual block: two 5-wide conv layers with ReLUs and a 0.3-scaled skip
    # connection (body as in the full example further below).
    output = inputs
    output = tf.nn.relu(output)
    output = lib.ops.conv1d.Conv1D(name + '.1', DIM, DIM, 5, output)
    output = tf.nn.relu(output)
    output = lib.ops.conv1d.Conv1D(name + '.2', DIM, DIM, 5, output)
    return inputs + (0.3 * output)
Example #5
DIM = 512 # Model dimensionality. This is fairly slow and overfits, even on
          # Billion Word. Consider decreasing for smaller datasets.
CRITIC_ITERS = 10 # How many critic iterations per generator iteration. We
                  # use 10 for the results in the paper, but 5 should work fine
                  # as well.
LAMBDA = 10 # Gradient penalty lambda hyperparameter.
MAX_N_EXAMPLES = 10000000  # Max number of data examples to load. If data loading
                          # is too slow or takes too much RAM, you can decrease
                          # this (at the expense of having less training data).


lib.print_model_settings(locals().copy())

lines, charmap, inv_charmap = language_helpers.load_dataset(
    max_length=SEQ_LEN,
    max_n_examples=MAX_N_EXAMPLES,
    data_dir=DATA_DIR
)

table = np.arange(len(charmap)).reshape(-1, 1)
one_hot = OneHotEncoder()
one_hot.fit(table)
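
# Usage sketch (an addition, not part of the original snippet): applying the fitted
# encoder to a batch of character ids. transform() expects a 2-D array, so the
# (batch_size, seq_len) id batch is flattened to a column, one-hot encoded, and
# reshaped to (batch_size, seq_len, len(charmap)).
def to_one_hot(batch_ids):
    batch_ids = np.asarray(batch_ids)
    flat = batch_ids.reshape(-1, 1)
    encoded = one_hot.transform(flat).toarray()
    return encoded.reshape(batch_ids.shape[0], batch_ids.shape[1], len(charmap))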

# ==================Definition Start======================

def make_noise(shape, volatile=False):
    tensor = torch.randn(shape).cuda(gpu) if use_cuda else torch.randn(shape)
    return autograd.Variable(tensor, volatile=volatile)

class ResBlock(nn.Module):
    # The original snippet is cut off here; the body below is an assumed sketch,
    # mirroring the TensorFlow ResBlock used in the other examples (two 5-wide
    # convolutions with ReLUs and a 0.3-scaled residual connection).
    def __init__(self):
        super(ResBlock, self).__init__()
        self.res_block = nn.Sequential(
            nn.ReLU(True),
            nn.Conv1d(DIM, DIM, 5, padding=2),
            nn.ReLU(True),
            nn.Conv1d(DIM, DIM, 5, padding=2),
        )

    def forward(self, input):
        output = self.res_block(input)
        return input + (0.3 * output)
Example #6
#SEQ_LEN = 32 # Sequence length in characters
SEQ_LEN = 44  # Sequence length in characters
DIM = 512  # Model dimensionality. This is fairly slow and overfits, even on
# Billion Word. Consider decreasing for smaller datasets.
CRITIC_ITERS = 10  # How many critic iterations per generator iteration. We
# use 10 for the results in the paper, but 5 should work fine
# as well.
LAMBDA = 10  # Gradient penalty lambda hyperparameter.
MAX_N_EXAMPLES = 10000000  # Max number of data examples to load. If data loading
# is too slow or takes too much RAM, you can decrease
# this (at the expense of having less training data).

lib.print_model_settings(locals().copy())

lines, charmap, inv_charmap = language_helpers.load_dataset(
    max_length=SEQ_LEN,
    max_n_examples=MAX_N_EXAMPLES,
    data_file=args.input_text)


def softmax(logits):
    return tf.reshape(tf.nn.softmax(tf.reshape(logits, [-1, len(charmap)])),
                      tf.shape(logits))


def make_noise(shape):
    return tf.random_normal(shape)


def ResBlock(name, inputs):
    # Residual block: two 5-wide conv layers with ReLUs and a 0.3-scaled skip
    # connection (body as in the full example further below).
    output = inputs
    output = tf.nn.relu(output)
    output = lib.ops.conv1d.Conv1D(name + '.1', DIM, DIM, 5, output)
    output = tf.nn.relu(output)
    output = lib.ops.conv1d.Conv1D(name + '.2', DIM, DIM, 5, output)
    return inputs + (0.3 * output)
Example #7
def load_dataset(b_lines=True,
                 b_charmap=True,
                 b_inv_charmap=True,
                 seq_length=32,
                 n_examples=10000000,
                 tokenize=False,
                 pad=True,
                 dataset='training'):
    LINES_FN = 'lines_%s_%s.pkl' % (seq_length, tokenize)
    if dataset != 'training':
        LINES_FN = dataset + '_' + LINES_FN
    LINES_PKL_PATH = PICKLE_PATH + '/' + LINES_FN

    if (PICKLE_PATH is not None and PICKLE_LOAD
            and (not b_lines or os.path.exists(LINES_PKL_PATH))
            and (not b_charmap or os.path.exists(CHARMAP_PKL_PATH))
            and (not b_inv_charmap or os.path.exists(INV_CHARMAP_PKL_PATH))):

        print("Loading lines, charmap, inv_charmap from pickle files")
        lines, charmap, inv_charmap = load_dataset_from_pkl(
            b_lines=b_lines,
            b_charmap=b_charmap,
            b_inv_charmap=b_inv_charmap,
            lines_pkl_path=LINES_PKL_PATH)

    else:
        print(
            "Loading lines, charmap, inv_charmap from Dataset & Saving to pickle"
        )

        dataset_name = os.path.basename(DATA_DIR)

        if dataset_name.startswith(
                '1-billion-word-language-modeling-benchmark'):
            lines, charmap, inv_charmap = language_helpers.load_dataset(
                max_length=seq_length,
                max_n_examples=n_examples,
                data_dir=DATA_DIR,
                tokenize=tokenize,
                pad=pad,
                dataset=dataset)
        elif dataset_name.startswith('text8'):
            lines, charmap, inv_charmap = language_helpers.load_dataset_text8(
                max_length=seq_length,
                max_n_examples=n_examples,
                data_dir=DATA_DIR,
                tokenize=tokenize,
                pad=pad,
                dataset=dataset)
        else:
            raise ValueError(
                "Unsupported dataset '%s': currently supported datasets are "
                "1-billion-word-language-modeling-benchmark and text8" %
                dataset_name)

        # save to pkl
        if not os.path.isdir(PICKLE_PATH):
            os.mkdir(PICKLE_PATH)

        if b_lines:
            save_picklized(lines, LINES_PKL_PATH)
        if b_charmap:
            save_picklized(charmap, CHARMAP_PKL_PATH)
        if b_inv_charmap:
            save_picklized(inv_charmap, INV_CHARMAP_PKL_PATH)

    return lines, charmap, inv_charmap
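# language_helpers.load_dataset_text8 is not shown here. A rough, assumed sketch of
# what a text8 loader could look like: text8 is a single long string of lowercase
# letters and spaces, which is cut into fixed-length character sequences and used to
# build a frequency-ordered character map. The real helper may differ (e.g. in how it
# handles tokenize/pad/dataset, which this sketch ignores).
import collections
import os

def load_dataset_text8_sketch(max_length, max_n_examples, data_dir):
    with open(os.path.join(data_dir, 'text8'), 'r') as f:
        text = f.read()
    # Cut the corpus into non-overlapping fixed-length character sequences.
    lines = []
    for i in range(0, len(text) - max_length + 1, max_length):
        lines.append(tuple(text[i:i + max_length]))
        if len(lines) >= max_n_examples:
            break
    # Build the character map from frequency counts, most frequent first,
    # with an 'unk' fallback entry at index 0.
    counts = collections.Counter(c for line in lines for c in line)
    charmap = {'unk': 0}
    inv_charmap = ['unk']
    for char, _ in counts.most_common():
        charmap[char] = len(inv_charmap)
        inv_charmap.append(char)
    return lines, charmap, inv_charmap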
Example #8
                    action='store_true',
                    help='decay the learning rate if no improvement seen.')
opt = parser.parse_args()
DIM = opt.dim if opt.dim else 64
# Model dimensionality. This is fairly slow and overfits, even on
# Billion Word. Consider decreasing for smaller datasets.
BATCH_SIZE = opt.batch_size if opt.batch_size else 64  # Batch size
SEQ_LEN = opt.seq_len if opt.seq_len else 20  # Sequence length in characters
VOCAB_SIZE = opt.vocab_size if opt.vocab_size else 4096  #  Vocabulary size
NUM_TONES = opt.num_tones if opt.num_tones else 5

lib.print_model_settings(locals().copy())

lines, charmap, inv_charmap = language_helpers.load_dataset(
    max_length=SEQ_LEN,
    max_n_examples=MAX_N_EXAMPLES,
    max_vocab_size=VOCAB_SIZE,
    data_dir=opt.data_dir)

tones, tonemap, inv_tonemap = language_helpers.load_tones(
    max_length=SEQ_LEN,
    max_n_examples=MAX_N_EXAMPLES,
    max_vocab_size=NUM_TONES,
    data_dir=opt.tone_dir)

with open(opt.char_info) as f:
    char2tone = json.load(f)

char_tone_map = language_helpers.get_mask(charmap, tonemap, char2tone)

#OUTPUT_SIZE = len(tonemap) + len(charmap)
Example #9
def run(dim=512,
        critic_iters=10,
        seq_len=32,
        batch_size=64,
        iters=200000,
        penalty_weight=10,
        one_sided=True,
        max_n_examples=10000000,
        data_dir=''):
    # Download Google Billion Word at http://www.statmt.org/lm-benchmark/ and
    # fill in the path to the extracted files here!
    DATA_DIR = data_dir
    if len(DATA_DIR) == 0:
        raise Exception(
            'Please specify path to data directory in gan_language.py!')

    BATCH_SIZE = batch_size  # Batch size
    ITERS = iters  # How many iterations to train for
    SEQ_LEN = seq_len  # Sequence length in characters
    DIM = dim  # Model dimensionality. This is fairly slow and overfits, even on
    # Billion Word. Consider decreasing for smaller datasets.
    CRITIC_ITERS = critic_iters  # How many critic iterations per generator iteration. We
    # use 10 for the results in the paper, but 5 should work fine
    # as well.
    LAMBDA = penalty_weight  # Gradient penalty lambda hyperparameter.
    MAX_N_EXAMPLES = max_n_examples  # Max number of data examples to load. If data loading
    # is too slow or takes too much RAM, you can decrease
    # this (at the expense of having less training data).

    ONE_SIDED = one_sided

    lib.print_model_settings(locals().copy())

    lines, charmap, inv_charmap = language_helpers.load_dataset(
        max_length=SEQ_LEN, max_n_examples=MAX_N_EXAMPLES, data_dir=DATA_DIR)

    def softmax(logits):
        return tf.reshape(
            tf.nn.softmax(tf.reshape(logits, [-1, len(charmap)])),
            tf.shape(logits))

    def make_noise(shape):
        return tf.random_normal(shape)

    def ResBlock(name, inputs):
        output = inputs
        output = tf.nn.relu(output)
        output = lib.ops.conv1d.Conv1D(name + '.1', DIM, DIM, 5, output)
        output = tf.nn.relu(output)
        output = lib.ops.conv1d.Conv1D(name + '.2', DIM, DIM, 5, output)
        return inputs + (0.3 * output)

    def Generator(n_samples, prev_outputs=None):
        output = make_noise(shape=[n_samples, 128])
        output = lib.ops.linear.Linear('Generator.Input', 128, SEQ_LEN * DIM,
                                       output)
        output = tf.reshape(output, [-1, DIM, SEQ_LEN])
        output = ResBlock('Generator.1', output)
        output = ResBlock('Generator.2', output)
        output = ResBlock('Generator.3', output)
        output = ResBlock('Generator.4', output)
        output = ResBlock('Generator.5', output)
        output = lib.ops.conv1d.Conv1D('Generator.Output', DIM, len(charmap),
                                       1, output)
        output = tf.transpose(output, [0, 2, 1])
        output = softmax(output)
        return output

    def Discriminator(inputs):
        output = tf.transpose(inputs, [0, 2, 1])
        output = lib.ops.conv1d.Conv1D('Discriminator.Input', len(charmap),
                                       DIM, 1, output)
        output = ResBlock('Discriminator.1', output)
        output = ResBlock('Discriminator.2', output)
        output = ResBlock('Discriminator.3', output)
        output = ResBlock('Discriminator.4', output)
        output = ResBlock('Discriminator.5', output)
        output = tf.reshape(output, [-1, SEQ_LEN * DIM])
        output = lib.ops.linear.Linear('Discriminator.Output', SEQ_LEN * DIM,
                                       1, output)
        return output

    real_inputs_discrete = tf.placeholder(tf.int32,
                                          shape=[BATCH_SIZE, SEQ_LEN])
    real_inputs = tf.one_hot(real_inputs_discrete, len(charmap))
    fake_inputs = Generator(BATCH_SIZE)
    fake_inputs_discrete = tf.argmax(fake_inputs,
                                     fake_inputs.get_shape().ndims - 1)

    disc_real = Discriminator(real_inputs)
    disc_fake = Discriminator(fake_inputs)

    disc_cost = tf.reduce_mean(disc_fake) - tf.reduce_mean(disc_real)
    gen_cost = -tf.reduce_mean(disc_fake)

    # WGAN lipschitz-penalty
    alpha = tf.random_uniform(shape=[BATCH_SIZE, 1, 1], minval=0., maxval=1.)
    differences = fake_inputs - real_inputs
    interpolates = real_inputs + (alpha * differences)
    gradients = tf.gradients(Discriminator(interpolates), [interpolates])[0]
    slopes = tf.sqrt(
        tf.reduce_sum(tf.square(gradients), reduction_indices=[1, 2]))
    if not ONE_SIDED:
        # Two-sided penalty: push interpolate gradient norms towards 1.
        gradient_penalty = tf.reduce_mean((slopes - 1.)**2)
    else:
        # One-sided penalty: only penalize gradient norms that exceed 1.
        gradient_penalty = tf.reduce_mean(
            tf.clip_by_value(slopes - 1., 0., np.infty)**2)
    disc_cost += LAMBDA * gradient_penalty

    gen_params = lib.params_with_name('Generator')
    disc_params = lib.params_with_name('Discriminator')

    gen_train_op = tf.train.AdamOptimizer(learning_rate=1e-4,
                                          beta1=0.5,
                                          beta2=0.9).minimize(
                                              gen_cost, var_list=gen_params)
    disc_train_op = tf.train.AdamOptimizer(learning_rate=1e-4,
                                           beta1=0.5,
                                           beta2=0.9).minimize(
                                               disc_cost, var_list=disc_params)

    # Dataset iterator
    def inf_train_gen():
        while True:
            np.random.shuffle(lines)
            for i in xrange(0, len(lines) - BATCH_SIZE + 1, BATCH_SIZE):
                yield np.array([[charmap[c] for c in l]
                                for l in lines[i:i + BATCH_SIZE]],
                               dtype='int32')

    # During training we monitor JS divergence between the true & generated ngram
    # distributions for n=1,2,3,4. To get an idea of the optimal values, we
    # evaluate these statistics on a held-out set first.
    true_char_ngram_lms = [
        language_helpers.NgramLanguageModel(i + 1,
                                            lines[10 * BATCH_SIZE:],
                                            tokenize=False) for i in xrange(4)
    ]
    validation_char_ngram_lms = [
        language_helpers.NgramLanguageModel(i + 1,
                                            lines[:10 * BATCH_SIZE],
                                            tokenize=False) for i in xrange(4)
    ]
    for i in xrange(4):
        print "validation set JSD for n={}: {}".format(
            i + 1,
            true_char_ngram_lms[i].js_with(validation_char_ngram_lms[i]))
    true_char_ngram_lms = [
        language_helpers.NgramLanguageModel(i + 1, lines, tokenize=False)
        for i in xrange(4)
    ]

    with tf.Session() as session:

        session.run(tf.initialize_all_variables())

        def generate_samples():
            samples = session.run(fake_inputs)
            samples = np.argmax(samples, axis=2)
            decoded_samples = []
            for i in xrange(len(samples)):
                decoded = []
                for j in xrange(len(samples[i])):
                    decoded.append(inv_charmap[samples[i][j]])
                decoded_samples.append(tuple(decoded))
            return decoded_samples

        gen = inf_train_gen()

        for iteration in xrange(ITERS):
            start_time = time.time()

            # Train generator
            if iteration > 0:
                _ = session.run(gen_train_op)

            # Train critic
            for i in xrange(CRITIC_ITERS):
                _data = gen.next()
                _disc_cost, _ = session.run(
                    [disc_cost, disc_train_op],
                    feed_dict={real_inputs_discrete: _data})

            lib.plot.plot('time', time.time() - start_time)
            lib.plot.plot('train disc cost', _disc_cost)

            if iteration % 100 == 99:
                samples = []
                for i in xrange(10):
                    samples.extend(generate_samples())

                for i in xrange(4):
                    lm = language_helpers.NgramLanguageModel(i + 1,
                                                             samples,
                                                             tokenize=False)
                    lib.plot.plot('js{}'.format(i + 1),
                                  lm.js_with(true_char_ngram_lms[i]))

                with open('samples_{}.txt'.format(iteration), 'w') as f:
                    for s in samples:
                        s = "".join(s)
                        f.write(s + "\n")

            if iteration % 100 == 99:
                lib.plot.flush()

            lib.plot.tick()
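
# A minimal way to invoke the training run defined above. The data path mirrors the
# one used in the earlier main() example and is an assumption about where the
# extracted Billion Word benchmark lives.
if __name__ == '__main__':
    run(data_dir='./data/1-billion-word-language-modeling-benchmark-r13output/',
        seq_len=32,
        batch_size=64,
        critic_iters=10,
        penalty_weight=10,
        one_sided=True)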