Example #1

All snippets in this listing assume the TensorFlow 1.x API (import tensorflow as tf, plus numpy as np, os, re, string, and tqdm where used), the project-local data_input and audio modules, and module-level constants such as SAVE_EVERY, RESTORE_FROM, and ANNEALING_STEPS defined elsewhere in the repo.
def test(model, config, prompts):

    sr = 24000 if 'blizzard' in config.data_path else 16000
    meta = data_input.load_meta(config.data_path)
    config.r = audio.r
    ivocab = meta['vocab']
    config.vocab_size = len(ivocab)

    with tf.device('/cpu:0'):
        batch_inputs = data_input.load_prompts(prompts, ivocab)
        config.num_prompts = len(prompts)

    with tf.Session() as sess:

        # both stats are saved as float32 at train time; a float16 variable
        # here would fail to restore from the checkpoint
        stft_mean = tf.get_variable('stft_mean', shape=(1025*audio.r,), dtype=tf.float32)
        stft_std = tf.get_variable('stft_std', shape=(1025*audio.r,), dtype=tf.float32)

        # initialize model
        model = model(config, batch_inputs, train=False)

        train_writer = tf.summary.FileWriter('log/' + config.save_path + '/test', sess.graph)

        tf.global_variables_initializer().run()
        tf.local_variables_initializer().run()
        coord = tf.train.Coordinator()
        threads = tf.train.start_queue_runners(sess=sess, coord=coord)

        saver = tf.train.Saver()

        print('restoring weights')
        latest_ckpt = tf.train.latest_checkpoint(
            'weights/' + config.save_path[:config.save_path.rfind('/')]
        )
        saver.restore(sess, latest_ckpt)

        stft_mean, stft_std = sess.run([stft_mean, stft_std])

        try:
            while True:
                out = sess.run([
                    model.output,
                    model.alignments,
                    batch_inputs
                ])
                outputs, alignments, inputs = out

                print('saving samples')
                for out, words, align in zip(outputs, inputs['text'], alignments):
                    # store a sample to listen to
                    text = ''.join([ivocab[w] for w in words])
                    attention_plot = data_input.generate_attention_plot(align)
                    sample = audio.invert_spectrogram(out*stft_std + stft_mean)
                    merged = sess.run(tf.summary.merge(
                         [tf.summary.audio(text, sample[None, :], sr),
                          tf.summary.image(text, attention_plot)]
                    ))
                    train_writer.add_summary(merged, 0)
        except tf.errors.OutOfRangeError:
            coord.request_stop()
            coord.join(threads)
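This test driver undoes the feature standardization applied at training time before waveform inversion: the model predicts normalized spectrogram frames, and out*stft_std + stft_mean maps them back to the original scale. A minimal numpy sketch of that round trip, with toy shapes standing in for the real (frames, 1025*r) features:

import numpy as np

# Toy data standing in for (frames, 1025*r) linear-spectrogram frames.
stft = np.abs(np.random.randn(180, 1025)).astype(np.float32)
stft_mean = stft.mean(axis=0)
stft_std = stft.std(axis=0)

normalized = (stft - stft_mean) / stft_std      # what the model is trained on
restored = normalized * stft_std + stft_mean    # what the test loop undoes
assert np.allclose(restored, stft, atol=1e-4)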
Example #2
def test(model, config, prompt_file):

    sr = 24000 if 'blizzard' in config.data_path else 16000
    meta = data_input.load_meta(config.data_path)
    config.r = audio.r
    ivocab = meta['vocab']
    config.vocab_size = len(ivocab)

    with tf.device('/cpu:0'):
        batch_inputs, config.num_prompts = data_input.load_prompts(
            prompt_file, ivocab)

    with tf.Session() as sess:
        stft_mean = np.load(config.data_path + 'stft_mean.npy')
        stft_std = np.load(config.data_path + 'stft_std.npy')

        # initialize model
        model = model(config, batch_inputs, train=False)

        train_writer = tf.summary.FileWriter(
            'log/' + config.save_path + '/test', sess.graph)

        tf.global_variables_initializer().run()
        tf.local_variables_initializer().run()
        coord = tf.train.Coordinator()
        threads = tf.train.start_queue_runners(sess=sess, coord=coord)

        saver = tf.train.Saver()

        print('restoring weights')
        latest_ckpt = tf.train.latest_checkpoint(
            'weights/' + config.save_path[:config.save_path.rfind('/')])
        saver.restore(sess, latest_ckpt)

        try:
            while True:
                out = sess.run([model.output, model.alignments, batch_inputs])
                outputs, alignments, inputs = out

                print('saving samples')
                for out, words, align in zip(outputs, inputs['text'],
                                             alignments):
                    # store a sample to listen to
                    text = ''.join([ivocab[w] for w in words])
                    attention_plot = data_input.generate_attention_plot(align)
                    sample = audio.invert_spectrogram(out * stft_std +
                                                      stft_mean)
                    merged = sess.run(
                        tf.summary.merge([
                            tf.summary.audio(text, sample[None, :], sr),
                            tf.summary.image(text, attention_plot)
                        ]))
                    train_writer.add_summary(merged, 0)
        except tf.errors.OutOfRangeError:
            coord.request_stop()
            coord.join(threads)
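Example #1 restores the normalization stats from checkpoint variables, while Example #2 reads them from .npy files stored next to the data. A sketch of the preprocessing side that would produce those files; the helper name and the (num_frames, feature_dim) layout are assumptions, not the repo's code:

import numpy as np

def save_stft_stats(stft_frames, data_path):
    # stft_frames: (num_frames, feature_dim) array of training spectrogram
    # frames; mean and std are taken per feature dimension.
    np.save(data_path + 'stft_mean.npy', stft_frames.mean(axis=0))
    np.save(data_path + 'stft_std.npy', stft_frames.std(axis=0))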
Example #3
def train(model, config, num_steps=1000000):

    sr = 24000 if 'vctk' in config.data_path else 16000
    meta = data_input.load_meta(config.data_path)
    config.r = meta['r']
    ivocab = meta['vocab']
    config.vocab_size = len(ivocab)

    with tf.Session() as sess:

        inputs, names, num_speakers, stft_mean, stft_std = \
                data_input.load_from_npy(config.data_path)

        config.num_speakers = num_speakers

        # save the mean and std as tensorflow variables so they are saved with the weights
        tf.Variable(stft_mean, name='stft_mean')
        tf.Variable(stft_std, name='stft_std')

        batch_inputs = data_input.build_dataset(sess, inputs, names)

        # initialize model
        model = model(config, batch_inputs, train=True)


        train_writer = tf.summary.FileWriter('log/' + config.save_path + '/train', sess.graph)

        tf.global_variables_initializer().run()
        coord = tf.train.Coordinator()
        threads = tf.train.start_queue_runners(sess=sess, coord=coord)

        saver = tf.train.Saver(max_to_keep=3, keep_checkpoint_every_n_hours=3)

        if config.restore:
            print('restoring weights')
            latest_ckpt = tf.train.latest_checkpoint(
                'weights/' + config.save_path[:config.save_path.rfind('/')]
            )
            if RESTORE_FROM is None:
                if latest_ckpt is not None:
                    saver.restore(sess, latest_ckpt)
            else:
                saver.restore(sess, 'weights/' + config.save_path + '-' + str(RESTORE_FROM))

        lr = model.config.init_lr
        annealing_rate = model.config.annealing_rate
        
        for _ in tqdm(range(num_steps)):
            out = sess.run([
                model.train_op,
                model.global_step,
                model.loss,
                model.output,
                model.alignments,
                model.merged,
                batch_inputs
                ], feed_dict={model.lr: lr})
            _, global_step, loss, output, alignments, summary, inputs = out

            train_writer.add_summary(summary, global_step)

            # detect gradient explosion
            if loss > 1e8 and global_step > 500:
                print('loss exploded')
                break

            if global_step % 1000 == 0:
                lr *= annealing_rate

            if global_step % SAVE_EVERY == 0 and global_step != 0:

                print('saving weights')
                if not os.path.exists('weights/' + config.save_path):
                    os.makedirs('weights/' + config.save_path)
                saver.save(sess, 'weights/' + config.save_path, global_step=global_step)

                print('saving sample')
                # store a sample to listen to
                ideal = audio.invert_spectrogram(inputs['stft'][0]*stft_std + stft_mean)
                sample = audio.invert_spectrogram(output[0]*stft_std + stft_mean)
                attention_plot = data_input.generate_attention_plot(alignments[0])
                step = '_' + str(global_step)
                merged = sess.run(tf.summary.merge(
                    [tf.summary.audio('ideal' + step, ideal[None, :], sr),
                     tf.summary.audio('sample' + step, sample[None, :], sr),
                     tf.summary.image('attention' + step, attention_plot)]
                ))
                train_writer.add_summary(merged, global_step)

        coord.request_stop()
        coord.join(threads)
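The schedule above multiplies lr by annealing_rate once every 1000 steps. In closed form this is the same expression Example #5 uses to recover the learning rate after restoring a checkpoint; a small sketch:

def annealed_lr(init_lr, annealing_rate, global_step, anneal_every=1000):
    # Closed form of the in-loop update: the rate has been multiplied by
    # annealing_rate once per anneal_every completed steps.
    return init_lr * annealing_rate ** (global_step // anneal_every)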
Example #4
def train(model, config, num_steps=1000000):

    sr = 24000 if 'blizzard' in config.data_path else 16000
    meta = data_input.load_meta(config.data_path)
    config.r = meta['r']
    ivocab = meta['vocab']
    config.vocab_size = len(ivocab)

    with tf.Session() as sess:

        inputs, stft_mean, stft_std = data_input.load_from_npy(
            config.data_path)

        batch_inputs = data_input.build_dataset(sess, inputs)

        # initialize model
        model = model(config, batch_inputs, train=True)

        train_writer = tf.summary.FileWriter(
            'log/' + config.save_path + '/train', sess.graph)

        tf.global_variables_initializer().run()
        coord = tf.train.Coordinator()
        threads = tf.train.start_queue_runners(sess=sess, coord=coord)

        saver = tf.train.Saver(max_to_keep=3, keep_checkpoint_every_n_hours=3)

        if config.restore:
            print('restoring weights')
            latest_ckpt = tf.train.latest_checkpoint(
                'weights/' + config.save_path[:config.save_path.rfind('/')])
            if RESTORE_FROM is None:
                # guard against a fresh run with no checkpoint on disk,
                # as in Example #3; restoring None would raise
                if latest_ckpt is not None:
                    saver.restore(sess, latest_ckpt)
            else:
                saver.restore(
                    sess,
                    'weights/' + config.save_path + '-' + str(RESTORE_FROM))

        lr = model.config.init_lr
        annealing_rate = model.config.annealing_rate

        for _ in tqdm(range(num_steps)):
            out = sess.run(
                [model.train_op, model.global_step, model.loss, model.output,
                 model.alignments, model.merged, batch_inputs],
                feed_dict={model.lr: lr})
            _, global_step, loss, output, alignments, summary, inputs = out

            train_writer.add_summary(summary, global_step)

            # detect gradient explosion
            if loss > 1e8 and global_step > 500:
                print('loss exploded')
                break

            if global_step % 1000 == 0:
                lr *= annealing_rate

            if global_step % SAVE_EVERY == 0 and global_step != 0:

                print('saving weights')
                if not os.path.exists('weights/' + config.save_path):
                    os.makedirs('weights/' + config.save_path)
                saver.save(sess,
                           'weights/' + config.save_path,
                           global_step=global_step)

                print('saving sample')
                # store a sample to listen to
                ideal = audio.invert_spectrogram(inputs['stft'][0] * stft_std +
                                                 stft_mean)
                sample = audio.invert_spectrogram(output[0] * stft_std +
                                                  stft_mean)
                attention_plot = data_input.generate_attention_plot(
                    alignments[0])
                step = '_' + str(global_step)
                merged = sess.run(
                    tf.summary.merge([
                        tf.summary.audio('ideal' + step, ideal[None, :], sr),
                        tf.summary.audio('sample' + step, sample[None, :], sr),
                        tf.summary.image('attention' + step, attention_plot)
                    ]))
                train_writer.add_summary(merged, global_step)

        coord.request_stop()
        coord.join(threads)
Example #5

def train(model, config, num_steps=1000000):

    sr = 24000 if 'vctk' in config.data_path else 16000
    meta = data_input.load_meta(config.data_path)
    config.r = meta['r']
    ivocab = meta['vocab']
    config.vocab_size = len(ivocab)

    print("Sampling mean and std...")
    if args.hdf5:
        stft_mean, stft_std, mel_mean, mel_std = data_input.get_stft_and_mel_std_and_mean_from_table(
            os.path.join(config.data_path, "data"))
    else:
        stft_mean, stft_std, mel_mean, mel_std = data_input.get_stft_and_mel_std_and_mean_from_tfrecords(
            config.tf_record_files)
    print("Sampled mean and std!")

    print("Building dataset...")
    loader, reader, names, shapes, types = data_input.build_dataset_with_hdf5_table(
        os.path.join(config.data_path, "data"))
    print("Built dataset!")

    config_proto = tf.ConfigProto()
    config_proto.gpu_options.allow_growth = True
    with tf.Session(config=config_proto) as sess:
        if args.hdf5:
            batch_inputs, stft_mean, stft_std = data_input.build_hdf5_dataset_from_table(
                os.path.join(config.data_path, "data"), sess, loader, names,
                shapes, types, ivocab, stft_mean, stft_std, mel_mean, mel_std)
        else:
            batch_inputs = data_input.build_tfrecord_dataset(
                config.tf_record_files, sess, names, ivocab, stft_mean,
                stft_std, mel_mean, mel_std)

        tf.Variable(stft_mean, name='stft_mean')
        tf.Variable(stft_std, name='stft_std')

        print("Initializing model...")
        # initialize model
        model = model(config, batch_inputs, train=True)
        print("Model initialized!")

        train_writer = tf.summary.FileWriter(
            'log/' + config.save_path + '/train', sess.graph)

        tf.global_variables_initializer().run()
        coord = tf.train.Coordinator()
        print("Starting queue runners...")
        threads = tf.train.start_queue_runners(sess=sess, coord=coord)
        print("Started queue runners!")

        saver = tf.train.Saver(max_to_keep=3, keep_checkpoint_every_n_hours=3)

        if config.restore:
            print('restoring weights')
            latest_ckpt = tf.train.latest_checkpoint(
                'weights/' + config.save_path[:config.save_path.rfind('/')])
            if RESTORE_FROM is None:
                if latest_ckpt is not None:
                    saver.restore(sess, latest_ckpt)
            else:
                saver.restore(
                    sess,
                    'weights/' + config.save_path + '-' + str(RESTORE_FROM))

        lr = model.config.init_lr
        annealing_rate = model.config.annealing_rate
        if config.restore:
            print("Restored global step: %s" %
                  str(model.global_step.eval(sess)))
            lr *= (annealing_rate**(model.global_step.eval(sess) //
                                    ANNEALING_STEPS))
            print("Recovered learning rate: '%s'" % str(lr))
        print("Using learning rate: '%s' and annealing rate: '%s'" %
              (lr, annealing_rate))

        print("Looping over num_steps: %s" % str(num_steps))
        with loader.begin(sess):
            for _ in tqdm(range(num_steps)):
                print("Running sess...")
                out = sess.run(
                    [model.train_op, model.global_step, model.loss,
                     model.output, model.alignments, model.merged,
                     batch_inputs],
                    feed_dict={model.lr: lr})
                _, global_step, loss, output, alignments, summary, inputs = out
                print("Finished run: %d!" % global_step)

                train_writer.add_summary(summary, global_step)

                # detect gradient explosion
                if loss > 1e9 and global_step > 50000:
                    print('loss exploded')
                    break

                if global_step % ANNEALING_STEPS == 0:
                    old_lr = lr
                    lr *= annealing_rate
                    print("Updated learning rate from: %s to %s" %
                          (str(old_lr), str(lr)))

                if global_step % SAVE_EVERY == 0 and global_step != 0:

                    print('saving weights')
                    if not os.path.exists('weights/' + config.save_path):
                        os.makedirs('weights/' + config.save_path)
                    saver.save(sess,
                               'weights/' + config.save_path,
                               global_step=global_step)

                    print('saving sample')
                    print("stft shape: %s" % str(inputs['stft'][0].shape))
                    # store a sample to listen to
                    ideal = audio.invert_spectrogram(inputs['stft'][0] *
                                                     stft_std + stft_mean)
                    sample = audio.invert_spectrogram(output[0] * stft_std +
                                                      stft_mean)
                    attention_plot = data_input.generate_attention_plot(
                        alignments[0])
                    step = '_' + str(global_step) + '_'
                    # Remove pad words
                    text_string = "".join(
                        filter(lambda x: x != "<pad>",
                               [ivocab[word] for word in inputs['text'][0]]))
                    # Replace chars outside the set TF allows in tag names
                    # (the regex from name_scope in tensorflow's ops.py) with 0
                    text_string = "".join(
                        x if re.match("[A-Za-z0-9_.\\-/ ]", x) else "0"
                        for x in text_string)
                    text_string = text_string.strip()
                    quoted_text_string = "\"" + text_string + "\""
                    print("ideal: %s %s %s" %
                          (str(step), str(ideal[None, :]), str(sr)))
                    print("sample: %s %s %s" %
                          (str(step), str(sample[None, :]), str(sr)))
                    merged = sess.run(
                        tf.summary.merge([
                            tf.summary.audio('ideal' + step + text_string,
                                             ideal[None, :], sr),
                            tf.summary.audio('sample' + step + text_string,
                                             sample[None, :], sr),
                            tf.summary.image('attention' + step,
                                             attention_plot),
                            tf.summary.text(
                                'text' + step,
                                tf.convert_to_tensor(quoted_text_string))
                        ]))
                    train_writer.add_summary(merged, global_step)
                if global_step % 50 == 0:
                    print("This is reassurance. Global step at: %d" %
                          global_step)

            coord.request_stop()
            coord.join(threads)
        reader.close()
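The per-character filter in the sampling branch above keeps TensorFlow summary tags inside the name_scope charset. Assuming that is its only job, the map-and-strip sequence collapses to a single re.sub with a negated character class:

import re

def sanitize_summary_tag(text):
    # Equivalent to the character-by-character map above: anything outside
    # TF's allowed name charset [A-Za-z0-9_.\-/ ] becomes '0'.
    return re.sub(r"[^A-Za-z0-9_.\-/ ]", "0", text).strip()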
Example #6

def check_tf_records_with_tensorboard(files, data_path, save_path):
    sr = 24000 if "vctk" in save_path else 16000
    meta = data_input.load_meta(data_path)
    ivocab = meta["vocab"]

    with tf.Session() as sess:
        train_writer = tf.summary.FileWriter('log/' + save_path + '/debug',
                                             sess.graph)

        record_placeholder = tf.placeholder(tf.string)
        features_in = tf.parse_single_example(
            record_placeholder,
            features={
                "index": tf.FixedLenFeature([], tf.int64),
                # "stfts": tf.FixedLenFeature((180, 2050), tf.float32),
                "stfts": tf.FixedLenFeature((504, 2050), tf.float32),
                "stfts_shape": tf.FixedLenFeature((2), tf.int64),
                # "mels": tf.FixedLenFeature((180, 160), tf.float32),
                "mels": tf.FixedLenFeature((504, 160), tf.float32),
                "mels_shape": tf.FixedLenFeature((2), tf.int64),
                "texts": tf.VarLenFeature(tf.int64),
                "text_lens": tf.FixedLenFeature([], tf.int64),
                "speech_lens": tf.FixedLenFeature([], tf.int64),
            })

        for file_path in files:
            print("Reading file: %s" % file_path)
            for record, i in zip(tf.python_io.tf_record_iterator(file_path),
                                 range(count_records([file_path]))):
                if i % SAVE_EVERY == 0 and (i != 0 or SAVE_EVERY == 1):
                    print("Iteration %d" % i)
                    features = sess.run(features_in,
                                        feed_dict={record_placeholder: record})
                    texts = features["texts"]
                    texts = tf.sparse_to_dense(texts.indices,
                                               texts.dense_shape, texts.values)

                    # Debugging THIS script.
                    """
                    print("texts (numbers): %s" % str(texts.eval(session=sess)))
                    for word in texts.eval(session=sess):
                        try:
                            print("word: %s" % str(ivocab[word]))
                        except:
                            print("invalid word: %s" % str(word))
                    print("ivocab: %s" % str(ivocab))
                    """

                    # Convert integers to words
                    texts = "".join(
                        filter(lambda x: x != "<pad>", [
                            ivocab[word] for word in texts.eval(session=sess)
                        ]))
                    print("Texts: %s" % texts)
                    texts_filtered = "".join(
                        filter(lambda x: x in set(string.printable), texts))
                    # print("Texts filtered: %s" % texts_filtered)

                    print("stfts shape: %s" % str(features["stfts"].shape))
                    print("mels shape: %s" % str(features["mels"].shape))

                    print("saving sample")
                    # store a sample to listen to
                    ideal = audio.invert_spectrogram(features["stfts"])
                    step = "_" + str(i) + "_"
                    merged = sess.run(
                        tf.summary.merge([
                            tf.summary.audio(
                                "ideal" + step + "\"" + texts_filtered + "\"",
                                ideal[None, :], sr),
                            tf.summary.text(
                                "text" + step,
                                tf.convert_to_tensor(texts_filtered))
                        ]))
                    train_writer.add_summary(merged, i)
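count_records is referenced above but not shown in this listing. A plausible implementation (an assumption, not the repo's code) walks each file once with the TF 1.x record iterator:

import tensorflow as tf

def count_records(file_paths):
    # One O(n) pass per TFRecord file; records are counted, not parsed.
    return sum(1 for path in file_paths
               for _ in tf.python_io.tf_record_iterator(path))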
Example #7
def train(model, config, num_steps=1000000):

    sr = 24000 if 'vctk' in config.data_path else 16000
    meta = data_input.load_meta(config.data_path)
    config.r = meta['r']
    ivocab = meta['vocab']
    config.vocab_size = len(ivocab)

    with tf.Session() as sess:

        # Commented out
        """
        inputs, names, num_speakers, stft_mean, stft_std = \
                data_input.load_from_npy(config.data_path)
        """

        # Added
        # And THEN commented out
        """
        print("Loading inputs...")
        inputs, names, num_speakers = data_input.load_from_hdf5(config.data_path)
        print("Loaded all inputs!")
        """

        # Commented out
        """
        config.num_speakers = num_speakers
        """

        # Commented out
        # save the mean and std as tensorflow variables so they are saved with the weights
        """
        tf.Variable(stft_mean, name='stft_mean')
        tf.Variable(stft_std, name='stft_std')
        """

        print("Building dataset...")
        # Commented out
        """
        batch_inputs = data_input.build_dataset(sess, inputs, names)
        """

        # Added
        batch_inputs, loader = data_input.build_dataset_with_hdf5(
            os.path.join(config.data_path, "data"))
        print("Built dataset!")
        print("batch_inputs: %s" % str(batch_inputs))
        with loader.begin(sess):
            print("Initializing model...")
            # initialize model
            model = model(config, batch_inputs, train=True)
            print("Model initialized!")

            train_writer = tf.summary.FileWriter(
                'log/' + config.save_path + '/train', sess.graph)

            tf.global_variables_initializer().run()
            coord = tf.train.Coordinator()
            print("Starting queue runners...")
            threads = tf.train.start_queue_runners(sess=sess, coord=coord)
            print("Started queue runners!")

            saver = tf.train.Saver(max_to_keep=3,
                                   keep_checkpoint_every_n_hours=3)

            if config.restore:
                print('restoring weights')
                latest_ckpt = tf.train.latest_checkpoint(
                    'weights/' +
                    config.save_path[:config.save_path.rfind('/')])
                if RESTORE_FROM is None:
                    if latest_ckpt is not None:
                        saver.restore(sess, latest_ckpt)
                else:
                    saver.restore(
                        sess, 'weights/' + config.save_path + '-' +
                        str(RESTORE_FROM))

            lr = model.config.init_lr
            annealing_rate = model.config.annealing_rate

            print("Looping over num_steps: %s" % str(num_steps))
            for _ in tqdm(range(num_steps)):
                print("Running sess...")
                out = sess.run(
                    [model.train_op, model.global_step, model.loss,
                     model.output, model.alignments, model.merged,
                     batch_inputs],
                    feed_dict={model.lr: lr})
                _, global_step, loss, output, alignments, summary, inputs = out
                print("Finished run: %d!" % global_step)

                train_writer.add_summary(summary, global_step)

                # detect gradient explosion
                if loss > 1e8 and global_step > 500:
                    print('loss exploded')
                    break

                if global_step % 1000 == 0:
                    lr *= annealing_rate

                if global_step % SAVE_EVERY == 0 and global_step != 0:

                    print('saving weights')
                    if not os.path.exists('weights/' + config.save_path):
                        os.makedirs('weights/' + config.save_path)
                    saver.save(sess,
                               'weights/' + config.save_path,
                               global_step=global_step)

                    print('saving sample')
                    # store a sample to listen to
                    ideal = audio.invert_spectrogram(inputs['stft'][0] *
                                                     stft_std + stft_mean)
                    sample = audio.invert_spectrogram(output[0] * stft_std +
                                                      stft_mean)
                    attention_plot = data_input.generate_attention_plot(
                        alignments[0])
                    step = '_' + str(global_step)
                    merged = sess.run(
                        tf.summary.merge([
                            tf.summary.audio('ideal' + step, ideal[None, :],
                                             sr),
                            tf.summary.audio('sample' + step, sample[None, :],
                                             sr),
                            tf.summary.image('attention' + step,
                                             attention_plot)
                        ]))
                    train_writer.add_summary(merged, global_step)
                if global_step % 50 == 0:
                    print("This is reassurance. Global step at: %d" %
                          global_step)

            coord.request_stop()
            coord.join(threads)
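Examples #3, #5, and #7 all repeat the same restore branch: prefer an explicit RESTORE_FROM step, otherwise fall back to the latest checkpoint if one exists. A consolidated sketch of that logic, using the same weights/ path layout as the examples:

import tensorflow as tf

def restore_weights(sess, saver, save_path, restore_from=None):
    # Restore an explicit step if requested, else the newest checkpoint.
    if restore_from is not None:
        saver.restore(sess, 'weights/' + save_path + '-' + str(restore_from))
        return True
    latest_ckpt = tf.train.latest_checkpoint(
        'weights/' + save_path[:save_path.rfind('/')])
    if latest_ckpt is not None:
        saver.restore(sess, latest_ckpt)
        return True
    return False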