Example #1
def main():
    logdir, ckpt = os.path.split(args.checkpoint)
    arch = tf.gfile.Glob(os.path.join(
        logdir, 'architecture*.json'))[0]  # should only be 1 file
    with open(arch) as fp:
        arch = json.load(fp)

    normalizer = Tanhize(
        xmax=np.fromfile('./etc/xmax.npf'),
        xmin=np.fromfile('./etc/xmin.npf'),
    )

    features = read_whole_features(args.file_pattern.format(args.src))

    x = normalizer.forward_process(features['sp'])
    x = nh_to_nchw(x)
    y_s = features['speaker']
    y_t_id = tf.placeholder(dtype=tf.int64, shape=[1])
    y_t = y_t_id * tf.ones(shape=[tf.shape(x)[0]], dtype=tf.int64)

    machine = MODEL(arch)
    z = machine.encode(x)
    x_t = machine.decode(z, y_t)  # NOTE: the API yields NHWC format
    x_t = tf.squeeze(x_t)
    x_t = normalizer.backward_process(x_t)

    # For sanity check (validation)
    x_s = machine.decode(z, y_s)
    x_s = tf.squeeze(x_s)
    x_s = normalizer.backward_process(x_s)

    f0_s = features['f0']
    f0_t = convert_f0(f0_s, args.src, args.trg)

    output_dir = get_default_output(args.output_dir)

    saver = tf.train.Saver()
    sv = tf.train.Supervisor(logdir=output_dir)
    with sv.managed_session() as sess:
        load(saver, sess, logdir, ckpt=ckpt)
        while True:
            try:
                feat, f0, sp = sess.run(
                    [features, f0_t, x_t],
                    feed_dict={y_t_id: np.asarray([SPEAKERS.index(args.trg)])})
                feat.update({'sp': sp, 'f0': f0})
                y = pw2wav(feat)
                oFilename = make_output_wav_name(output_dir, feat['filename'])
                sf.write(oFilename, y, FS)
            except tf.errors.OutOfRangeError:  # input pipeline exhausted
                break
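
The `Tanhize` normalizer above squashes the spectral features into [-1, 1] before encoding and maps them back afterwards. A minimal sketch of such a min-max normalizer (an assumption about its behavior, not this project's exact code; `xmax`/`xmin` are per-dimension statistics precomputed over the training set):

import numpy as np
import tensorflow as tf

class Tanhize(object):
    """Min-max normalization into [-1, 1] (sketch, not the repo's code)."""
    def __init__(self, xmax, xmin):
        self.xmax = tf.constant(xmax, dtype=tf.float32)
        self.xmin = tf.constant(xmin, dtype=tf.float32)

    def forward_process(self, x):
        x = (x - self.xmin) / (self.xmax - self.xmin)  # -> [0, 1]
        return tf.clip_by_value(x, 0., 1.) * 2. - 1.   # -> [-1, 1]

    def backward_process(self, x):
        return (x * .5 + .5) * (self.xmax - self.xmin) + self.xmin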
Example #2
def main():
    logdir, ckpt = os.path.split(args.checkpoint)
    arch = tf.gfile.Glob(os.path.join(logdir, 'architecture*.json'))[0]  # should only be 1 file
    with open(arch) as fp:
        arch = json.load(fp)

    normalizer = Tanhize(
        xmax=np.fromfile('./etc/xmax.npf'),
        xmin=np.fromfile('./etc/xmin.npf'),
    )

    features = read_whole_features(args.file_pattern.format(args.src))

    x = normalizer.forward_process(features['sp'])
    x = nh_to_nchw(x)
    y_s = features['speaker']
    y_t_id = tf.placeholder(dtype=tf.int64, shape=[1])
    y_t = y_t_id * tf.ones(shape=[tf.shape(x)[0]], dtype=tf.int64)

    machine = MODEL(arch)
    z = machine.encode(x)
    x_t = machine.decode(z, y_t)  # NOTE: the API yields NHWC format
    x_t = tf.squeeze(x_t)
    x_t = normalizer.backward_process(x_t)

    # For sanity check (validation)
    x_s = machine.decode(z, y_s)
    x_s = tf.squeeze(x_s)
    x_s = normalizer.backward_process(x_s)

    f0_s = features['f0']
    f0_t = convert_f0(f0_s, args.src, args.trg)

    output_dir = get_default_output(args.output_dir)

    saver = tf.train.Saver()
    sv = tf.train.Supervisor(logdir=output_dir)
    with sv.managed_session() as sess:
        load(saver, sess, logdir, ckpt=ckpt)
        while True:
            try:
                feat, f0, sp = sess.run(
                    [features, f0_t, x_t],
                    feed_dict={y_t_id: np.asarray([SPEAKERS.index(args.trg)])}
                )
                feat.update({'sp': sp, 'f0': f0})
                y = pw2wav(feat)
                oFilename = make_output_wav_name(output_dir, feat['filename'])
                sf.write(oFilename, y, FS)
            except tf.errors.OutOfRangeError:  # input pipeline exhausted
                break
Example #3
def main(unused_args):
    if args.logdir is None:
        raise ValueError('Please specify the dir to the checkpoint')

    arch = tf.gfile.Glob(join(args.logdir, 'arch*.json'))[0]
    arch = json2dict(arch)

    net = VQVAE(arch)

    data = ByteWavWholeReader(speaker_list=txt2list(args.speaker_list),
                              filenames=tf.gfile.Glob(args.file_pattern))

    ZH = net.encode(data.x, args.mode)

    ema = tf.train.ExponentialMovingAverage(decay=0.995)
    trg_vars = {ema.average_name(v): v for v in tf.trainable_variables()}
    saver = tf.train.Saver(trg_vars)

    sess_config = tf.ConfigProto(allow_soft_placement=True,
                                 gpu_options=tf.GPUOptions(allow_growth=True))
    with tf.Session(config=sess_config) as sess:
        sess.run(tf.tables_initializer())
        sess.run(data.iterator.initializer)
        sess.run(tf.global_variables_initializer())
        load(saver, sess, args.logdir, ckpt=args.ckpt)

        hist = np.zeros([arch['num_exemplar']], dtype=np.int64)
        counter = 1
        while True:
            try:
                z_ids = sess.run(ZH)
                print('\rNum of processed files: {:d}'.format(counter), end='')
                counter += 1
                for i in z_ids[0]:  # batch size is 1
                    hist[i] += 1
            except tf.errors.OutOfRangeError:
                print()
                break

        with open('histogram.npf', 'wb') as fp:
            hist.tofile(fp)

        plt.figure(figsize=[10, 2])
        plt.plot(np.log10(hist + 1), '.')
        plt.xlim([0, arch['num_exemplar'] - 1])
        plt.ylabel('log-frequency')
        plt.xlabel('exemplar index')
        plt.savefig('histogram.png')
        plt.close()
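
Because `hist.tofile` writes raw int64 counts, the `.npf` file can be read back with `np.fromfile` using the same dtype. A quick inspection sketch:

import numpy as np

hist = np.fromfile('histogram.npf', dtype=np.int64)  # dtype must match the writer
print('exemplars used at least once: {}/{}'.format(
    np.count_nonzero(hist), hist.size))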
Example #4
def main(unused_args=None):
    # args(sys.argv)

    if args.model is None:
        raise ValueError(
            '\n  You MUST specify `model`.'
            '\n    Use `python convert.py --help` to see applicable options.'
        )

    module = import_module(args.module, package=None)
    MODEL = getattr(module, args.model)

    FS = 16000

    with open(args.speaker_list) as fp:
        SPEAKERS = [l.strip() for l in fp.readlines()]

    logdir, ckpt = os.path.split(args.checkpoint)
    if 'VAE' in logdir:
        _path_to_arch, _ = os.path.split(logdir)
    else:
        _path_to_arch = logdir
    arch = tf.gfile.Glob(os.path.join(_path_to_arch, 'architecture*.json'))
    if len(arch) != 1:
        print('WARNING: expected exactly one architecture file, found {}.'.format(
            len(arch)))
    arch = arch[0]
    with open(arch) as fp:
        arch = json.load(fp)

    normalizer = Tanhize(
        xmax=np.fromfile('./etc/{}_xmax.npf'.format(args.corpus_name)),
        xmin=np.fromfile('./etc/{}_xmin.npf'.format(args.corpus_name)),
    )

    features = read_whole_features(args.file_pattern.format(args.src))

    x = normalizer.forward_process(features['sp'])
    x = nh_to_nhwc(x)
    y_s = features['speaker']
    y_t_id = tf.placeholder(dtype=tf.int64, shape=[1])
    y_t = y_t_id * tf.ones(shape=[tf.shape(x)[0]], dtype=tf.int64)

    f0_t = features['f0']
    #    f0_t = convert_f0(f0_s, args.src, args.trg)
    #    f0_s_convert = tf.cast(f0_s,dtype=tf.int64)
    f0_t_convert = tf.cast(f0_t, dtype=tf.int64)
    machine = MODEL(arch, is_training=False)
    z = machine.encode(x)
    x_t = machine.decode(z, y_t,
                         f0_t_convert)  # NOTE: the API yields NHWC format
    x_t = tf.squeeze(x_t)
    x_t = normalizer.backward_process(x_t)

    output_dir = get_default_output(args.output_dir)

    saver = tf.train.Saver()
    sv = tf.train.Supervisor(logdir=output_dir)
    with sv.managed_session() as sess:
        load(saver, sess, logdir, ckpt=ckpt)
        print()
        while True:
            try:
                s_time = time.perf_counter()
                feat, f0, sp = sess.run(
                    [features, f0_t, x_t],
                    feed_dict={y_t_id: np.asarray([SPEAKERS.index(args.trg)])})
                feat.update({'sp': sp, 'f0': f0})
                y = pw2wav(feat)
                oFilename = make_output_wav_name(output_dir, feat['filename'])
                print('\rProcessing {}'.format(oFilename), end='')
                e_time = time.perf_counter()
                print('\n')
                print('Time_sp: {}'.format(e_time - s_time))
                print('\n')
                sf.write(oFilename, y, FS)
            except tf.errors.OutOfRangeError:  # input pipeline exhausted
                break
            except KeyboardInterrupt:
                break
        print()
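
Examples #1 and #2 (and the commented-out line above) rely on `convert_f0`, which in voice conversion is conventionally a log-Gaussian transformation: normalize log-f0 by the source speaker's statistics and denormalize with the target's. A NumPy sketch of that convention (the per-speaker mean/std arguments are assumptions; this is not necessarily the project's implementation):

import numpy as np

def convert_f0_sketch(f0, mu_src, std_src, mu_trg, std_trg):
    """Log-Gaussian f0 conversion; unvoiced frames (f0 <= 0) pass through."""
    f0 = np.asarray(f0, dtype=np.float64)
    voiced = f0 > 0
    lf0 = np.zeros_like(f0)
    lf0[voiced] = (np.log(f0[voiced]) - mu_src) / std_src * std_trg + mu_trg
    return np.where(voiced, np.exp(lf0), f0)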
Example #5
def main(unused_args):
    if args.logdir is None:
        raise ValueError('Please specify the dir to the checkpoint')

    speaker_list = txt2list(args.speaker_list)

    arch = tf.gfile.Glob(os.path.join(args.logdir, 'arch*.json'))[0]
    arch = json2dict(arch)

    net = VQVAE(arch)

    # they start roughly at the same position but end very differently (3 is longest)
    filenames = [
        'dataset/VCTK/tfr/p227/p227_363.tfr',
        # 'dataset/VCTK/tfr/p240/p240_341.tfr',
        # 'dataset/VCTK/tfr/p243/p243_359.tfr',
        'dataset/VCTK/tfr/p225/p225_001.tfr'
    ]
    data = ByteWavWholeReader(speaker_list, filenames)

    X = tf.placeholder(dtype=tf.int64, shape=[None, None])
    Y = tf.placeholder(dtype=tf.int64, shape=[None])
    ZH = net.encode(X, args.mode)
    XH = net.generate(X, ZH, Y)
    # XWAV = mu_law_decode(X)
    # XBIN = tf.contrib.ffmpeg.encode_audio(XWAV, 'wav', arch['fs'])

    ema = tf.train.ExponentialMovingAverage(decay=0.995)
    trg_vars = {ema.average_name(v): v for v in tf.trainable_variables()}
    saver = tf.train.Saver(trg_vars)

    logdir = get_default_logdir(args.logdir)
    tf.gfile.MkDir(logdir)

    sess_config = tf.ConfigProto(allow_soft_placement=True,
                                 gpu_options=tf.GPUOptions(allow_growth=True))
    with tf.Session(config=sess_config) as sess:
        sess.run(tf.tables_initializer())
        sess.run(data.iterator.initializer)

        results = []
        for _ in filenames:
            result = sess.run({'x': data.x, 'y': data.y})
            results.append(result)
        # results1 = sess.run({'x': data.x, 'y': data.y})
        # results2 = sess.run({'x': data.x, 'y': data.y})

        length_input = net.n_padding() + 1  # same as padding + 1

        ini = 15149 - length_input
        end = 42285
        # x_source1 = results1['x'][:, ini: end]
        # x_source2 = results2['x'][:, ini: end]
        for i in range(len(results)):
            x = results[i]['x']
            if x.shape[-1] < end:
                x = np.concatenate(
                    [x, x[0, 0] + np.zeros([1, end - x.shape[-1]])], -1)
            results[i]['x'] = x[:, ini:end]

        # from pdb import set_trace
        # set_trace()
        x_source = np.concatenate([
            results[0]['x'], results[0]['x'], results[1]['x'], results[1]['x']
        ], 0)

        B = x_source.shape[0]

        y_input = np.concatenate([
            results[0]['y'], results[1]['y'], results[1]['y'], results[0]['y']
        ], 0)

        length_target = x_source.shape[1] - length_input

        while True:
            sess.run(tf.global_variables_initializer())
            load(saver, sess, args.logdir, ckpt=args.ckpt)

            z_blend = sess.run(ZH, feed_dict={X: x_source})
            x_input = x_source[:, :length_input]

            z_input = z_blend[:, :length_input, :]

            # Generate
            try:
                x_gen = np.zeros([B, length_target],
                                 dtype=np.int64)  # + results['x'][0, 0]
                for i in range(length_target):
                    xh = sess.run(XH,
                                  feed_dict={
                                      X: x_input,
                                      ZH: z_input,
                                      Y: y_input
                                  })
                    z_input = z_blend[:, i + 1:i + 1 + length_input, :]
                    x_input[:, :-1] = x_input[:, 1:]
                    x_input[:, -1] = xh[:, -1]
                    x_gen[:, i] = xh[:, -1]
                    print('\rGenerating {:5d}/{:5d}... x={:3d}'.format(
                        i + 1, length_target, xh[0, -1]),
                          end='',
                          flush=True)
            except KeyboardInterrupt:
                print("Interrupted by the user.")
            finally:
                print()
                x_wav = mu_law_decode(x_gen)
                for i in range(x_wav.shape[0]):
                    x_1ch = np.expand_dims(x_wav[i], -1)
                    # x_bin = sess.run(XBIN, feed_dict={X: x_1ch})

                    librosa.output.write_wav('testwav-{}.wav'.format(i), x_1ch,
                                             arch['fs'])
                    # with open(os.path.join(logdir, 'testwav-{}.wav'.format(i)), 'wb') as fp:
                    #  fp.write(x_bin)

            # For periodic gen.
            if args.period > 0:
                try:
                    print('Sleep for a while')
                    sleep(args.period * 60)
                    logdir = get_default_logdir(args.logdir)
                    tf.gfile.MkDir(logdir)
                except KeyboardInterrupt:
                    print('Stop periodic gen.')
                    break
                finally:
                    print('all finished')
            else:
                break
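
`mu_law_decode` above inverts the mu-law companding applied to the integer sample codes. The standard 8-bit formulation as a NumPy sketch (assuming codes in [0, mu] with mu = 255; the project's own decoder may differ in details):

import numpy as np

def mu_law_decode_sketch(codes, mu=255):
    """Integer codes in [0, mu] -> waveform samples in [-1, 1]."""
    y = 2. * np.asarray(codes, dtype=np.float64) / mu - 1.  # -> [-1, 1]
    return np.sign(y) * ((1. + mu) ** np.abs(y) - 1.) / mu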
Example #6
def main(unused_args=None):

    if args.model is None:
        raise ValueError(
            '\n  You MUST specify `model`.'
            '\n    Use `python convert.py --help` to see applicable options.'
        )

    module = import_module(args.module_original, package=None)
    MODEL = getattr(module, args.model)

    FS = 16000

    with open(args.speaker_list) as fp:
        SPEAKERS = [l.strip() for l in fp.readlines()]

    logdir_f0, ckpt_f0_cwt = os.path.split(args.checkpoint_f0_cwt)

    # f0:
    if 'VAE' in logdir_f0:
        _path_to_arch, _ = os.path.split(logdir_f0)
    else:
        _path_to_arch = logdir_f0
    arch_f0 = tf.gfile.Glob(os.path.join(_path_to_arch, 'architecture*.json'))
    if len(arch_f0) != 1:
        print('WARNING: expected exactly one architecture file, found {}.'.format(
            len(arch_f0)))
    arch_f0 = arch_f0[0]
    with open(arch_f0) as fp:
        arch_f0 = json.load(fp)

    features = read_whole_features(args.file_pattern.format(args.src))

    f0_cwt = features['lf0_cwt_norm']
    f0_cwt = nh_to_nhwc(f0_cwt)

    y_s_f0 = features['speaker']
    y_t_id_f0 = tf.placeholder(dtype=tf.int64, shape=[1])
    y_t_f0 = y_t_id_f0 * tf.ones(shape=[tf.shape(f0_cwt)[0]], dtype=tf.int64)
    if not os.path.isdir('./f0_results'):
        os.mkdir('./f0_results')

    # convert f0:
    machine_f0 = MODEL(arch_f0, is_training=False)
    z_f0 = machine_f0.encode(f0_cwt)
    f0_cwt_t = machine_f0.decode(z_f0,
                                 y_t_f0)  # NOTE: the API yields NHWC format
    f0_cwt_t = tf.squeeze(f0_cwt_t)

    output_dir = get_default_output(args.output_dir)
    saver = tf.train.Saver()
    sv = tf.train.Supervisor(logdir=output_dir)
    with sv.managed_session() as sess:
        load(saver, sess, logdir_f0, ckpt=ckpt_f0_cwt)
        print()
        while True:
            try:
                feat, lf0_cwt = sess.run(
                    [features, f0_cwt_t],
                    feed_dict={y_t_id_f0: np.asarray([SPEAKERS.index(args.trg)])})
                feat.update({'lf0_cwt': lf0_cwt})

                feats = dic2npy(feat)
                oFilename = make_output_bin_name(output_dir, feat['filename'])

                with open(join('./f0_results', '{}.bin'.format(oFilename)),
                          'wb') as fp:
                    fp.write(feats.tobytes())  # tostring() is a deprecated alias

            except tf.errors.OutOfRangeError:  # input pipeline exhausted
                break
            except KeyboardInterrupt:
                break
        print()
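
The features are dumped as raw bytes, so reading a result file back requires knowing the dtype and shape produced by `dic2npy` (both are assumptions in this sketch, as is the filename):

import numpy as np

with open('./f0_results/some_utterance.bin', 'rb') as fp:  # filename: illustrative
    feats = np.frombuffer(fp.read(), dtype=np.float32)     # dtype: assumption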
Example #7
def main():
    '''
    Note:
      1. The input is rescaled to [-1, 1] (img_reader: rtype)
    '''
    dirs = validate_log_dirs(args)

    coord = tf.train.Coordinator()

    with open(args.architecture) as f:
        arch = json.load(f)

    imgs, info = img_reader(datadir=args.datadir,
                            img_dims=arch['hwc'],
                            batch_size=args.batch_size,
                            rtype='tanh')

    machine = VAEGAN(arch, is_training=True)

    loss = machine.loss(imgs)
    xh = machine.sample(args.batch_size)

    x_interp = machine.interpolate(imgs[0], imgs[1], N_INTERP)

    opt_d, opt_g, opt_e = get_optimization_ops(loss, args, arch['mode'])

    # # ========== For embedding =============
    # h, w, c = arch['hwc']
    # img4em = tf.Variable(
    #     np.reshape(
    #         np.fromfile(
    #             SPRITE_NUMPY_FILE, np.float32),
    #             [N_VISUALIZE, h, w, c]),
    #     name='emb_input_img')
    # codes = machine.encode(img4em)
    # em_var = tf.Variable(
    #     tf.zeros((N_VISUALIZE, arch['z_dim'])),
    #     name='embeddings')
    # # ======================================

    writer = tf.train.SummaryWriter(dirs['logdir'])
    writer.add_graph(tf.get_default_graph())

    summary_op = tf.merge_all_summaries()

    with open(os.path.join(dirs['logdir'], args.architecture), 'w') as f:
        json.dump(arch, f)

    if args.gpu_cfg:
        with open(args.gpu_cfg) as f:
            cfg = json.load(f)
        gpu_options = tf.GPUOptions(per_process_gpu_memory_fraction=cfg[
            'per_process_gpu_memory_fraction'])
        session_conf = tf.ConfigProto(
            allow_soft_placement=cfg['allow_soft_placement'],
            log_device_placement=cfg['log_device_placement'],
            inter_op_parallelism_threads=cfg['inter_op_parallelism_threads'],
            intra_op_parallelism_threads=cfg['intra_op_parallelism_threads'],
            gpu_options=gpu_options)
        sess = tf.Session(config=session_conf)
    else:
        sess = tf.Session()

    init = tf.global_variables_initializer()
    sess.run(init)

    saver = tf.train.Saver()  # tf.global_variables()
    try:
        saved_global_step = load(saver, sess, dirs['restore_from'])
        if saved_global_step is None:
            saved_global_step = -1
    except Exception:
        print("Something's wrong while restoring checkpoints!")
        raise

    threads = tf.train.start_queue_runners(sess=sess, coord=coord)

    # # ========== For embedding =============
    # ass_op = tf.assign(em_var, codes['mu'], name='X/em_var')

    # config = projector.ProjectorConfig()
    # embedding = config.embeddings.add()
    # embedding.tensor_name = em_var.name
    # print(em_var.name, em_var.get_shape())
    # embedding.sprite.image_path = PATH_TO_SPRITE_IMAGE
    # embedding.sprite.single_image_dim.extend([w, h])
    # embedding.metadata_path = PATH_TO_LABEL
    # projector.visualize_embeddings(writer, config)
    # # =====================================

    # ========== Actual training loop ==========
    try:
        n_iter_per_epoch = info['n_files'] // args.batch_size
        time_i = time.time()
        step = 0
        for ep in range(args.n_epoch):
            for it in range(n_iter_per_epoch):
                _, l_df, l_dr = sess.run(
                    [opt_d, loss['D_fake'], loss['D_real']])

                # Update G twice
                _, l_g = sess.run([opt_g, loss['G_fake']])
                _, l_g = sess.run([opt_g, loss['G_fake']])
                if arch['mode'] == 'VAE-GAN':
                    _, l_e, l_dis = sess.run(
                        [opt_e, loss['KL(z)'], loss['Dis']])

                # Message
                msg = 'Epoch [{:3d}/{:3d}] '.format(ep + 1, args.n_epoch)\
                    + '[{:4d}/{:4d}] '.format(it + 1, n_iter_per_epoch)\
                    + 'd_loss={:6.3f}+{:6.3f}, '.format(l_df, l_dr)\
                    + 'g_loss={:5.2f}, '.format(l_g)

                if arch['mode'] == 'VAE-GAN':
                    msg += 'KLD={:6.3f}, DIS={:6.3f}, '.format(l_e, l_dis)

                msg += 'T={:.2f}'.format(time.time() - time_i)
                print(msg)

                # writer.add_summary(summary, step)

                # Demo/Output
                if it % (n_iter_per_epoch // 1) == 0:
                    summary = sess.run(summary_op)
                    writer.add_summary(summary, step)

                    if arch['mode'] == 'VAE-GAN':
                        visualize_interpolation(
                            sess,
                            x_interp,
                            filename=os.path.join(
                                dirs['logdir'],
                                'test-Ep{:03d}-It{:04d}.png'.format(ep, it)))
                        # sess.run(ass_op)

                    visualize_random_samples(
                        sess,
                        xh,
                        filename=os.path.join(
                            dirs['logdir'],
                            'test-Ep{:03d}-It{:04d}-dc.png'.format(ep, it)))

                    save(saver, sess, dirs['logdir'], step)

                step += 1

    except KeyboardInterrupt:
        print()

    finally:
        save(saver, sess, dirs['logdir'], step)
        coord.request_stop()
        coord.join(threads)
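
The `--gpu_cfg` branch above expects a JSON file with the five keys read in the snippet. A sketch that writes such a file (the filename and the values are illustrative):

import json

cfg = {
    'per_process_gpu_memory_fraction': 0.5,
    'allow_soft_placement': True,
    'log_device_placement': False,
    'inter_op_parallelism_threads': 2,
    'intra_op_parallelism_threads': 4,
}
with open('gpu.json', 'w') as f:
    json.dump(cfg, f, indent=2)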
Example #8
  def train(self, data):
    hyperp = self.arch['training']

    loss = self.loss(data.x, data.y)
    opt = self._optimize(loss)

    Z = self._Enc(self._D2A(data.x))
    update_encoding = tf.assign(self.encodings, self.encoding_placeholder)


    K, D = self.arch['num_exemplar'], self.arch['dim_exemplar']
    z_emp = tf.placeholder(tf.float32, [K, D], 'z_emp')
    init_z_emb = tf.assign(self.z_emb, z_emp)

    ema = opt['ema']
    trg_vars = {ema.average_name(v): v for v in tf.trainable_variables()}

    saver = tf.train.Saver(trg_vars)

    sess_config = tf.ConfigProto(
      allow_soft_placement=True,
      gpu_options=tf.GPUOptions(allow_growth=True)
    )
    scaffold = tf.train.Scaffold(
      local_init_op=tf.group(
        tf.local_variables_initializer(),
        data.iterator.initializer,
        tf.tables_initializer()
      )
    )
    with tf.train.MonitoredTrainingSession(
      scaffold=scaffold,
      checkpoint_dir=self.arch['logdir'],
      save_checkpoint_secs=360,
      save_summaries_secs=120,
      config=sess_config,
    ) as sess:
      dummy_path = self._make_dummy_tsv()
      visualize_embeddings(
        logdir=self.arch['logdir'],
        var_list=[self.y_emb, self.encodings],
        tsv_list=['etc/speakers_label.tsv', dummy_path],
      )

      if self.arch['restore_from']:
        load(saver, sess, self.arch['restore_from'], ckpt=self.arch['ckpt'])

      # ========== Initialize exemplars with Enc output  ==========
      multiplier = 100
      exe = np.zeros([0, D])
      while exe.shape[0] < K * multiplier:
        z = sess.run(Z)
        exe = np.concatenate([exe, np.reshape(z, [-1, D])], 0)
      exe = np.unique(exe, axis=0)  # assign back: np.unique does not act in place
      np.random.shuffle(exe)
      sess.run(init_z_emb, feed_dict={z_emp: exe[:K, :]})

      # ========== Main training loop ==========
      maxIter = hyperp['maxIter']
      for it in range(maxIter):
        sess.run(opt['trn'])
        if it % hyperp['refresh_freq'] == 1:
          self._get_and_update_encodings(sess, Z, data.y, update_encoding)
          fetches = {'l': loss['reconst']}
          results = sess.run(fetches)
          print('\rIter {:5d}: loss = {:.4e}'.format(it, results['l']), end='')
      print()
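
Examples #3, #5, and #8 all restore weights through the same exponential-moving-average trick: the Saver's keys are the EMA shadow names, so `restore` loads the smoothed weights into the live variables. The pattern in isolation (a sketch; `sess` and `ckpt_path` are assumed to exist):

import tensorflow as tf

ema = tf.train.ExponentialMovingAverage(decay=0.995)
# Map 'var_name/ExponentialMovingAverage' -> live variable, so the checkpoint's
# smoothed values land in the live variables at restore time.
ema_saver = tf.train.Saver(
    {ema.average_name(v): v for v in tf.trainable_variables()})
# ema_saver.restore(sess, ckpt_path)  # ckpt_path: hypothetical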