def main(_):
    """Decode every clip matched by ``args.file_pattern`` and save it as WAV.

    A single pass (``num_epoch=1``) is made over the files. For each clip the
    mu-law codes are decoded, encoded as a WAV binary at 16000 samples per
    second and written to ``<args.output_dir>/<name>.wav``. A row
    ``name, length`` is also written to ``vctk.csv`` in the current working
    directory, where ``length`` is the size of the decoded waveform's first
    axis.

    Args:
        _: Unused positional argument.
    """
    # MakeDirs also creates missing parents and accepts an existing directory,
    # so re-running the script into the same output directory keeps working.
    tf.gfile.MakeDirs(args.output_dir)

    data = ByteWavWholeReader(
        speaker_list=txt2list(args.speaker_list),
        filenames=tf.gfile.Glob(args.file_pattern),
        num_epoch=1)

    XNOM = data.f[0]
    XWAV = tf.expand_dims(mu_law_decode(data.x[0, :]), -1)
    XBIN = tf.contrib.ffmpeg.encode_audio(XWAV, 'wav', 16000)

    # The fetch dictionary never changes, so build it once outside the loop.
    fetch = {'xbin': XBIN, 'xwav': XWAV, 'wav_name': XNOM}

    sess_config = tf.ConfigProto(
        allow_soft_placement=True,
        gpu_options=tf.GPUOptions(allow_growth=True))

    with tf.Session(config=sess_config) as sess:
        sess.run(tf.tables_initializer())
        sess.run(data.iterator.initializer)

        # The context manager closes the index file even when decoding or
        # writing a clip raises something other than OutOfRangeError.
        with open('vctk.csv', 'w') as index_file:
            counter = 1
            while True:
                try:
                    result = sess.run(fetch)
                except tf.errors.OutOfRangeError:
                    # The one-epoch iterator is exhausted.
                    print('\nEpoch complete')
                    break

                wav_name = result['wav_name'].decode('utf8')
                print('\rFile {:05d}: Processing {}'.format(counter, wav_name),
                      end='')
                index_file.write(
                    '{}, {:d}\n'.format(wav_name, len(result['xwav'])))

                filename = os.path.join(args.output_dir, wav_name) + '.wav'
                with open(filename, 'wb') as fp:
                    fp.write(result['xbin'])
                counter += 1
            print()
def main(unused_args):
    """Convert every audio file matched by ``args.file_pattern`` to a TFRecord.

    NOTE: the directory structure must be
    [args.dir_to_wav]/[Set]/[speakers]

    For each file the waveform is loaded with ``librosa.load``, mu-law encoded
    and serialized, together with its speaker id, its text and its base name,
    into ``<args.output_dir>/<speaker>/<basename>.tfr``. The speaker id is the
    part of the base name before the first underscore.

    Args:
        unused_args: Unused positional argument.

    Raises:
        ValueError: If ``args.output_dir`` is empty or not set.
    """
    if not args.output_dir:
        raise ValueError('`output_dir` (output dir) should be specified')

    print('[WARNING] Protobuf is super slow (~7 examples per sec). \n'
          'This could take 2 hours or more.')

    reader = tf.WholeFileReader()
    files = tf.gfile.Glob(args.file_pattern)
    filename_queue = tf.train.string_input_producer(
        files, num_epochs=1, shuffle=False)
    # Only the key (the file name) is fetched; the raw file content returned
    # by the reader is not needed because librosa re-reads the file from disk.
    key, _ = reader.read(filename_queue)

    for speaker in txt2list(args.speaker_list):
        tf.gfile.MakeDirs(join(args.output_dir, speaker))

    counter = 1
    N = len(files)
    with tf.train.MonitoredSession() as sess:
        while not sess.should_stop():
            filename = sess.run(key).decode('utf8')

            # NOTE(review): librosa.load is called with its default sampling
            # rate; confirm that this matches the rate the model expects.
            waveform, _sampling_rate = librosa.load(filename)
            x_int = mu_law_encode(waveform)

            text = read_text(filename)
            stem, _ext = splitext(filename)
            _parent, basename = split(stem)
            speaker = basename.split('_')[0]

            ex = make_mu_law_speaker_length(x_int, speaker, text, basename)

            # The context manager closes the record file even if write() fails.
            record_path = join(
                args.output_dir, speaker, '{}.tfr'.format(basename))
            with tf.python_io.TFRecordWriter(record_path) as writer:
                writer.write(ex.SerializeToString())

            print('\rFile {:5d}/{:5d}: {}'.format(counter, N, basename),
                  end='')
            counter += 1
    print()
def main(unused_args):
    """Build a usage histogram of the exemplar indices emitted by the encoder.

    The architecture is read from the first ``arch*.json`` found in
    ``args.logdir`` and the weights are restored from a checkpoint under
    their exponential-moving-average names. Every batch from the reader is
    encoded, and each index in the first row of the result increments its bin.
    The counts are dumped to ``histogram.npf`` and plotted on a log10 scale
    into ``histogram.png``.

    Args:
        unused_args: Unused positional argument.

    Raises:
        ValueError: If ``args.logdir`` is ``None``.
    """
    if args.logdir is None:
        raise ValueError('Please specify the dir to the checkpoint')

    arch_file = tf.gfile.Glob(join(args.logdir, 'arch*.json'))[0]
    arch = json2dict(arch_file)
    net = VQVAE(arch)

    speakers = txt2list(args.speaker_list)
    wav_files = tf.gfile.Glob(args.file_pattern)
    data = ByteWavWholeReader(speaker_list=speakers, filenames=wav_files)

    ZH = net.encode(data.x, args.mode)

    # Restore each trainable variable from its moving-average shadow name.
    ema = tf.train.ExponentialMovingAverage(decay=0.995)
    trg_vars = {}
    for variable in tf.trainable_variables():
        trg_vars[ema.average_name(variable)] = variable
    saver = tf.train.Saver(trg_vars)

    gpu_options = tf.GPUOptions(allow_growth=True)
    sess_config = tf.ConfigProto(
        allow_soft_placement=True, gpu_options=gpu_options)

    with tf.Session(config=sess_config) as sess:
        for init_op in (tf.tables_initializer(), data.iterator.initializer):
            sess.run(init_op)
        sess.run(tf.global_variables_initializer())
        load(saver, sess, args.logdir, ckpt=args.ckpt)

        num_bins = arch['num_exemplar']
        hist = np.zeros([num_bins], dtype=np.int64)

        processed = 0
        while True:
            try:
                z_ids = sess.run(ZH)
            except tf.errors.OutOfRangeError:
                # The reader is exhausted: finish the progress line and stop.
                print()
                break
            processed += 1
            print('\rNum of processed files: {:d}'.format(processed), end='')
            # Only the first row is counted (the batch size is 1).
            for exemplar in z_ids[0]:
                hist[exemplar] += 1

    with open('histogram.npf', 'wb') as fp:
        hist.tofile(fp)

    plt.figure(figsize=[10, 2])
    log_counts = np.log10(hist + 1)
    plt.plot(log_counts, '.')
    plt.xlim([0, arch['num_exemplar'] - 1])
    plt.ylabel('log-frequency')
    plt.xlabel('exemplar index')
    plt.savefig('histogram.png')
    plt.close()
def main(_):
    """Train a VQVAE configured from ``args.arch`` on the matched files.

    The architecture dictionary is extended with the validated log
    directories and the checkpoint setting, the architecture file is copied
    into the log directory, the receptive field is printed, and training is
    started on a ``ByteWavReader``.

    Args:
        _: Unused positional argument.
    """
    speakers = txt2list(args.speaker_list)
    log_dirs = validate_log_dirs(args)

    arch = json2dict(args.arch)
    arch.update(log_dirs)
    arch['ckpt'] = args.ckpt
    copy_arch_file(args.arch, arch['logdir'])

    net = VQVAE(arch)
    receptive_field = net.n_padding()
    seconds = receptive_field / arch['fs']
    message = 'Receptive field: {} samples ({:.2f} sec)\n'
    print(message.format(receptive_field, seconds))

    reader_options = {
        'T': arch['T'],
        'batch_size': arch['training']['batch_size'],
        'buffer_size': 5000,
    }
    data = ByteWavReader(speakers, args.file_pattern, **reader_options)

    net.train(data)
def main(_):
    """Set up the model and its input pipeline from ``args``, then train.

    Args:
        _: Unused positional argument.
    """
    # Gather the configuration: speakers, log directories and architecture.
    speaker_list = txt2list(args.speaker_list)
    dirs = validate_log_dirs(args)
    arch = json2dict(args.arch)
    arch.update(dirs)
    arch.update(ckpt=args.ckpt)

    # Keep a copy of the architecture file next to the training logs.
    logdir = arch['logdir']
    copy_arch_file(args.arch, logdir)

    # Build the network and report its receptive field.
    net = VQVAE(arch)
    P = net.n_padding()
    duration = P / arch['fs']
    print('Receptive field: {} samples ({:.2f} sec)'.format(P, duration))

    # Input pipeline over the files selected on the command line.
    segment_length = arch['T']
    batch_size = arch['training']['batch_size']
    data = ByteWavReader(
        speaker_list,
        args.file_pattern,
        T=segment_length,
        batch_size=batch_size,
        buffer_size=5000,
    )

    # Hand the data over to the model's training loop.
    net.train(data)
def main(unused_args):
    """Generate audio sample by sample from two hard-coded TFRecord clips.

    The architecture is read from the first ``arch*.json`` in ``args.logdir``.
    Two clips are read, cropped to a fixed window and stacked into a batch of
    four rows (clip 0, clip 0, clip 1, clip 1) paired with the ``y`` values of
    clips 0, 1, 1, 0. The encoder output for the whole batch is computed once;
    generation then slides a window of ``net.n_padding() + 1`` samples, feeding
    each newly generated sample back as input. The decoded results are written
    to ``testwav-<i>.wav`` in the current working directory. When
    ``args.period`` is positive, the whole process repeats after sleeping
    ``args.period * 60`` seconds, until interrupted.

    Args:
        unused_args: Unused positional argument.

    Raises:
        ValueError: If ``args.logdir`` is ``None``.
    """
    if args.logdir is None:
        raise ValueError('Please specify the dir to the checkpoint')
    speaker_list = txt2list(args.speaker_list)
    # Use the first architecture file found in the log directory.
    arch = tf.gfile.Glob(os.path.join(args.logdir, 'arch*.json'))[0]
    arch = json2dict(arch)
    net = VQVAE(arch)
    # they start roughly at the same position but end very differently (3 is longest)
    filenames = [
        'dataset/VCTK/tfr/p227/p227_363.tfr',
        # 'dataset/VCTK/tfr/p240/p240_341.tfr',
        # 'dataset/VCTK/tfr/p243/p243_359.tfr',
        'dataset/VCTK/tfr/p225/p225_001.tfr'
    ]
    data = ByteWavWholeReader(speaker_list, filenames)
    # Placeholders so the encoder/generator can be fed NumPy arrays directly
    # instead of reading from the dataset iterator.
    X = tf.placeholder(dtype=tf.int64, shape=[None, None])
    Y = tf.placeholder(dtype=tf.int64, shape=[
        None,
    ])
    ZH = net.encode(X, args.mode)
    XH = net.generate(X, ZH, Y)
    # XWAV = mu_law_decode(X)
    # XBIN = tf.contrib.ffmpeg.encode_audio(XWAV, 'wav', arch['fs'])
    # Restore each trainable variable from its moving-average shadow name.
    ema = tf.train.ExponentialMovingAverage(decay=0.995)
    trg_vars = {ema.average_name(v): v for v in tf.trainable_variables()}
    saver = tf.train.Saver(trg_vars)
    # NOTE(review): `logdir` is created but the active code writes the WAV
    # files to the working directory; only commented-out code used it.
    logdir = get_default_logdir(args.logdir)
    tf.gfile.MkDir(logdir)
    sess_config = tf.ConfigProto(allow_soft_placement=True,
                                 gpu_options=tf.GPUOptions(allow_growth=True))
    with tf.Session(config=sess_config) as sess:
        sess.run(tf.tables_initializer())
        sess.run(data.iterator.initializer)
        # Pull one (x, y) example from the reader per listed file.
        results = []
        for _ in filenames:
            result = sess.run({'x': data.x, 'y': data.y})
            results.append(result)
        # results1 = sess.run({'x': data.x, 'y': data.y})
        # results2 = sess.run({'x': data.x, 'y': data.y})
        length_input = net.n_padding() + 1  # same as padding + 1
        # Hard-coded crop window; `ini` is moved back by one input window so
        # that generation starts at sample 15149.
        ini = 15149 - length_input
        end = 42285
        # x_source1 = results1['x'][:, ini: end]
        # x_source2 = results2['x'][:, ini: end]
        for i in range(len(results)):
            x = results[i]['x']
            if x.shape[-1] < end:
                # Pad short clips up to `end` by repeating the first sample.
                # NOTE(review): np.zeros defaults to float, so the padded
                # array is no longer int64 -- confirm this is acceptable for
                # the int64 placeholder X.
                x = np.concatenate(
                    [x, x[0, 0] + np.zeros([1, end - x.shape[-1]])], -1)
            results[i]['x'] = x[:, ini:end]
        # from pdb import set_trace
        # set_trace()
        # Batch of four: each source clip appears twice ...
        x_source = np.concatenate([
            results[0]['x'], results[0]['x'], results[1]['x'], results[1]['x']
        ], 0)
        B = x_source.shape[0]
        # ... once with its own `y` (rows 0 and 2) and once with the other
        # clip's `y` (rows 1 and 3).
        y_input = np.concatenate([
            results[0]['y'], results[1]['y'], results[1]['y'], results[0]['y']
        ], 0)
        length_target = x_source.shape[1] - length_input
        while True:
            # Re-initialize and reload on every pass; with periodic generation
            # this presumably picks up a newer checkpoint -- TODO confirm what
            # `load` does when args.ckpt is unset.
            # NOTE(review): a new initializer op is built on each pass.
            sess.run(tf.global_variables_initializer())
            load(saver, sess, args.logdir, ckpt=args.ckpt)
            # Encode the whole batch once; windows of it are fed back below.
            z_blend = sess.run(ZH, feed_dict={X: x_source})
            # NOTE(review): basic slicing yields a view, so the in-place
            # shifting of `x_input` below also overwrites `x_source`; a later
            # periodic pass therefore encodes the modified array.
            x_input = x_source[:, :length_input]
            z_input = z_blend[:, :length_input, :]
            # Generate
            try:
                x_gen = np.zeros([B, length_target], dtype=np.int64)  # + results['x'][0, 0]
                for i in range(length_target):
                    xh = sess.run(XH, feed_dict={
                        X: x_input,
                        ZH: z_input,
                        Y: y_input
                    })
                    # Advance the code window by one step, shift the input
                    # window left and append the newly generated sample.
                    z_input = z_blend[:, i + 1:i + 1 + length_input, :]
                    x_input[:, :-1] = x_input[:, 1:]
                    x_input[:, -1] = xh[:, -1]
                    x_gen[:, i] = xh[:, -1]
                    print('\rGenerating {:5d}/{:5d}... x={:3d}'.format(
                        i + 1, length_target, xh[0, -1]), end='', flush=True)
            except KeyboardInterrupt:
                # Ctrl-C stops generation early; what was generated so far is
                # still written out below.
                print("Interrupted by the user.")
            finally:
                print()
                x_wav = mu_law_decode(x_gen)
                # One mono file per batch row.
                for i in range(x_wav.shape[0]):
                    x_1ch = np.expand_dims(x_wav[i], -1)
                    # x_bin = sess.run(XBIN, feed_dict={X: x_1ch})
                    librosa.output.write_wav('testwav-{}.wav'.format(i), x_1ch, arch['fs'])
                    # with open(os.path.join(logdir, 'testwav-{}.wav'.format(i)), 'wb') as fp:
                    #     fp.write(x_bin)
            # For periodic gen.
            if args.period > 0:
                try:
                    print('Sleep for a while')
                    # args.period is multiplied by 60 before sleeping.
                    sleep(args.period * 60)
                    logdir = get_default_logdir(args.logdir)
                    tf.gfile.MkDir(logdir)
                except KeyboardInterrupt:
                    print('Stop periodic gen.')
                    break
                finally:
                    # Runs after every sleep cycle, not only on the last one.
                    print('all finished')
            else:
                break