def build_wavenet(batch_size=1, sample_length=64000):
    """Build the network in inference mode on top of a raw-audio placeholder.

    Args:
      batch_size: First dimension of the input placeholder. [1]
      sample_length: Second dimension of the input placeholder. [64000]

    Returns:
      The object returned by `Config().build` (built with
      `is_training=False`), updated so that key "X" maps to the float32
      input placeholder.
    """
    model_config = Config()
    wav_input = tf.placeholder(tf.float32, shape=[batch_size, sample_length])
    net = model_config.build({"wav": wav_input}, is_training=False)
    # Expose the placeholder so callers can feed audio through net["X"].
    net.update({"X": wav_input})
    return net
def load_style_nsynth(initial):
    """Load the NSynth autoencoder network for stylizing.

    Unlike the placeholder-based loaders, the network input here is a
    `tf.Variable` initialised from `initial`.

    Args:
      initial: Array with a `reshape` method; it is reshaped to [1, -1, 1]
        and used as the initial value of the input variable.

    Returns:
      The network built by `Config().build` (with `is_training=False`),
      updated so that key "X" maps to the input variable.
    """
    model_config = Config()
    with tf.device("/gpu:0"):
        # Layout after the reshape: [batch_size, length, channel].
        wav_variable = tf.Variable(initial.reshape([1, -1, 1]))
        net = model_config.build({"wav": wav_variable}, is_training=False)
        net.update({"X": wav_variable})
    return net
def load_nsynth(batch_size=1, sample_length=40000):
    """Load the NSynth autoencoder network.

    Prints two progress messages to stdout while building the graph.

    Args:
      batch_size: First dimension of the input placeholder. [1]
      sample_length: Second dimension of the input placeholder. [40000]

    Returns:
      The network built by `Config().build` (with `is_training=False`),
      updated so that key "X" maps to the float32 input placeholder.
    """
    model_config = Config()
    print("Inside load_nsynth function")
    with tf.device("/device:GPU:0"):
        print("Loading nsynth")
        wav_input = tf.placeholder(
            tf.float32, shape=[batch_size, sample_length])
        net = model_config.build({"wav": wav_input}, is_training=False)
        net.update({"X": wav_input})
    return net
def load_nsynth(batch_size=1, sample_length=64000):
    """Build the NSynth autoencoder graph for inference.

    Args:
      batch_size: Number of observations processed at once. [1]
      sample_length: Number of samples in each input audio clip. [64000]

    Returns:
      The network as a dict; the float32 input placeholder of shape
      [batch_size, sample_length] is stored under key "X".
    """
    model_config = Config()
    with tf.device("/gpu:0"):
        wav_input = tf.placeholder(
            tf.float32, shape=[batch_size, sample_length])
        net = model_config.build({"wav": wav_input}, is_training=False)
        net.update({"X": wav_input})
    return net
def encode(wav_data, checkpoint_path, sample_length=64000):
    """Encode audio with the pretrained NSynth autoencoder.

    Args:
      wav_data: Numpy array of audio, either [sample_length] for a single
        clip or [batch_size, sample_length] for a batch.
      checkpoint_path: Location of the pretrained model.
      sample_length: Requested number of samples per clip;
        `utils.trim_for_encoding` may adjust it (together with `wav_data`)
        before the network is built. [64000]

    Returns:
      encoding: a [mb, 125, 16] encoding (for 64000 sample audio file).
      hop_length: Pooling size of the autoencoder.

    Raises:
      ValueError: If `wav_data` is neither 1-D nor 2-D.
    """
    # Validate the rank up front: previously any other rank left batch_size
    # unassigned and surfaced later as an UnboundLocalError.
    if wav_data.ndim == 1:
        wav_data = np.expand_dims(wav_data, 0)
    elif wav_data.ndim != 2:
        raise ValueError(
            "wav_data must be 1-D or 2-D, got {} dimensions".format(
                wav_data.ndim))
    batch_size = wav_data.shape[0]

    # Load up the model for encoding and find the encoding of "wav_data".
    session_config = tf.ConfigProto(allow_soft_placement=True)
    with tf.Graph().as_default(), tf.Session(config=session_config) as sess:
        hop_length = Config().ae_hop_length
        wav_data, sample_length = utils.trim_for_encoding(
            wav_data, sample_length, hop_length)
        net = load_nsynth(batch_size=batch_size, sample_length=sample_length)
        saver = tf.train.Saver()
        saver.restore(sess, checkpoint_path)
        encoding = sess.run(net["encoding"], feed_dict={net["X"]: wav_data})
    return encoding, hop_length
def encode(wav_data, checkpoint_path, sample_length=64000):
    """Generate an array of embeddings from an array of audio.

    Args:
      wav_data: Numpy array of audio, either [sample_length] for a single
        clip or [batch_size, sample_length] for a batch.
      checkpoint_path: Location of the pretrained model.
      sample_length: Requested number of samples per clip;
        `trim_for_encoding` may adjust it (together with `wav_data`)
        before the network is built. [64000]

    Returns:
      The value of the network's "encoding" tensor for `wav_data`.

    Raises:
      ValueError: If `wav_data` is neither 1-D nor 2-D.
    """
    # Validate the rank up front: previously any other rank left batch_size
    # unassigned and surfaced later as an UnboundLocalError.
    if wav_data.ndim == 1:
        wav_data = np.expand_dims(wav_data, 0)
    elif wav_data.ndim != 2:
        raise ValueError(
            "wav_data must be 1-D or 2-D, got {} dimensions".format(
                wav_data.ndim))
    batch_size = wav_data.shape[0]

    session_config = tf.ConfigProto(allow_soft_placement=True)
    session_config.gpu_options.allow_growth = True
    # NOTE: the setting above is a poor choice because it will consume all of
    # the GPU memory when needed; capping the share is preferable, e.g.
    # gpu_options.per_process_gpu_memory_fraction = 0.4.
    with tf.Graph().as_default(), tf.Session(config=session_config) as sess:
        hop_length = Config().ae_hop_length  # hop_length is the pooling size
        wav_data, sample_length = trim_for_encoding(
            wav_data, sample_length, hop_length)
        net = load_nsynth(batch_size=batch_size, sample_length=sample_length)
        saver = tf.train.Saver()
        saver.restore(sess, checkpoint_path)
        encodings = sess.run(net["encoding"], feed_dict={net["X"]: wav_data})
    return encodings
def synthesize(encodings,
               save_paths,
               checkpoint_path="model.ckpt-200000",
               samples_per_save=10000,
               batch_i=None,
               total_batches=None):
    """Synthesize audio from an array of encodings.

    Audio is generated one sample at a time; progress is logged every 100
    samples and partial results are saved every `samples_per_save` samples.

    Args:
      encodings: Numpy array with shape [batch_size, time, dim].
      save_paths: Iterable of output file names. Periodic saves are skipped
        when it is empty; the final save is always attempted.
      checkpoint_path: Location of the pretrained model. [model.ckpt-200000]
      samples_per_save: Save files after every amount of generated samples.
      batch_i: Optional index of this batch, reported as `batch_i + 1` in
        the progress log. Only used for logging. [None]
      total_batches: Optional total number of batches. Only used for
        logging. [None]
    """
    session_config = tf.ConfigProto(allow_soft_placement=True)
    session_config.gpu_options.allow_growth = True
    with tf.Graph().as_default(), tf.Session(config=session_config) as sess:
        net = load_fastgen_nsynth(batch_size=encodings.shape[0])
        saver = tf.train.Saver()
        saver.restore(sess, checkpoint_path)

        # Get lengths
        batch_size, encoding_length, _ = encodings.shape
        hop_length = Config().ae_hop_length
        total_length = encoding_length * hop_length

        # The batch part of the progress message never changes inside the
        # sample loop, so build it once here.
        batch_offset = batch_i if batch_i is not None else 0
        batch_count = total_batches if total_batches is not None else 1
        if batch_i is not None or total_batches is not None:
            batch_label = "Batch: {}/{} - ".format(
                (batch_i + 1) if batch_i is not None else "?",
                total_batches if total_batches is not None else "?")
        else:
            batch_label = ""

        # initialize queues w/ 0s
        sess.run(net["init_ops"])

        # Regenerate the audio file sample by sample
        audio_batch = np.zeros((batch_size, total_length), dtype=np.float32)
        audio = np.zeros([batch_size, 1])

        for sample_i in range(total_length):
            encoding_i = sample_i // hop_length
            audio = generate_audio_sample(sess, net, audio,
                                          encodings[:, encoding_i, :])
            audio_batch[:, sample_i] = audio[:, 0]
            if sample_i % 100 == 0:
                # Overall progress across batches, as a fraction in [0, 1]
                # when both batch_i and total_batches are supplied.
                progress = (batch_offset +
                            float(sample_i + 1) / total_length) / batch_count
                tf.logging.info("{:.1f}% - {}Sample: {}/{}".format(
                    progress * 100, batch_label, sample_i + 1, total_length))
            if sample_i % samples_per_save == 0 and save_paths:
                save_batch(audio_batch, save_paths)

        save_batch(audio_batch, save_paths)
def synthesize(encodings,
               save_paths,
               checkpoint_path="model.ckpt-200000",
               samples_per_save=1000):
    """Generate audio, one sample at a time, from a batch of embeddings.

    Progress is logged every 100 samples, and the partially generated batch
    is written out every `samples_per_save` samples and once more at the end.

    Args:
      encodings: Numpy array with shape [batch_size, time, dim].
      save_paths: Iterable of output file names.
      checkpoint_path: Location of the pretrained model. [model.ckpt-200000]
      samples_per_save: Number of generated samples between saves. [1000]
    """
    hop_length = Config().ae_hop_length

    # Each encoding frame is held for hop_length output samples.
    batch_size = encodings.shape[0]
    n_frames = encodings.shape[1]
    total_length = n_frames * hop_length

    session_config = tf.ConfigProto(allow_soft_placement=True)
    session_config.gpu_options.allow_growth = True
    with tf.Graph().as_default(), tf.Session(config=session_config) as sess:
        net = load_fastgen_nsynth(batch_size=batch_size)
        tf.train.Saver().restore(sess, checkpoint_path)

        # Initialize queues with zeros.
        sess.run(net["init_ops"])

        audio_batch = np.zeros((batch_size, total_length), dtype=np.float32)
        audio = np.zeros([batch_size, 1])
        fetches = [net["predictions"], net["push_ops"]]

        for sample_i in range(total_length):
            frame = encodings[:, sample_i // hop_length, :]
            feed = {net["X"]: audio, net["encoding"]: frame}
            # Only the predictions are needed; push_ops are run for their
            # side effects.
            pmf = sess.run(fetches, feed_dict=feed)[0]
            audio = utils.inv_mu_law_numpy(sample_categorical(pmf) - 128)
            audio_batch[:, sample_i] = audio[:, 0]

            if sample_i % 100 == 0:
                tf.logging.info("Sample: %d" % sample_i)
            if sample_i % samples_per_save == 0:
                save_batch(audio_batch, save_paths)

        save_batch(audio_batch, save_paths)
def synthesize(encodings, save_paths, checkpoint_path, samples_per_save=1000):
    """Generate audio, one sample at a time, from a batch of encodings.

    Progress is printed every 100 samples, and the partially generated batch
    is saved every `samples_per_save` samples and once more at the end.

    Args:
      encodings: Array indexed as [batch, time, dim]; each time step is held
        for `Config().ae_hop_length` output samples.
      save_paths: Output destinations, passed through to `save_batch`.
      checkpoint_path: Checkpoint restored into the generation graph.
      samples_per_save: Number of generated samples between saves. [1000]
    """
    hop_length = Config().ae_hop_length

    batch_size = encodings.shape[0]
    n_frames = encodings.shape[1]
    total_length = n_frames * hop_length

    session_config = tf.ConfigProto(allow_soft_placement=True)
    session_config.gpu_options.allow_growth = True
    with tf.Graph().as_default(), tf.Session(config=session_config) as sess:
        net = load_fastgen_nsynth(batch_size=batch_size)
        tf.train.Saver().restore(sess, checkpoint_path)
        sess.run(net["init_ops"])

        audio_batch = np.zeros((batch_size, total_length), dtype=np.float32)
        audio = np.zeros([batch_size, 1])
        fetches = [net["predictions"], net["push_ops"]]

        for sample_i in range(total_length):
            frame = encodings[:, sample_i // hop_length, :]
            feed = {net["X"]: audio, net["encoding"]: frame}
            # Only the predictions are needed; push_ops are run for their
            # side effects.
            pmf = sess.run(fetches, feed_dict=feed)[0]
            audio = inv_mu_law_numpy(sample_categorical(pmf) - 128)
            audio_batch[:, sample_i] = audio[:, 0]

            if sample_i % 100 == 0:
                print("Sample: {}".format(sample_i))
            if sample_i % samples_per_save == 0:
                save_batch(audio_batch, save_paths)

        save_batch(audio_batch, save_paths)
def synthesize(source_file,
               checkpoint_path="model.ckpt-200000",
               out_file="synthesis.wav",
               sample_length=64000,
               samples_per_save=1000):
    """Resynthesize a single audio file, or a saved encoding, to a wave file.

    A ".npy" source is loaded as a ready-made encoding. A ".wav" source is
    loaded at 16 kHz and encoded with the model at `checkpoint_path` first.
    Output is generated sample by sample and written to `out_file` every
    `samples_per_save` samples and once more when generation finishes.

    Args:
      source_file: Location of a wave or .npy file to load.
      checkpoint_path: Location of the pretrained model. [model.ckpt-200000]
      out_file: Location to save the synthesized wave file. [synthesis.wav]
      sample_length: Number of samples to load from a wave source. [64000]
      samples_per_save: Save a .wav after every amount of samples. [1000]

    Raises:
      RuntimeError: Source_file should be .wav or .npy.
    """
    from_npy = source_file.endswith(".npy")
    if not from_npy and not source_file.endswith(".wav"):
        raise RuntimeError("File must be .wav or .npy")

    if from_npy:
        encoding = np.load(source_file)
        hop_length = Config().ae_hop_length
    else:
        # Audio to resynthesize, then its encoding from the pretrained model.
        wav_data = utils.load_audio(source_file, sample_length, sr=16000)
        encoding, hop_length = encode(
            wav_data, checkpoint_path, sample_length=sample_length)
        # Drop the batch dimension of a freshly computed encoding.
        if encoding.ndim == 3:
            encoding = encoding[0]

    # Each encoding frame is held for hop_length output samples.
    n_frames = encoding.shape[0]
    total_length = n_frames * hop_length

    session_config = tf.ConfigProto(allow_soft_placement=True)
    with tf.Graph().as_default(), tf.Session(config=session_config) as sess:
        net = load_fastgen_nsynth()
        tf.train.Saver().restore(sess, checkpoint_path)

        # Initialize queues with zeros.
        sess.run(net["init_ops"])

        wav_synth = np.zeros((total_length,), dtype=np.float32)
        audio = np.float32(0)
        fetches = [net["predictions"], net["push_ops"]]

        for sample_i in range(total_length):
            feed = {
                net["X"]: np.atleast_2d(audio),
                net["encoding"]: encoding[sample_i // hop_length],
            }
            # Only the predictions are needed; push_ops are run for their
            # side effects.
            pmf = sess.run(fetches, feed_dict=feed)[0]
            audio = utils.inv_mu_law_numpy(sample_categorical(pmf) - 128)
            wav_synth[sample_i] = audio

            if sample_i % 100 == 0:
                tf.logging.info("Sample: %d" % sample_i)
            if sample_i % samples_per_save == 0:
                wavfile.write(out_file, 16000, wav_synth)

    wavfile.write(out_file, 16000, wav_synth)