def load_batch_audio(files, sample_length=64000):
    """Read several .wav files into a single zero-padded batch.

    Args:
      files: Sequence of paths to .wav files.
      sample_length: Number of samples requested from `utils.load_audio`
        and the length every shorter clip is padded to.

    Returns:
      batch: A numpy array built from the per-file clips, nominally
        [n_files, sample_length].
    """

    def _fit(clip):
        # Clips that already reach the target length are passed through as-is;
        # shorter ones are copied into the head of a zero buffer.
        n_samples = clip.shape[0]
        if n_samples >= sample_length:
            return clip
        canvas = np.zeros([sample_length])
        canvas[:n_samples] = clip
        return canvas

    clips = [
        _fit(utils.load_audio(path, sample_length, sr=16000))
        for path in files
    ]
    return np.array(clips)
def wavenet_encode(self, file_path, **kwargs):
    """Encode one audio file with the pretrained WaveNet checkpoint.

    Args:
      file_path: Path of the audio file to encode.
      **kwargs: Accepted but not used by this function.

    Returns:
      The encoding from `fastgen.encode`, reshaped to (-1, 16).

    Raises:
      Exception: If the pretrained checkpoint directory is missing.
    """
    # Fail early with download instructions when the weights are absent.
    if not os.path.exists('../../Pretrained_models/wavenet-ckpt/'):
        raise Exception(
            'you should download pretrained model to pretrained_models folder make prediction, the link is: http://download.magenta.tensorflow.org/models/nsynth/wavenet-ckpt.tar'
        )
    checkpoint_path = '../../Pretrained_models/wavenet-ckpt/model.ckpt-200000'

    # Load the audio at the sample rate used for encoding.
    neural_sample_rate = 16000
    waveform = utils.load_audio(
        file_path, sample_length=400000, sr=neural_sample_rate)

    # Encode the whole clip in one call (a batch would be quicker), then
    # flatten the result to a single sound.
    latents = fastgen.encode(waveform, checkpoint_path, len(waveform))
    return latents.reshape((-1, 16))
def load_encoding(fname, sample_lenght=None, sr=16000, ckpt='model.ckpt-200000'):
    """Load an audio file and encode it with `fastgen.encode`.

    Args:
      fname: Path of the audio file.
      sample_lenght: Sample count forwarded to both the loader and the
        encoder (the parameter name is kept as spelled for callers).
      sr: Sample rate passed to `utils.load_audio`.
      ckpt: Checkpoint path passed to `fastgen.encode`.

    Returns:
      Tuple `(audio, encoding)`.
    """
    waveform = utils.load_audio(fname, sample_length=sample_lenght, sr=sr)
    return waveform, fastgen.encode(waveform, ckpt, sample_lenght)
def encode(wav_filenames: List[str],
           checkpoint: str = "checkpoints/wavenet-ckpt/model.ckpt-200000",
           sample_length: int = 16000,
           sample_rate: int = 16000) -> List[np.ndarray]:
    """
    Loads each wav file from the "sounds" folder and encodes the whole
    batch with fastgen.

    :param wav_filenames: the filenames to encode, resolved relative to the
      "sounds" folder
    :param checkpoint: the checkpoint path handed to fastgen
    :param sample_length: the sample length, can be calculated by
      multiplying the desired number of seconds by 16000
    :param sample_rate: the sample rate, should be 16000
    :return: an empty list when no filenames are given, otherwise the result
      of `fastgen.encode`
    """
    if not wav_filenames:
        return []

    # One clip per file, stacked into a single array for a batched encode.
    clips = np.array([
        utils.load_audio(os.path.join("sounds", name),
                         sample_length=sample_length,
                         sr=sample_rate)
        for name in wav_filenames
    ])
    return fastgen.encode(clips, checkpoint, sample_length)
def load_batch(files, sample_length=64000):
    """Load a batch of data from either .wav or .npy files.

    The type of the whole batch is decided by the extension of the first
    file. Every item is zero-padded along its first axis to the longest
    item in the batch.

    Args:
      files: A list of filepaths to .wav or .npy files.
      sample_length: Maximum sample length, forwarded to `utils.load_audio`
        for .wav input (not used for .npy input).

    Returns:
      batch_data: A padded array of audio or embeddings
        [batch, length, (dims)]. An empty array when `files` is empty.
    """
    # Guard: the extension check below indexes files[0].
    if not files:
        return np.array([])

    is_npy = os.path.splitext(files[0])[1] == ".npy"
    if is_npy:
        batch_data = [np.load(f) for f in files]
    else:
        batch_data = [utils.load_audio(f, sample_length, sr=16000) for f in files]

    max_length = max(data.shape[0] for data in batch_data)

    # Pad along the first axis only; trailing dims (if any) are kept as-is,
    # so the same code handles 1-D audio and N-D embeddings.
    for i, data in enumerate(batch_data):
        if data.shape[0] < max_length:
            padded = np.zeros((max_length,) + tuple(data.shape[1:]))
            padded[:data.shape[0]] = data
            batch_data[i] = padded

    return np.array(batch_data)
def decode(fname, sample_length=44100, sr=16000):
    """Synthesize audio to 'gen_<fname>' and load the result back.

    Args:
      fname: File name; the output is written to 'gen_' + fname.
      sample_length: Passed as `samples_per_save` to the synthesizer and as
        `sample_length` to the loader.
      sr: Sample rate used when loading the synthesized file.

    Returns:
      The audio loaded from the generated file.
    """
    generated_path = 'gen_' + fname
    # NOTE(review): `encoding` is not a parameter of this function; it has to
    # be provided by an enclosing or module scope - confirm against the caller.
    fastgen.synthesize(encoding,
                       save_paths=[generated_path],
                       samples_per_save=sample_length)
    return utils.load_audio(generated_path, sample_length=sample_length, sr=sr)
def wavenet_encode(file_path):
    """Encode one audio file and return the encoding reshaped to (-1, 16).

    Args:
      file_path: Path of the audio file to encode.

    Returns:
      The `fastgen.encode` result, reshaped to (-1, 16).
    """
    sample_rate = 16000
    waveform = utils.load_audio(file_path, sample_length=400000, sr=sample_rate)
    n_samples = len(waveform)
    latents = fastgen.encode(
        waveform, '../wavenet-ckpt/model.ckpt-200000', n_samples)
    return latents.reshape((-1, 16))
def load_encoding(fname, sample_length=None, sr=16000, ckpt='model.ckpt-200000'):
    """Load a sound, encode it, and save the encoding next to the CWD.

    The encoding is written to '<basename of fname>.npy', where the basename
    is everything after the last '/' in `fname`.

    Args:
      fname: Path of the audio file.
      sample_length: Sample count requested from the loader; the encoder
        receives the actual number of loaded samples.
      sr: Sample rate passed to the loader.
      ckpt: Checkpoint path passed to `fastgen.encode`.

    Returns:
      Tuple `(audio, encoding)`.
    """
    # Sound loading.
    audio = utils.load_audio(fname, sample_length=sample_length, sr=sr)
    n_samples = audio.shape[0]
    duration = n_samples / float(sr)
    print('{} samples, {} seconds'.format(n_samples, duration))

    # Encoding.
    encoding = fastgen.encode(audio, ckpt, n_samples)
    print("(batch_size, time_steps, dimensions) :", encoding.shape)

    basename = fname[fname.rfind('/') + 1:]
    np.save(basename + '.npy', encoding)
    return audio, encoding
def upload(sample_length, sr):
    """Upload .wav files, store them under /content/, and load each one.

    Args:
      sample_length: Sample count passed to `utils.load_audio`.
      sr: Sample rate passed to `utils.load_audio`.

    Returns:
      Tuple `(file_list, audio_list)`: the written file paths and the audio
      loaded from each, in the same order.
    """
    filemap = files.upload()
    file_list, audio_list = [], []
    # `.items()` works on both Python 2 and 3 (`.iteritems()` is 2-only).
    for key, value in filemap.items():
        fname = os.path.join('/content/', key)
        # Binary mode: the uploaded payload is raw file content, and text
        # mode would reject bytes on Python 3.
        with open(fname, 'wb') as f:
            f.write(value)
        audio = utils.load_audio(fname, sample_length=sample_length, sr=sr)
        file_list.append(fname)
        audio_list.append(audio)
    return file_list, audio_list
def load_encoding(_file, sample_length=None, sample_rate=16000, ckpt='model.ckpt-200000'):
    """Load a signal and encode it through the model at `ckpt`.

    The signal is loaded with `utils.load_audio` using `sample_rate` and
    `sample_length`, then passed to `fastgen.encode` together with the same
    `sample_length`.

    Returns:
      Tuple `(signal, encoded_signal)`.
    """
    signal = utils.load_audio(_file, sample_length=sample_length, sr=sample_rate)
    encoded_signal = fastgen.encode(signal, ckpt, sample_length)
    result = (signal, encoded_signal)
    return result
def encode(path, filename):
    """Encode the audio at `path` and hand the result to `decode`.

    Args:
      path: Path of the audio file to load.
      filename: Forwarded unchanged to `decode`.

    Returns:
      None; progress is printed and `decode` is called with the encoding.
    """
    sr = 16000
    model_path = '/home/paperspace/data/wavenet-ckpt/model.ckpt-200000'

    print('encoding..')
    audio = utils.load_audio(path, sample_length=40000, sr=sr)
    n_samples = audio.shape[0]
    seconds = n_samples / float(sr)
    print('{} samples, {} seconds'.format(n_samples, seconds))

    encoding = fastgen.encode(audio, model_path, n_samples)
    print(encoding.shape)
    print('finished encoding..')

    decode(encoding, path, filename, n_samples, model_path)
def encode(paths: List[str],
           sample_length: int = 16000,
           sample_rate: int = 16000,
           checkpoint: str = "checkpoints/wavenet-ckpt/model.ckpt-200000") \
        -> np.ndarray:
    """Load every path as audio and encode the stacked batch with fastgen.

    :param paths: the audio file paths to load
    :param sample_length: sample count passed to the loader and the encoder
    :param sample_rate: sample rate passed to the loader
    :param checkpoint: checkpoint path passed to `fastgen.encode`
    :return: the result of `fastgen.encode` for the whole batch
    """
    batch = np.array([
        utils.load_audio(path, sample_length=sample_length, sr=sample_rate)
        for path in paths
    ])
    return fastgen.encode(batch, checkpoint, sample_length)
def load_encoding(fname, sample_length=None, sr=16000, ckpt='model.ckpt-200000'):
    """Load audio, print its size, encode it, and save the encoding as .npy.

    Args:
      fname: Path of the audio file; the part after the last '/' names the
        saved '<name>.npy' file.
      sample_length: Sample count requested from the loader.
      sr: Sample rate passed to the loader.
      ckpt: Checkpoint path passed to `fastgen.encode`.

    Returns:
      Tuple `(audio, encoding)`.
    """
    audio = utils.load_audio(fname, sample_length=sample_length, sr=sr)

    # From here on, work with the number of samples actually loaded.
    loaded = audio.shape[0]
    print('{} samples, {} seconds'.format(loaded, loaded / float(sr)))

    encoding = fastgen.encode(audio, ckpt, loaded)
    print("(batch_size, time_steps, dimensions) :", encoding.shape)

    # Keep only the text after the final '/', or all of it if there is none.
    out_name = fname.rsplit('/', 1)[-1] + '.npy'
    np.save(out_name, encoding)
    return audio, encoding
def get_rb_vector(fname, sr=15360, window_size=16):
    """Build a stacked phase-derivative / magnitude matrix for an audio file.

    The spectrogram from `utils.specgram` is split into its channel 0
    (named `mag` here) and channel 1 (named `dphase`), the two are
    concatenated along axis 0 with `dphase` first, and the first
    `window_size // 2` rows are dropped.

    Args:
      fname: Path of the audio file.
      sr: Sample rate passed to `utils.load_audio`.
      window_size: Twice the number of leading rows removed from the result.

    Returns:
      The concatenated matrix without its first `window_size // 2` rows.
    """
    audio = utils.load_audio(fname, sample_length=-1, sr=sr)
    spec = utils.specgram(audio,
                          n_fft=512,
                          hop_length=None,
                          mask=True,
                          log_mag=True,
                          re_im=False,
                          dphase=True,
                          mag_only=False)
    mag = spec[:, :, 0]
    dphase = spec[:, :, 1]
    rb_vector = np.concatenate((dphase, mag), axis=0)
    # Floor division keeps the slice index an int on Python 3, where `/`
    # would produce a float and raise TypeError.
    return rb_vector[window_size // 2:, :]
def wavenet_encode(file_path):
    """Encode an audio file with the local WaveNet checkpoint.

    Args:
      file_path: Path of the audio file to encode.

    Returns:
      The `fastgen.encode` result reshaped to (-1, 16).
    """
    neural_sample_rate = 16000
    # Model weights.
    checkpoint_path = './wavenet-ckpt/model.ckpt-200000'

    # Load the audio at the encoder's sample rate.
    samples = utils.load_audio(
        file_path, sample_length=400000, sr=neural_sample_rate)

    # First half of the autoencoder: audio -> latent variables. Passing a
    # batch of audio to fastgen would be quicker than one file at a time.
    latents = fastgen.encode(samples, checkpoint_path, len(samples))

    # Reshape to a single sound.
    flattened = latents.reshape((-1, 16))
    return flattened
def load_audio(wav_file, sample_length=64000):
    """Load a wav file as a single-row, fixed-length array.

    Parameters
    ----------
    wav_file : str
        Path handed to `utils.load_audio`.
    sample_length : int, optional
        Number of samples in the returned row. Longer audio is truncated,
        shorter audio is zero-padded at the end.

    Returns
    -------
    numpy.ndarray
        Array of shape (1, sample_length).
    """
    clip = np.array([utils.load_audio(wav_file)[:sample_length]])
    n_loaded = clip.shape[1]

    # Copy the clip into the head of a zero buffer of the requested length.
    fixed = np.zeros((1, sample_length))
    fixed[0, :n_loaded] = clip
    return fixed
def load_batch(files, sample_length=64000):
    """Load a batch of data from either .wav or .npy files.

    The type of the whole batch is decided by the extension of the first
    file. Every item is zero-padded along its first axis to the longest
    item, then all items are stacked along a new leading batch axis.

    Args:
      files: A list of filepaths to .wav or .npy files.
      sample_length: Maximum sample length, forwarded to `utils.load_audio`
        for .wav input (not used for .npy input).

    Returns:
      batch_data: A padded array of audio or embeddings
        [batch, length, (dims)]. An empty array when `files` is empty.
    """
    # Guard: the extension check below indexes files[0].
    if not files:
        return np.array([])

    is_npy = os.path.splitext(files[0])[1] == ".npy"

    # Load the data.
    batch_data = []
    for f in files:
        if is_npy:
            batch_data.append(np.load(f))
        else:
            batch_data.append(utils.load_audio(f, sample_length, sr=16000))
    max_length = max(data.shape[0] for data in batch_data)

    # Add padding. Every item keeps its own rank here; the batch axis is
    # added once for all items below, so padded and unpadded entries always
    # agree in shape (the earlier per-item `data[np.newaxis, :, :]` failed on
    # 1-D audio and mismatched padded entries).
    for i, data in enumerate(batch_data):
        if data.shape[0] < max_length:
            padded = np.zeros((max_length,) + tuple(data.shape[1:]))
            padded[:data.shape[0]] = data
            batch_data[i] = padded

    # Return arrays.
    return np.array(batch_data)
def encode():
    """Encode './wav/mehldau-1.wav', save the encoding, and plot both.

    Loads the clip, prints its size, encodes it with the local checkpoint,
    saves the encoding to '<fname>.npy', and draws the audio signal and the
    first encoding on a two-row figure.

    Returns:
      The encoding produced by `fastgen.encode`.
    """
    # from https://www.freesound.org/people/MustardPlug/sounds/395058/
    # fname = '395058__mustardplug__breakbeat-hiphop-a4-4bar-96bpm.wav'
    fname = './wav/mehldau-1.wav'
    sr = 44100

    audio = utils.load_audio(fname, sample_length=44100, sr=sr)
    n_samples = audio.shape[0]
    print('{} samples, {} seconds'.format(n_samples, n_samples / float(sr)))

    encoding = fastgen.encode(
        audio, './wavenet-ckpt/model.ckpt-200000', n_samples)
    print(encoding.shape)
    np.save(fname + '.npy', encoding)

    fig, axs = plt.subplots(2, 1, figsize=(10, 5))
    audio_ax, encoding_ax = axs[0], axs[1]
    audio_ax.plot(audio)
    audio_ax.set_title('Audio Signal')
    encoding_ax.plot(encoding[0])
    encoding_ax.set_title('NSynth Encoding')
    return encoding
def Plot_SingleFile(file_name, sampleRate):
    """Encode one file, save and plot the encoding, then resynthesize it.

    Args:
      file_name: Path of the audio file. The encoding is saved to
        file_name + '.npy' and the synthesis to 'gen_' + file_name.
      sampleRate: Used only to convert the sample count to seconds in the
        printed summary; the loader is called without an explicit rate.
    """
    # sample_length bounds how long the loaded clip will be.
    audio = utils.load_audio(file_name, sample_length=70000)
    n_samples = audio.shape[0]
    seconds = n_samples / float(sampleRate)
    print('{} samples, {} seconds'.format(n_samples, seconds))

    # Encoding for the new sound.
    encoding = fastgen.encode(audio, 'model.ckpt-200000', n_samples)
    print(encoding.shape)
    np.save(file_name + '.npy', encoding)

    fig, axs = plt.subplots(2, 1, figsize=(10, 5))
    top, bottom = axs[0], axs[1]
    top.plot(audio)
    top.set_title('Audio Signal')
    bottom.plot(encoding[0])
    bottom.set_title('NSynth Encoding')

    # Synthesis.
    output_path = 'gen_' + file_name
    fastgen.synthesize(encoding,
                       save_paths=[output_path],
                       samples_per_save=n_samples)
def main(unused_argv=None):
    """Encode every .wav file in a directory and save one .npy per file.

    Reads its configuration from `FLAGS`: `log`, `checkpoint_path` or
    `expdir`, `source_path`, `save_path`, `sample_length`, `batch_size`.
    Files are processed in batches of `batch_size`; a short final batch is
    filled by repeating its last file, and the repeats are not saved.

    Args:
      unused_argv: Ignored.

    Raises:
      Exception: Any error raised while encoding or saving a batch is
        logged and re-raised.
    """
    tf.logging.set_verbosity(FLAGS.log)

    if FLAGS.checkpoint_path:
        checkpoint_path = utils.shell_path(FLAGS.checkpoint_path)
    else:
        expdir = utils.shell_path(FLAGS.expdir)
        tf.logging.info("Will load latest checkpoint from %s.", expdir)
        if not tf.gfile.Exists(expdir):
            tf.logging.fatal("\tExperiment save dir '%s' does not exist!", expdir)
            sys.exit(1)

        try:
            checkpoint_path = tf.train.latest_checkpoint(expdir)
        except tf.errors.NotFoundError:
            tf.logging.fatal("There was a problem determining the latest checkpoint.")
            sys.exit(1)

    if not tf.train.checkpoint_exists(checkpoint_path):
        tf.logging.fatal("Invalid checkpoint path: %s", checkpoint_path)
        sys.exit(1)

    tf.logging.info("Will restore from checkpoint: %s", checkpoint_path)

    source_path = utils.shell_path(FLAGS.source_path)
    tf.logging.info("Will load Wavs from %s." % source_path)

    save_path = utils.shell_path(FLAGS.save_path)
    tf.logging.info("Will save embeddings to %s." % save_path)
    if not tf.gfile.Exists(save_path):
        tf.logging.info("Creating save directory...")
        tf.gfile.MakeDirs(save_path)

    sample_length = FLAGS.sample_length
    batch_size = FLAGS.batch_size

    def is_wav(f):
        return f.lower().endswith(".wav")

    wavfiles = sorted([
        os.path.join(source_path, fname)
        for fname in tf.gfile.ListDirectory(source_path)
        if is_wav(fname)
    ])

    # `range` and `//` behave the same on Python 2 and 3 here.
    for start_file in range(0, len(wavfiles), batch_size):
        batch_number = (start_file // batch_size) + 1
        tf.logging.info("On file number %s (batch %d).", start_file, batch_number)
        end_file = start_file + batch_size
        wavefiles_batch = wavfiles[start_file:end_file]

        # Ensure that files has batch_size elements.
        batch_filler = batch_size - len(wavefiles_batch)
        wavefiles_batch.extend(batch_filler * [wavefiles_batch[-1]])
        wav_data = np.array(
            [utils.load_audio(f, sample_length) for f in wavefiles_batch])

        try:
            tf.reset_default_graph()
            # Load up the model for encoding and find the encoding
            encoding = encode(wav_data, checkpoint_path, sample_length=sample_length)
            if encoding.ndim == 2:
                encoding = np.expand_dims(encoding, 0)

            tf.logging.info("Encoding:")
            tf.logging.info(encoding.shape)
            tf.logging.info("Sample length: %d" % sample_length)

            for num, (wavfile, enc) in enumerate(zip(wavefiles_batch, encoding)):
                # Drop the extension as a suffix. `str.strip(".wav")` removes
                # any of the characters '.', 'w', 'a', 'v' from BOTH ends and
                # so mangles names such as "wave.wav".
                stem = os.path.splitext(wavfile.split("/")[-1])[0]
                filename = "%s_embeddings.npy" % stem
                with tf.gfile.Open(os.path.join(save_path, filename), "w") as f:
                    np.save(f, enc)

                # Stop before the filler copies of the last file.
                if num + batch_filler + 1 == batch_size:
                    break
        except Exception as e:
            tf.logging.info("Unexpected error happened: %s.", e)
            raise
def load_audio(self):
    """Load the file at `self.fname` and store the result on `self.audio`.

    Uses `self.sample_length` and `self.sr` as the loader's `sample_length`
    and `sr` arguments.
    """
    load_options = {"sample_length": self.sample_length, "sr": self.sr}
    self.audio = utils.load_audio(self.fname, **load_options)
import os
import numpy as np
from magenta.models.nsynth import utils
from magenta.models.nsynth.wavenet import fastgen

# Encode four seconds of one input file, save the encoding, and resynthesize
# it to /data/output/test.wav with the same checkpoint.
sr = 44100
filename = '/data/input/battle1.wav'

audio = utils.load_audio(filename, sample_length=(sr * 4), sr=sr)
sample_length = audio.shape[0]
print('{} samples, {} seconds'.format(sample_length,
                                      sample_length / float(sr)))

encoding = fastgen.encode(
    audio, '/data/model/wavenet-ckpt/model.ckpt-200000', sample_length)
print(encoding.shape)

# The encoding is stored next to the input, with '.wav' removed from the name.
encoded_path = filename.replace('.wav', '') + '_encoded.npy'
np.save(encoded_path, encoding)

fastgen.synthesize(encoding,
                   save_paths=['/data/output/test.wav'],
                   samples_per_save=sample_length,
                   checkpoint_path="/data/model/wavenet-ckpt/model.ckpt-200000")
def load_encoding(fname, sample_length=None, sr=16000, ckpt=MODEL):
    """Load an audio file and encode it with the checkpoint at `ckpt`.

    Args:
      fname: Path of the audio file.
      sample_length: Sample count passed to both the loader and the encoder.
      sr: Sample rate passed to the loader.
      ckpt: Checkpoint path; defaults to the module-level `MODEL`.

    Returns:
      Tuple `(audio, encoding)`.
    """
    loaded = utils.load_audio(fname, sample_length=sample_length, sr=sr)
    encoded = fastgen.encode(loaded, ckpt, sample_length)
    return loaded, encoded
# Pre-render interpolations between stored wavetable encodings.
# Each loop blends two encodings linearly in 1% steps (i = 1..99),
# synthesizes the blend to the scratch file 'tmp', and loads 512 samples back.
sample_length = 512
encoding_sine = np.load('wavetable_sine.npy')
encoding_tri = np.load('wavetable_tri.npy')
encoding_saw = np.load('wavetable_saw.npy')

# Saw -> sine: weight (100 - i) / 100 on saw and i / 100 on sine.
#sawsin
for i in range(1, 100):
    filename = '../prerender/SawSin/SawSin_0.' + '%02d.txt' % i
    time0 = time.time()
    print('decoding saw+sine interpolation:' + '%02d' % i)
    fastgen.synthesize((encoding_saw * (100 - i) + encoding_sine * i) / 100, save_paths=['tmp'], checkpoint_path='Model/wavenet-ckpt/model.ckpt-200000', samples_per_save=sample_length)
    audio = utils.load_audio('tmp', sample_length=512, sr=16000)
    # One comma-separated row per file, values rounded to 5 decimals.
    np.savetxt(filename, [np.around(audio, decimals=5)], delimiter=',', fmt='%1.5f')
    # Elapsed seconds for this step.
    print(time.time() - time0)

# Sine -> triangle: same scheme as above.
# NOTE(review): in the visible code this loop computes `filename` and loads
# `audio` but never writes the result or prints the timing - the snippet may
# be cut off here; confirm against the full file.
#sintri
for i in range(1, 100):
    filename = '../prerender/SinTri/SinTri_0.' + '%02d.txt' % i
    time0 = time.time()
    print('decoding sine+tri interpolation:' + '%02d' % i)
    fastgen.synthesize((encoding_sine * (100 - i) + encoding_tri * i) / 100, save_paths=['tmp'], checkpoint_path='Model/wavenet-ckpt/model.ckpt-200000', samples_per_save=sample_length)
    audio = utils.load_audio('tmp', sample_length=512, sr=16000)
def load_encoding(fname, sample_length=None, sr=16000, ckpt='../wavenet-ckpt/model.ckpt-200000'):
    """Return an audio clip together with its `fastgen` encoding.

    Args:
      fname: Path of the audio file.
      sample_length: Sample count passed to both the loader and the encoder.
      sr: Sample rate passed to the loader.
      ckpt: Checkpoint path passed to `fastgen.encode`.

    Returns:
      Tuple `(audio, encoding)`.
    """
    clip = utils.load_audio(fname, sample_length=sample_length, sr=sr)
    pair = (clip, fastgen.encode(clip, ckpt, sample_length))
    return pair
import os
import numpy as np
from magenta.models.nsynth import utils
from magenta.models.nsynth.wavenet import fastgen

# Round trip for one file: load -> encode -> save encoding -> synthesize ->
# load the synthesized audio back.
fname = 'aggression.wav'
sr = 16000

audio = utils.load_audio(fname, sample_length=40000, sr=sr)
sample_length = audio.shape[0]
print('{} samples, {} seconds'.format(sample_length,
                                      sample_length / float(sr)))

# The checkpoint is resolved relative to the current working directory.
encoding = fastgen.encode(audio,
                          os.path.abspath('model.ckpt-200000'),
                          sample_length)
print(encoding.shape)
np.save(fname + '.npy', encoding)

fastgen.synthesize(encoding,
                   save_paths=['gen_' + fname],
                   samples_per_save=sample_length)

sr = 16000
synthesis = utils.load_audio('gen_' + fname,
                             sample_length=sample_length,
                             sr=sr)
print('Magenta Test')
def load_encoding(fname):
    """Load an audio file and encode it, printing progress.

    Reads `sl` (sample length), `sr` (sample rate) and `ckpt` (checkpoint
    path) from the enclosing module scope.

    Args:
      fname: Path of the audio file.

    Returns:
      Tuple `(audio, encoding)`.
    """
    audio = utils.load_audio(fname, sample_length=sl, sr=sr)
    # Single-argument print calls are valid on both Python 2 and 3; the
    # message reproduces the spacing of the former `print a, b` statement.
    print('Encoding..  {}'.format(fname))
    encoding = fastgen.encode(audio, ckpt, sl)
    print('Encoded successfully')
    return audio, encoding
def unused():
    """Notebook-style walkthrough: encode, resynthesize, stretch, mix, fade.

    NOTE(review): the original layout of this block was lost; the nesting
    below (everything inside this function) is inferred because later
    statements read `sr`, which is only assigned here - confirm.
    """
    # --- Encode one clip, save and plot the encoding, resynthesize it. ---
    # from https://www.freesound.org/people/MustardPlug/sounds/395058/
    fname = '395058__mustardplug__breakbeat-hiphop-a4-4bar-96bpm.wav'
    sr = 16000
    audio = utils.load_audio(fname, sample_length=40000, sr=sr)
    sample_length = audio.shape[0]
    print('{} samples, {} seconds'.format(sample_length, sample_length / float(sr)))
    encoding = fastgen.encode(audio, 'model.ckpt-200000', sample_length)
    print(encoding.shape)
    np.save(fname + '.npy', encoding)
    fig, axs = plt.subplots(2, 1, figsize=(10, 5))
    axs[0].plot(audio); axs[0].set_title('Audio Signal')
    axs[1].plot(encoding[0]); axs[1].set_title('NSynth Encoding')
    fastgen.synthesize(encoding, save_paths=['gen_' + fname], samples_per_save=sample_length)
    sr = 16000
    synthesis = utils.load_audio('gen_' + fname, sample_length=sample_length, sr=sr)

    def load_encoding(fname, sample_length=None, sr=16000, ckpt='model.ckpt-200000'):
        # Load a clip and return it together with its encoding.
        audio = utils.load_audio(fname, sample_length=sample_length, sr=sr)
        encoding = fastgen.encode(audio, ckpt, sample_length)
        return audio, encoding

    # --- Same round trip for a second clip, via the helper. ---
    # from https://www.freesound.org/people/maurolupo/sounds/213259/
    fname = '213259__maurolupo__girl-sings-laa.wav'
    sample_length = 32000
    audio, encoding = load_encoding(fname, sample_length)
    fastgen.synthesize(
        encoding, save_paths=['gen_' + fname], samples_per_save=sample_length)
    synthesis = utils.load_audio('gen_' + fname, sample_length=sample_length, sr=sr)

    # --- Time-stretching. ---
    # use image interpolation to stretch the encoding: (pip install scikit-image)
    from skimage.transform import resize

    def timestretch(encodings, factor):
        # Normalize to [0, 1], resize each item along its first axis by
        # `factor`, then map back to the original value range.
        # NOTE(review): the min/max are taken from `encoding` in the
        # enclosing scope, not from the `encodings` argument - confirm this
        # is intended.
        min_encoding, max_encoding = encoding.min(), encoding.max()
        encodings_norm = (encodings - min_encoding) / (max_encoding - min_encoding)
        timestretches = []
        for encoding_i in encodings_norm:
            stretched = resize(encoding_i, (int(encoding_i.shape[0] * factor), encoding_i.shape[1]), mode='reflect')
            stretched = (stretched * (max_encoding - min_encoding)) + min_encoding
            timestretches.append(stretched)
        return np.array(timestretches)

    # from https://www.freesound.org/people/MustardPlug/sounds/395058/
    fname = '395058__mustardplug__breakbeat-hiphop-a4-4bar-96bpm.wav'
    sample_length = 40000
    audio, encoding = load_encoding(fname, sample_length)
    # NOTE(review): 'gen_slower_' + fname is loaded here but only written by
    # the synthesize call further down - presumably left over from an earlier
    # run; verify.
    audio = utils.load_audio('gen_slower_' + fname, sample_length=None, sr=sr)
    Audio(audio, rate=sr)
    encoding_slower = timestretch(encoding, 1.5)
    encoding_faster = timestretch(encoding, 0.5)
    fig, axs = plt.subplots(3, 1, figsize=(10, 7), sharex=True, sharey=True)
    axs[0].plot(encoding[0]); axs[0].set_title('Encoding (Normal Speed)')
    axs[1].plot(encoding_faster[0]); axs[1].set_title('Encoding (Faster))')
    axs[2].plot(encoding_slower[0]); axs[2].set_title('Encoding (Slower)')
    fastgen.synthesize(encoding_faster, save_paths=['gen_faster_' + fname])
    fastgen.synthesize(encoding_slower, save_paths=['gen_slower_' + fname])

    # --- Mixing two encodings by averaging them. ---
    sample_length = 80000
    # from https://www.freesound.org/people/MustardPlug/sounds/395058/
    aud1, enc1 = load_encoding('395058__mustardplug__breakbeat-hiphop-a4-4bar-96bpm.wav', sample_length)
    # from https://www.freesound.org/people/xserra/sounds/176098/
    aud2, enc2 = load_encoding('176098__xserra__cello-cant-dels-ocells.wav', sample_length)
    enc_mix = (enc1 + enc2) / 2.0
    fig, axs = plt.subplots(3, 1, figsize=(10, 7))
    axs[0].plot(enc1[0]); axs[0].set_title('Encoding 1')
    axs[1].plot(enc2[0]); axs[1].set_title('Encoding 2')
    axs[2].plot(enc_mix[0]); axs[2].set_title('Average')
    # NOTE(review): every other synthesize call passes `save_paths` as a
    # list; here it is a bare string - confirm the callee accepts that.
    fastgen.synthesize(enc_mix, save_paths='mix.wav')

    # --- Fades and crossfade. ---
    def fade(encoding, mode='in'):
        # Half-cosine ramp from 0 towards 1 over axis 1, broadcast over the
        # other two axes; 'in' applies the ramp, anything else its complement.
        length = encoding.shape[1]
        fadein = (0.5 * (1.0 - np.cos(3.1415 * np.arange(length) / float(length)))).reshape(1, -1, 1)
        if mode == 'in':
            return fadein * encoding
        else:
            return (1.0 - fadein) * encoding

    fig, axs = plt.subplots(3, 1, figsize=(10, 7))
    axs[0].plot(enc1[0]); axs[0].set_title('Original Encoding')
    axs[1].plot(fade(enc1, 'in')[0]); axs[1].set_title('Fade In')
    axs[2].plot(fade(enc1, 'out')[0]); axs[2].set_title('Fade Out')

    def crossfade(encoding1, encoding2):
        # Fade the first encoding out while fading the second one in.
        return fade(encoding1, 'out') + fade(encoding2, 'in')

    fig, axs = plt.subplots(3, 1, figsize=(10, 7))
    axs[0].plot(enc1[0]); axs[0].set_title('Encoding 1')
    axs[1].plot(enc2[0]); axs[1].set_title('Encoding 2')
    axs[2].plot(crossfade(enc1, enc2)[0]); axs[2].set_title('Crossfade')
    fastgen.synthesize(crossfade(enc1, enc2), save_paths=['crossfade.wav'])
def synthesize(source_file,
               checkpoint_path="model.ckpt-200000",
               out_file="synthesis.wav",
               sample_length=64000,
               samples_per_save=1000):
    """Resynthesize an input audio file.

    A .npy source is loaded directly as the encoding; a .wav source is first
    encoded with `encode`. The audio is then generated one sample at a time
    and written to `out_file`, both periodically and once at the end.

    Args:
      source_file: Location of a wave or .npy file to load.
      checkpoint_path: Location of the pretrained model. [model.ckpt-200000]
      out_file: Location to save the synthesized wave file. [synthesis.wav]
      sample_length: Sample count used when loading and encoding a .wav
        source. [64000] Not used for a .npy source.
      samples_per_save: Save a .wav after every amount of samples.

    Raises:
      RuntimeError: Source_file should be .wav or .npy.
    """
    if source_file.endswith(".npy"):
        encoding = np.load(source_file)
        hop_length = Config().ae_hop_length
    elif source_file.endswith(".wav"):
        # Audio to resynthesize
        wav_data = utils.load_audio(source_file, sample_length, sr=16000)
        # Load up the model for encoding and find the encoding
        encoding, hop_length = encode(wav_data, checkpoint_path, sample_length=sample_length)
        # Drop the leading axis when the encoder returns a 3-D result.
        # NOTE(review): a 3-D .npy encoding is not reduced the same way -
        # confirm that saved encodings are always 2-D.
        if encoding.ndim == 3:
            encoding = encoding[0]
    else:
        raise RuntimeError("File must be .wav or .npy")

    # Get lengths: each encoding step covers `hop_length` output samples.
    encoding_length = encoding.shape[0]
    total_length = encoding_length * hop_length

    session_config = tf.ConfigProto(allow_soft_placement=True)
    with tf.Graph().as_default(), tf.Session(config=session_config) as sess:
        net = load_fastgen_nsynth()
        saver = tf.train.Saver()
        saver.restore(sess, checkpoint_path)

        # initialize queues w/ 0s
        sess.run(net["init_ops"])

        # Regenerate the audio file sample by sample
        wav_synth = np.zeros((total_length, ), dtype=np.float32)
        audio = np.float32(0)

        for sample_i in range(total_length):
            # Index of the encoding step that conditions this sample.
            enc_i = sample_i // hop_length
            # Feed the previous sample; keep only the predictions (the
            # push ops are run for their effect).
            pmf = sess.run([net["predictions"], net["push_ops"]],
                           feed_dict={
                               net["X"]: np.atleast_2d(audio),
                               net["encoding"]: encoding[enc_i]
                           })[0]
            sample_bin = sample_categorical(pmf)
            # Shift the sampled bin by 128 before inverting the mu-law code.
            audio = utils.inv_mu_law_numpy(sample_bin - 128)
            wav_synth[sample_i] = audio
            if sample_i % 100 == 0:
                tf.logging.info("Sample: %d" % sample_i)
            # Periodic checkpoint of the partially generated waveform.
            if sample_i % samples_per_save == 0:
                wavfile.write(out_file, 16000, wav_synth)

    # Final write with the complete waveform.
    wavfile.write(out_file, 16000, wav_synth)
# Build paired visual / audio features per clip id.
# Relies on `batch_size`, `file_name`, `gan_folder` and `audio_folder`
# being defined elsewhere in the file.
batch_counter = 0
visual_batch = [None]*batch_size
audio_batch = [None]*batch_size
visual_feat_all = []
audio_feat_all = []
# `xxx` limits the loop to its first iteration (the second pass breaks).
xxx = 0
for i in file_name:
    if xxx ==1:
        break
    xxx += 1
    # Visual feature saved as 'label_<id>.npy'.
    visual_feat = np.load(gan_folder + 'label_' + i + '.npy')
    # Matching audio track 'GOPR<id>.MP4.wav'.
    fname = audio_folder + "GOPR" + i + '.MP4.wav'
    sr = 16000
    audio = utils.load_audio(fname, sample_length=-1, sr=sr)
    sample_length = audio.shape[0]
    spec = utils.specgram(audio, n_fft=512, hop_length=None, mask=True, log_mag=True, re_im=False, dphase=True, mag_only=False)
    # Channel 0 is used as magnitude, channel 1 as phase derivative.
    mag = spec[:,:,0]
    dphase = spec[:,:,1]
    # Stack phase derivative above magnitude, then transpose.
    combine = np.concatenate((dphase, mag), axis=0)
    combineT = np.transpose(combine)
    # Keep axes 0 and 2 of the visual feature.
    # NOTE(review): presumably axis 1 has size 1 - confirm. The placement of
    # this line inside the loop is inferred; the original layout was lost.
    visual_feat = visual_feat.reshape(visual_feat.shape[0],visual_feat.shape[2])
def load_wav(file_path):
    """Load audio from `file_path` using the loader's default settings.

    Args:
      file_path: Path of the audio file.

    Returns:
      Whatever `utils.load_audio` returns for that path.
    """
    waveform = utils.load_audio(file_path)
    return waveform
import tensorflow as tf, os, numpy as np, matplotlib.pyplot as plt, time
from magenta.models.nsynth import utils
from magenta.models.nsynth.wavenet import fastgen
from IPython.display import Audio
#%matplotlib inline
#%config InlineBackend.figure_format = 'jpg'

# Encode one clip, save and plot the encoding, then resynthesize it.
fname = "03 Plimsoll Punks.wav"
ckpt = "model.ckpt-200000"
sr = 16000

audio = utils.load_audio(fname, sample_length=16000, sr=sr)
sample_length = audio.shape[0]
print("{} samples , {} seconds".format(sample_length,
                                       sample_length / float(sr)))

encoding = fastgen.encode(audio, ckpt, sample_length)
print(encoding.shape)

# The encoding is saved under the text before the first '.' in the file name.
np.save(fname.split(".")[0] + ".npy", encoding)

fig, axs = plt.subplots(2, 1, figsize=(10, 5))
axs[0].plot(audio)
axs[0].set_title("Audio Signal")
axs[1].plot(encoding[0])
axs[1].set_title("NSynth Encoding")

# Verify fast to generate encoding
fastgen.synthesize(encoding,
                   save_paths=["gen_" + fname],
                   samples_per_save=sample_length)