def test1(self):
    audio, output_audio = make_sine_waves()
    audio_tensor = tf.convert_to_tensor(audio, dtype=tf.float32)
    output_audio_tensor = tf.convert_to_tensor(output_audio, dtype=tf.float32)
    input_batch = mu_law_encode(audio_tensor, 256)
    output_batch = mu_law_encode(output_audio_tensor, 256)
    encoded = self.net._one_hot(input_batch)
    output_encoded = self.net._one_hot(output_batch)
    shifted = tf.slice(output_encoded, [0, 1, 0],
                       [-1, tf.shape(output_encoded)[1] - 1, -1])
    # shifted = tf.pad(shifted, [[0, 0], [0, 1], [0, 0]])
    raw_output = self.net._create_network(encoded)
    out = tf.reshape(raw_output, [-1, self.net.quantization_channels])
    # Cast to float64 to avoid bug in TensorFlow
    proba = tf.cast(tf.nn.softmax(tf.cast(out, tf.float64)), tf.float32)
    last = tf.slice(proba, [tf.shape(proba)[0] - 1, 0],
                    [1, self.net.quantization_channels])
    lasted = tf.reshape(last, [-1])
    # shifted = tf.pad(shifted, [[0, 0], [0, 1], [0, 0]])
    # slice = tf.reshape(shifted, [-1, self.net.quantization_channels])
    with self.test_session() as sess:
        sess.run(tf.initialize_all_variables())
        print(sess.run(out).shape)
        print(sess.run(proba)[1])
        print(sess.run(proba)[0])
        print(sess.run(last).shape)
        print(sess.run(lasted).shape)
def load_audio_not_one_hot(
        filename,
        sample_rate=get_model_params('SAMPLE_RATE'),
        quantization_channels=get_model_params('QUANTIZATION_CHANNELS'),
        batch_size=get_model_params('BATCH_SIZE')):
    audio = load_wav(filename, sample_rate)
    quantized = mu_law_encode(audio, quantization_channels)
    return quantized
def create_seed(filename, sample_rate, quantization_channels, window_size):
    audio, _ = librosa.load(filename, sr=sample_rate, mono=True)
    quantized = mu_law_encode(audio, quantization_channels)
    cut_index = tf.cond(
        tf.size(quantized) < tf.constant(window_size),
        lambda: tf.size(quantized),
        lambda: tf.constant(window_size))
    return quantized[:cut_index]
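# A minimal usage sketch for the create_seed helper above. The file name
# and parameter values here are hypothetical placeholders, not taken from
# the original code:
import tensorflow as tf

seed = create_seed('seed.wav', sample_rate=16000,
                   quantization_channels=256, window_size=8000)
with tf.Session() as sess:
    # Evaluates to int32 quantization levels, at most window_size samples.
    print(sess.run(seed))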
def testEncodeDecode(self):
    x = np.linspace(-1, 1, 1000).astype(np.float32)
    channels = 256

    # Test whether decoded signal is roughly equal to
    # what was encoded before
    with self.test_session() as sess:
        encoded = mu_law_encode(x, channels)
        x1 = sess.run(mu_law_decode(encoded, channels))

    self.assertAllClose(x, x1, rtol=1e-1, atol=0.05)

    # Make sure that re-encoding leaves the waveform invariant
    with self.test_session() as sess:
        encoded = mu_law_encode(x1, channels)
        x2 = sess.run(mu_law_decode(encoded, channels))

    self.assertAllClose(x1, x2)
def testEndToEndTraining(self):
    audio = make_sine_waves()
    np.random.seed(42)
    # if self.generate:
    #     librosa.output.write_wav('/tmp/sine_train.wav', audio,
    #                              SAMPLE_RATE_HZ)
    #     power_spectrum = np.abs(np.fft.fft(audio))**2
    #     freqs = np.fft.fftfreq(audio.size, SAMPLE_PERIOD_SECS)
    #     indices = np.argsort(freqs)
    #     indices = [index for index in indices if freqs[index] >= 0 and
    #                freqs[index] <= 500.0]
    #     plt.plot(freqs[indices], power_spectrum[indices])
    #     plt.show()
    audio_tensor = tf.convert_to_tensor(audio, dtype=tf.float32)
    encode_output = mu_law_encode(audio_tensor, QUANTIZATION_CHANNELS)
    loss = self.net.loss(encode_output)
    optimizer = optimizer_factory[self.optimizer_type](
        learning_rate=self.learning_rate, momentum=self.momentum)
    trainable = tf.trainable_variables()
    optim = optimizer.minimize(loss, var_list=trainable)
    init = tf.initialize_all_variables()

    generated_waveform = None
    max_allowed_loss = 0.1
    loss_val = max_allowed_loss
    initial_loss = None
    with self.test_session() as sess:
        sess.run(init)
        initial_loss = sess.run(loss)
        for i in range(TRAIN_ITERATIONS):
            loss_val, _ = sess.run([loss, optim])
            # if i % 10 == 0:
            #     print("i: %d loss: %f" % (i, loss_val))

        # Sanity check the initial loss was larger.
        self.assertGreater(initial_loss, max_allowed_loss)

        # Loss after training should be small.
        self.assertLess(loss_val, max_allowed_loss)

        # Loss should be at least two orders of magnitude better
        # than before training.
        self.assertLess(loss_val / initial_loss, 0.01)

        # saver = tf.train.Saver(var_list=tf.trainable_variables())
        # saver.save(sess, '/tmp/sine_test_model.ckpt', global_step=i)

        if self.generate:
            # Check non-incremental generation
            generated_waveform = generate_waveform(sess, self.net, False)
            check_waveform(self.assertGreater, generated_waveform)

            # Check incremental generation
            generated_waveform = generate_waveform(sess, self.net, True)
            check_waveform(self.assertGreater, generated_waveform)
def testEncodeNegativeChannelSize(self):
    np.random.seed(1944)  # For repeatability of test.
    channels = -256
    number_of_samples = 1024
    x = np.zeros(number_of_samples).astype(np.float32)
    with self.test_session() as sess:
        # assertRaises needs a callable, so wrap sess.run in a lambda;
        # otherwise the op runs (and raises) before the assertion sees it.
        self.assertRaises(TypeError,
                          lambda: sess.run(mu_law_encode(x, channels)))
def testEncodeUniformRandomNoise(self):
    np.random.seed(42)  # For repeatability of test.
    channels = 256
    number_of_samples = 2048
    x = np.random.uniform(-1, 1, number_of_samples).astype(np.float32)
    manual_encode = manual_mu_law_encode(x, channels)
    with self.test_session() as sess:
        encode = sess.run(mu_law_encode(x, channels))
    self.assertAllEqual(manual_encode, encode)
def testEncodeZeros(self):
    np.random.seed(1944)  # For repeatability of test.
    channels = 256
    number_of_samples = 1024
    x = np.zeros(number_of_samples).astype(np.float32)
    manual_encode = manual_mu_law_encode(x, channels)
    with self.test_session() as sess:
        encode = sess.run(mu_law_encode(x, channels))
    self.assertAllEqual(manual_encode, encode)
def testEncodePrecomputed(self):
    channels = 256
    number_of_samples = 10
    x = np.array([-1.0, 1.0, 0.6, -0.25, 0.01, 0.33, -0.9999,
                  0.42, 0.1, -0.45]).astype(np.float32)
    encoded_manual = np.array([0, 255, 243, 32, 157, 230, 0, 235,
                               203, 18]).astype(np.int32)
    with self.test_session() as sess:
        encoded = sess.run(mu_law_encode(x, channels))
    self.assertAllEqual(encoded_manual, encoded)
def testEncodeRamp(self):
    np.random.seed(1944)  # For repeatability of test.
    channels = 256
    number_of_samples = 1024
    number_of_steps = 2.0 / number_of_samples
    x = np.arange(-1.0, 1.0, number_of_steps).astype(np.float32)
    manual_encode = manual_mu_law_encode(x, channels)
    with self.test_session() as sess:
        encode = sess.run(mu_law_encode(x, channels))
    self.assertAllEqual(manual_encode, encode)
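# Several tests above compare against a manual_mu_law_encode helper that is
# not shown in this section. A minimal NumPy sketch of such a reference
# encoder, assuming the standard mu-law companding formula (the function
# name is hypothetical); it reproduces the table in testEncodePrecomputed:
import numpy as np

def manual_mu_law_encode_sketch(signal, channels):
    mu = channels - 1
    # Compand the signal into [-1, 1] with logarithmic spacing, then
    # quantize to integer levels in [0, mu].
    companded = np.sign(signal) * np.log1p(mu * np.abs(signal)) / np.log1p(mu)
    return ((companded + 1) / 2 * mu + 0.5).astype(np.int32)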
def create_seed(filename,
                sample_rate,
                quantization_channels,
                window_size=WINDOW):
    audio, _ = librosa.load(filename, sr=sample_rate, mono=True)
    audio = audio_reader.trim_silence(audio)
    quantized = mu_law_encode(audio, quantization_channels)
    cut_index = tf.cond(tf.size(quantized) < tf.constant(window_size),
                        lambda: tf.size(quantized),
                        lambda: tf.constant(window_size))
    return quantized[:cut_index]
def create_seed(waveform,
                sample_rate,
                quantization_channels,
                window_size,
                silence_threshold=SILENCE_THRESHOLD):
    # audio, _ = librosa.load(filename, sr=sample_rate, mono=True)
    # audio = audio_reader.trim_silence(audio, silence_threshold)
    quantized = mu_law_encode(waveform, quantization_channels)
    # tf.cond with lambdas: if the quantized signal is shorter than
    # window_size, keep its full size; otherwise cut at window_size.
    cut_index = tf.cond(tf.size(quantized) < tf.constant(window_size),
                        lambda: tf.size(quantized),
                        lambda: tf.constant(window_size))
    return quantized[:cut_index]
def create_seed(filename,
                sample_rate,
                quantization_channels,
                window_size,
                silence_threshold=SILENCE_THRESHOLD):
    audio, _ = librosa.load(filename, sr=sample_rate, mono=True)
    # audio = audio_reader.trim_silence(audio, silence_threshold)
    quantized = mu_law_encode(audio, quantization_channels)
    cut_index = tf.cond(tf.size(quantized) < tf.constant(window_size),
                        lambda: tf.size(quantized),
                        lambda: tf.constant(window_size))
    return quantized[:cut_index]
def testDecodeEncode(self):
    # Generate every possible quantized level.
    x = np.array(range(QUANT_LEVELS), dtype=np.int)
    with self.test_session() as sess:
        # Decode into floating-point scalars.
        decoded = mu_law_decode(x, QUANT_LEVELS)
        # Encode back into integer quantization levels.
        encoded = mu_law_encode(decoded, QUANT_LEVELS)
        round_tripped = sess.run(encoded)
    # Decoding then encoding every level should produce what we
    # started with.
    self.assertAllEqual(x, round_tripped)
def testEncodeDecodeShift(self):
    x = np.linspace(-1, 1, 1000).astype(np.float32)
    with self.test_session() as sess:
        encoded = mu_law_encode(x, QUANT_LEVELS)
        decoded = mu_law_decode(encoded, QUANT_LEVELS)
        roundtripped = sess.run(decoded)

    # Detect non-unity scaling and non-zero shift in the roundtripped
    # signal by asserting that slope = 1 and y-intercept = 0 of a line
    # fit to roundtripped vs. x values.
    coeffs = np.polyfit(x, roundtripped, 1)
    slope = coeffs[0]
    y_intercept = coeffs[1]
    EPSILON = 1e-4
    self.assertNear(slope, 1.0, EPSILON)
    self.assertNear(y_intercept, 0.0, EPSILON)
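# For reference alongside the encoder sketch earlier, a matching NumPy
# sketch of the inverse transform, assuming the usual mu-law expansion
# formula (the function name is hypothetical):
import numpy as np

def manual_mu_law_decode_sketch(levels, channels):
    mu = channels - 1
    # Map integer levels in [0, mu] back into [-1, 1], then expand the
    # logarithmic companding.
    companded = 2 * (levels.astype(np.float64) / mu) - 1
    return np.sign(companded) * ((1 + mu) ** np.abs(companded) - 1) / mu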
def create_seed(filename,
                sample_rate,
                quantization_channels,
                window_size,
                scalar_input):
    # Use only the leading portion of the seed.
    seed_audio, _ = librosa.load(filename, sr=sample_rate, mono=True)
    seed_audio = audio.trim_silence(seed_audio, default_hparams)
    if scalar_input:
        if len(seed_audio) < window_size:
            return seed_audio
        else:
            return seed_audio[:window_size]
    else:
        quantized = mu_law_encode(seed_audio, quantization_channels)
        # If the seed is shorter than window_size it is returned as-is;
        # shouldn't it at least be padded?
        cut_index = tf.cond(tf.size(quantized) < tf.constant(window_size),
                            lambda: tf.size(quantized),
                            lambda: tf.constant(window_size))
        return quantized[:cut_index]
def create_seed(filename,
                sample_rate,
                quantization_channels,
                window_size,
                scalar_input,
                silence_threshold=SILENCE_THRESHOLD):
    audio, _ = librosa.load(filename, sr=sample_rate, mono=True)
    audio = audio_reader.trim_silence(audio, silence_threshold)
    if scalar_input:
        if len(audio) < window_size:
            return audio
        else:
            return audio[:window_size]
    else:
        quantized = mu_law_encode(audio, quantization_channels)
        # If the seed is shorter than window_size it is returned as-is;
        # shouldn't it at least be padded?
        cut_index = tf.cond(
            tf.size(quantized) < tf.constant(window_size),
            lambda: tf.size(quantized),
            lambda: tf.constant(window_size))
        return quantized[:cut_index]
wav_fname = os.path.join(DIRS['SONGS'], [
    x for x in os.listdir(DIRS['SONGS']) if x.endswith('.wav')
][0])
wav_fname_new = wav_fname.replace('.wav', '_after.wav')

# load raw audio

# In[14]:

audio, _ = librosa.load(wav_fname, sr=M_PARAMS['SAMPLE_RATE'], mono=True)
audio[1000:1050]

# encode it to 8 bit amplitude

# In[15]:

quantized = mu_law_encode(audio, M_PARAMS['QUANTISATION_CHANNELS'])
quantized[1000:1050].eval(session=sess)

# get RNN input

# In[16]:

quantized_oh = _one_hot(quantized)
quantized_oh[0][1000:1020].eval(session=sess)

# let RNN out be exact RNN input (for test)
# turn it back to 8 bit signal

# In[17]:
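# The In[17] cell is cut off above. A hypothetical continuation that
# collapses the one-hot representation back to 8-bit quantization levels
# (an assumption, not the notebook's actual code) might be:
quantized_back = tf.argmax(quantized_oh, axis=2)
quantized_back[0][1000:1050].eval(session=sess)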
def testEncodeIsSurjective(self):
    x = np.linspace(-1, 1, 10000).astype(np.float32)
    channels = 123
    with self.test_session() as sess:
        encoded = sess.run(mu_law_encode(x, channels))
    self.assertEqual(len(np.unique(encoded)), channels)