# Read STFT metadata back out of the (HDF5-style) dataset; [()] extracts a scalar value.
hop_length = dataset['hop_length'][()]
sample_rate = dataset['sample_rate'][()]
# The prediction sliding window will be the array that the model is made to
# predict new frames from: seed with the first input window, then append all
# target frames Y along the time axis.
mags = X[0]
mags = np.append(mags, Y, axis=0)
print("Dataset Y Shape: ", Y.shape)
print("Mags Shape: ", mags.shape)
print('Synthesising Audio')
# Generate a phase value for every frame of predicted magnitudes using the
# phase_gen.py library (magnitudes alone carry no phase information).
phases = phase_gen.gen_phases(mags.shape[0], fftFrameSize, hop_length, sample_rate)
print('Generated New Phases', phases.shape)
# Convert all of the predicted magnitudes and generated phases back into
# audio samples with the correct hop length.
audio = phase_gen.fft2samples(mags, phases, hop_length)
# Scale to int16 so librosa does not save the .wav as 64-bit floats that few
# players can read; normalise here because librosa's write-to-wav only
# normalises floats.
maxv = np.iinfo(np.int16).max
audio_wav = (librosa.util.normalize(audio) * maxv).astype(np.int16)
# Create a unique name and directory for the new audio file.
if not os.path.exists('datasets/phased'):
    os.makedirs('datasets/phased')
audio_name = dataset_name+'_'+'phased'+'_'+str(it_i)+'.wav'
# Bump the counter until the filename is free so existing files are never
# overwritten. NOTE(review): it_i and audio_name are module-level state —
# presumably consumed by a later write_wav call outside this chunk; verify.
while os.path.exists('datasets/phased/'+audio_name):
    it_i += 1
    audio_name = dataset_name+'_'+'phased'+'_'+str(it_i)+'.wav'
# Train the model from scratch, or restore weights from a checkpoint.
if training:
    model.fit(train_x, train_y, validation_set=((valid_x, valid_y)),
              show_metric=True, batch_size=batch_size,
              n_epoch=training_iters, snapshot_epoch=False,
              snapshot_step=1000, run_id=tf_id, callbacks=callback)
else:
    model.load(cp)
# Autoregressive generation: seed each sequence with a random training
# window, then repeatedly predict the next frame and slide the window.
# FIX: converted Python 2 constructs (xrange, print statement, unicode(),
# u'' literals) to Python 3 — the rest of this file already uses print().
for i in range(amount_generated_sequences):
    # Pick a random seed window from the training inputs.
    random_index = random.randint(0, (len(train_x) - 1))
    impulse = np.array(train_x[random_index])
    predicted_magnitudes = impulse
    for j in range(sequence_max_length):
        # Model expects a batch dimension: (1, window_len, n_bins).
        # NOTE(review): assumes x_frames is (n, window_len, n_bins) — confirm.
        impulse = np.array(impulse).reshape(1, x_frames.shape[1], x_frames.shape[2])
        prediction = model.predict(impulse)
        # Append the newly predicted frame and slide the window forward.
        predicted_magnitudes = np.vstack((predicted_magnitudes, prediction))
        impulse = predicted_magnitudes[-sequence_length:]
    predicted_magnitudes = np.array(predicted_magnitudes)
    print(i, predicted_magnitudes.shape)
    # Generate synthetic phases for the predicted magnitudes and invert
    # back to time-domain samples.
    phases = phase_gen.gen_phases(predicted_magnitudes.shape[0], fft_size, hop_size, sample_rate)
    audio = phase_gen.fft2samples(predicted_magnitudes, phases, hop_size)
    # Scale to int16 so the .wav is widely readable; librosa's writer only
    # normalises floats, so normalise explicitly first.
    maxv = np.iinfo(np.int16).max
    audio_wav = (librosa.util.normalize(audio) * maxv).astype(np.int16)
    audio_name = tf_id + '_' + str(i) + '.wav'
    librosa.output.write_wav(audio_path + '/' + audio_name, audio_wav, sample_rate, norm=False)
# Append every frame of magnitudes to the FFT bank in one call.
# FIX: replaced the manual index-append loop with list.extend — iterating
# magnitude_t yields exactly the rows magnitude_t[i], so behavior is identical.
fft_bank.extend(magnitude_t)
# Convert the FFT bank list into a numpy array of shape (n_frames, n_bins).
fft_bank = np.array(fft_bank)
print('Synthesising Audio')
# Generate a phase value for every frame of magnitudes using the
# phase_gen.py library (magnitudes alone carry no phase information).
phases = phase_gen.gen_phases(fft_bank.shape[0], fftFrameSize, hop_length, sample_rate)
print('Generated New Phases', phases.shape)
# Convert the magnitudes and generated phases back into audio samples with
# the correct hop length.
audio = phase_gen.fft2samples(fft_bank, phases, hop_length)
# Scale to int16 so librosa does not save the .wav as 64-bit floats that few
# players can read; normalise here because librosa's write-to-wav only
# normalises floats.
maxv = np.iinfo(np.int16).max
audio_wav = (librosa.util.normalize(audio) * maxv).astype(np.int16)
# Add one to the user-defined sequence length so each stored sequence holds
# the input window plus the frame to predict.
sequence_length = sequence_length + 1
# Build every overlapping window of sequence_length frames from the bank.
# FIX: replaced the append loop with an equivalent list comprehension.
sequences = [fft_bank[i:i + sequence_length]
             for i in range(len(fft_bank) - sequence_length)]
sequences = np.array(sequences)