def invert_to_sound(specname):
    """Invert a saved spectrogram to a waveform, plot it and save it as WAV.

    The array stored at ``specname`` is loaded with ``np.load`` and passed to
    ``utils.ispecgram``. The resulting signal is shown in a matplotlib window
    (``plt.show()`` is called, so the call waits on the figure in interactive
    backends) and then written to ``specname + 'Inverse.wav'`` with a sample
    rate of 16000.

    Args:
        specname: Path of the ``.npy`` file holding the spectrogram. It is
            also used as the prefix of the output WAV file name.
    """
    # Inversion settings, kept together so they are easy to compare with the
    # settings used when the spectrogram was produced.
    inversion_settings = dict(
        n_fft=512,
        hop_length=None,
        mask=True,
        log_mag=True,
        re_im=False,
        dphase=True,
        mag_only=False,
        num_iters=1000,
    )
    waveform = utils.ispecgram(np.load(specname), **inversion_settings)

    # Visual sanity check of the reconstructed signal.
    plt.plot(waveform)
    plt.show()

    # The waveform is written exactly as returned by the inversion; no
    # rescaling is applied here.
    output_path = specname + 'Inverse.wav'
    write(output_path, 16000, waveform)
# samples = sess.run(G_sample, feed_dict={Z: sample_Z(16, Z_dim)}) # # fig = plot(samples) # # plt.savefig('out/{}.png'.format(str(i).zfill(3)), bbox_inches='tight') # i += 1 # # plt.close(fig) # every 1000 iter test if it % 1000 == 0: samples = sess.run(G_sample, feed_dict={Z: sample_Z('3230')}) samples = np.transpose(samples) XXX = concat_to_spec(samples) ori = utils.ispecgram(XXX, n_fft=512, hop_length=None, mask=True, log_mag=True, re_im=False, dphase=True, mag_only=False, num_iters=1000) write('out/{}.wav'.format(str(i).zfill(3)), 16000, ori) i += 1 # print samples.shape # fig = plot(samples) # plt.savefig('out/{}.png'.format(str(i).zfill(3)), bbox_inches='tight') # i += 1 # plt.close(fig) V_mb, A_mb = data_gen.next() # print 'V_mb', V_mb
def _read_prob_file(path):
    """Read one ``.prob`` file into a 5-D numpy array.

    The file starts with five C ints (typecode ``"i"``) giving the array
    dimensions; the C-float payload (typecode ``"f"``) is read from byte
    offset 20.

    Args:
        path: Path of the ``.prob`` file.

    Returns:
        Array reshaped to the five dimensions stored in the header.
    """
    # ``with`` guarantees the handle is closed even if the file is truncated
    # and ``fromfile`` raises.
    with open(path, "rb") as handle:
        dims = array.array("i")
        dims.fromfile(handle, 5)
        # Position explicitly at the start of the payload.
        handle.seek(20, os.SEEK_SET)
        total = dims[0] * dims[1] * dims[2] * dims[3] * dims[4]
        values = array.array("f")
        values.fromfile(handle, total)
    return np.array(values).reshape(tuple(dims))


def main():
    """Turn per-frame ``.prob`` predictions into audio for a fixed video list.

    For every video index the function:

    1. changes into the video's data folder and reads every ``*.prob`` file
       (in sorted name order),
    2. takes, per file, the argmax over the 16 values of the first entry as
       the frame label,
    3. changes into the save folder, loads ``./spec.npy`` and
       ``./center.npy``, pads the label sequence by 7 on each side with its
       first/last value,
    4. maps labels to spectrogram columns with ``center_to_spec``, reshapes
       them with ``concat_to_spec``, inverts with ``utils.ispecgram`` and
       writes ``./VGG_out/inverse_<index>.wav``.

    The working directory is changed with ``os.chdir`` and is left at the
    save folder when the function returns.

    Raises:
        ValueError: If a data folder contains no ``.prob`` files.
    """
    video_all = [
        '3229', '3230', '3231', '3232', '3233', '3234', '3235', '3236', '3237'
    ]
    save_folder = '/home/yuanxin/code/bitplanes-tracking/'

    for video_index in video_all:
        data_folder = '/home/yuanxin/Downloads/output/test_' + video_index
        specname = './spec.npy'
        centername = './center.npy'

        # Relative glob below is resolved against the data folder.
        os.chdir(data_folder)
        feat_map = []
        for prob_file in sorted(glob.glob("*.prob")):
            print(prob_file)
            feat_map.append(_read_prob_file(prob_file))
        feat_map = np.array(feat_map)

        # One label per file: index of the largest of the 16 values in the
        # first entry of that file's feature map.
        max_index = [np.argmax(frame[0].reshape(16)) for frame in feat_map]
        if not max_index:
            # Fail with a clear message instead of an IndexError on label[0].
            raise ValueError('no .prob files found in ' + data_folder)

        # spec.npy / center.npy and the output path are relative to the
        # save folder.
        os.chdir(save_folder)
        audioname = './VGG_out/inverse_' + video_index + '.wav'
        X = np.load(specname)
        center = np.load(centername)

        # Extend the label track by 7 entries on each side, repeating the
        # first and last label. np.pad is the documented public entry point
        # for what the original reached through the np.lib.pad alias.
        label = np.pad(max_index, (7, 7), 'constant',
                       constant_values=(max_index[0], max_index[-1]))

        Y = center_to_spec(X[0:len(label)], center, label)
        LabelT = np.transpose(Y)
        print('LabelT %s' % (LabelT.shape,))

        # Reshape to the format expected by the spectrogram inversion.
        XXX = concat_to_spec(LabelT)
        print('XXX %s' % (XXX.shape,))

        ori = utils.ispecgram(XXX,
                              n_fft=512,
                              hop_length=None,
                              mask=True,
                              log_mag=True,
                              re_im=False,
                              dphase=True,
                              mag_only=False,
                              num_iters=1000)

        # NOTE(review): 15360 is used as the sample rate here; confirm this
        # is intentional and matches how spec.npy was produced.
        write(audioname, 15360, ori)
def _load_resampled_wav(path, label):
    """Read a WAV file, resample it to ``params.sample_rate`` and dump a copy.

    If the module-level ``total_seconds`` is truthy, the signal is first cut
    to that many seconds. The resampled signal is written to
    ``output/<label>.wav`` as int16.

    Args:
        path: Path of the WAV file to read.
        label: Short name used in the progress message and the dump file name.

    Returns:
        The resampled signal as a float32 array.
    """
    rate, audio = wavfile.read(path)
    if total_seconds:
        audio = audio[:rate * total_seconds]
    # Output length is proportional to the input length so the duration is
    # preserved exactly (no rounding down to whole seconds).
    target_len = int(1.0 * params.sample_rate / rate * len(audio))
    print(f'Resampling {label}')
    audio = resample(audio, target_len).astype(np.float32)
    librosa.output.write_wav(f'output/{label}.wav',
                             sr=params.sample_rate,
                             y=audio.astype(np.int16),
                             norm=False)
    return audio


def _magnitude_stft(audio, label):
    """Compute the magnitude spectrogram of ``audio`` and print its range.

    Args:
        audio: Signal passed to ``specgram``.
        label: Heading printed before the min/max/mean line.

    Returns:
        The spectrogram with its first two axes swapped and a new leading
        axis of length 1 added.
    """
    stft = specgram(audio,
                    n_fft=params.fft_size,
                    log_mag=audio_utils.log,
                    hop_length=256,
                    mag_only=True)
    # Drop everything but the first two axes, then swap them and add a
    # leading axis.
    stft = stft.reshape([stft.shape[0], stft.shape[1]])
    stft = stft.T[np.newaxis, :, :]
    print(f'{label}: ')
    print(
        f'Min: {np.min(stft):.4f}, Max: {np.max(stft):.4f}, Mean: {np.mean(stft):.4f}'
    )
    return stft


def _invert_and_write(result_stft, path, num_iters):
    """Invert ``result_stft`` to audio, clip, scale by ``amplitude`` and save.

    Args:
        result_stft: Spectrogram laid out as returned by ``_magnitude_stft``.
        path: Output WAV path.
        num_iters: Iteration count forwarded to ``ispecgram``.
    """
    spec = result_stft[0].T[:, :, np.newaxis]
    audio = ispecgram(spec,
                      params.fft_size,
                      hop_length=256,
                      log_mag=audio_utils.log,
                      mag_only=True,
                      num_iters=num_iters)
    audio = np.clip(audio, -1, 1)
    audio *= amplitude
    librosa.output.write_wav(path,
                             sr=params.sample_rate,
                             y=audio.astype(np.int16),
                             norm=False)


def time_windowed_stft_transfer(content_file,
                                style_file,
                                window_seconds=5,
                                hop_seconds=2.5):
    """Run STFT style transfer over a signal in overlapping time windows.

    Both inputs are resampled to ``params.sample_rate`` and converted to
    magnitude spectrograms. The content spectrogram is then processed for
    three epochs: in each epoch a window of ``window_seconds`` is moved
    along the time axis in steps of ``hop_seconds``; for each full window a
    style excerpt is chosen with ``get_best_style`` and
    ``StftTransfer.stft_transfer`` is run, and its output replaces that
    window in the running result. From the second epoch on the first window
    starts at a random offset in ``[0, hop]``.

    Files written (relative to the working directory):
    ``output/content.wav``, ``output/x.wav``, ``output/style.wav``,
    ``output/result0.wav`` .. ``output/result2.wav`` and
    ``output/result.wav``.

    Relies on the module-level names ``params``, ``total_seconds`` and
    ``amplitude``.

    Args:
        content_file: Path of the content WAV file.
        style_file: Path of the style WAV file.
        window_seconds: Length of one processing window, in seconds.
        hop_seconds: Step between consecutive windows, in seconds.
    """
    content = _load_resampled_wav(content_file, 'content')

    # x.wav is a dump of the (flattened) content signal; the random-noise
    # alternative that used to be computed here was immediately overwritten
    # and never used.
    librosa.output.write_wav('output/x.wav',
                             sr=params.sample_rate,
                             y=content.flatten().astype(np.int16),
                             norm=False)

    # The style length is computed with the same proportional formula as
    # the content. Previously it was floored to whole seconds, which
    # shortened (time-compressed) the style and produced an empty signal
    # for clips under one second.
    style = _load_resampled_wav(style_file, 'style')

    session = tf.Session()
    try:
        style_stft = _magnitude_stft(style, 'Style')
        content_stft = _magnitude_stft(content, 'Content')

        # Real copy: slicing a numpy array only yields a view, and the
        # result must be writable without touching the content target.
        result_stft = content_stft.copy()

        # Spectrogram frames per second of audio, used to convert the
        # window/hop durations into frame counts.
        frames_per_second = content_stft.shape[1] / (len(content) /
                                                     params.sample_rate)
        hop_samples = int(frames_per_second * hop_seconds)
        window_samples = int(frames_per_second * window_seconds)

        transferrer = StftTransfer(
            session,
            (content_stft.shape[0], window_samples, content_stft.shape[2]))

        print('Starting window shifting')
        for j in range(3):
            # Snapshot of the result at the start of the epoch; it stays
            # fixed while result_stft is updated window by window.
            content_stft = result_stft.copy()

            # First epoch starts at frame 0, later epochs at a random
            # offset.
            offset = 0 if j == 0 else random.randint(0, hop_samples)
            for i in range(offset, result_stft.shape[1], hop_samples):
                start_time = time.time()
                current_content = content_stft[:, i:i + window_samples, :]
                current_initial = result_stft[:, i:i + window_samples, :]
                # Skip the trailing partial window.
                if current_content.shape[1] != window_samples:
                    continue

                current_style = get_best_style(transferrer,
                                               current_initial.copy(),
                                               style_stft, hop_samples // 4)
                current_result_stft = transferrer.stft_transfer(
                    content_stft=current_content,
                    style_stft=current_style,
                    initial_stft=current_initial,
                    maxiter=20)
                result_stft[:, i:i + window_samples, :] = current_result_stft
                print(
                    f'Epoch {j} last sample: {(i/result_stft.shape[1]):.4f} elapsed time: {(time.time() - start_time):.4f}'
                )

            # Quick (20-iteration) inversion as a per-epoch preview.
            _invert_and_write(result_stft, f'output/result{j}.wav', 20)
            print(f'Written result{j}.wav')

        print('Inverting specgram')
        _invert_and_write(result_stft, 'output/result.wav', 100)
        print('Written result.wav')
    finally:
        # Release the resources held by the TensorFlow session.
        session.close()