# Shared imports for the three run() variants below (each variant lives in its
# own module in the repo, which is why all three share the name `run`).
import numpy as np
import librosa
import utils
from utils import chop, unchop


def run(content_fname,
        style_fname,
        output_fname,
        n_fft=2048,
        hop_length=256,
        alpha=0.02,
        n_layers=1,
        n_filters=8192,
        k_w=15,
        stride=1,
        iterations=300,
        phase_iterations=500,
        sr=22050,
        signal_length=1,  # seconds
        block_length=1024):
    """Spectrogram model: stylize log-magnitude spectra, then recover phase."""
    content, sr = read_audio_spectum(
        content_fname, n_fft=n_fft, hop_length=hop_length, sr=sr)
    style, sr = read_audio_spectum(
        style_fname, n_fft=n_fft, hop_length=hop_length, sr=sr)

    # Truncate both spectrograms to the shorter one so they align frame-for-frame.
    n_frames = min(content.shape[0], style.shape[0])
    n_samples = content.shape[1]
    content = content[:n_frames, :]
    style = style[:n_frames, :]

    content_features, style_features, kernels = compute_features(
        content=content,
        style=style,
        stride=stride,
        n_layers=n_layers,
        n_filters=n_filters,
        k_w=k_w)

    result = compute_stylization(
        kernels=kernels,
        n_samples=n_samples,
        n_frames=n_frames,
        content_features=content_features,
        style_features=style_features,
        stride=stride,
        n_layers=n_layers,
        alpha=alpha,
        iterations=iterations)

    # Invert the log1p magnitude representation back to linear magnitudes.
    mags = np.zeros_like(content.T)
    mags[:, :n_frames] = np.exp(result[0, 0].T) - 1

    # Griffin-Lim: start from random phase and iteratively refine it.
    p = 2 * np.pi * np.random.random_sample(mags.shape) - np.pi
    for i in range(phase_iterations):
        S = mags * np.exp(1j * p)
        x = librosa.istft(S, hop_length=hop_length)
        p = np.angle(librosa.stft(x, n_fft=n_fft, hop_length=hop_length))

    librosa.output.write_wav('prelimiter.wav', x, sr)
    limited = utils.limiter(x)
    librosa.output.write_wav(output_fname, limited, sr)
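# `read_audio_spectum` is defined elsewhere in the repo; the `np.exp(...) - 1`
# inversion above implies it returns log1p-compressed STFT magnitudes with
# frames on the first axis. A minimal sketch under that assumption (not the
# repo's exact implementation):
def read_audio_spectum(filename, n_fft=2048, hop_length=256, sr=22050):
    """Load audio; return (log1p magnitude spectrogram [frames, bins], sr)."""
    x, sr = librosa.load(filename, sr=sr)
    S = librosa.stft(x, n_fft=n_fft, hop_length=hop_length)
    return np.log1p(np.abs(S)).T, sr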
def run(content_fname,
        style_fname,
        output_fname,
        n_fft=4096,
        n_layers=1,
        n_filters=4096,
        hop_length=256,
        alpha=0.05,
        k_w=15,
        k_h=3,
        optimizer='bfgs',
        stride=1,
        iterations=300,
        sr=22050):
    """Time-domain model: stylize overlapping audio frames directly."""
    frame_size = n_fft // 2

    audio, fs = librosa.load(content_fname, sr=sr)
    content = chop(audio, hop_size=hop_length, frame_size=frame_size)
    audio, fs = librosa.load(style_fname, sr=sr)
    style = chop(audio, hop_size=hop_length, frame_size=frame_size)

    # Trim both signals to a common number of frames and samples per frame.
    n_frames = min(content.shape[0], style.shape[0])
    n_samples = min(content.shape[1], style.shape[1])
    content = content[:n_frames, :n_samples]
    style = style[:n_frames, :n_samples]

    content_features, style_gram, kernels, freqs = compute_features(
        content=content,
        style=style,
        stride=stride,
        n_fft=n_fft,
        n_layers=n_layers,
        n_filters=n_filters,
        k_w=k_w,
        k_h=k_h)

    result = compute_stylization(
        kernels=kernels,
        freqs=freqs,
        n_samples=n_samples,
        n_frames=n_frames,
        n_fft=n_fft,
        content_features=content_features,
        style_gram=style_gram,
        stride=stride,
        n_layers=n_layers,
        alpha=alpha,
        optimizer=optimizer,
        iterations=iterations)

    # Overlap-add the stylized frames back into a single waveform.
    s = unchop(result, hop_size=hop_length, frame_size=frame_size)
    librosa.output.write_wav(output_fname, s, sr=sr)

    # Also write a peak-limited version to avoid clipping.
    s = utils.limiter(s)
    librosa.output.write_wav(output_fname + '.limiter.wav', s, sr=sr)
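# `chop` / `unchop` come from utils; a minimal sketch assuming rectangular
# framing with plain overlap-add reconstruction (the repo version may window
# or normalize each frame differently):
def chop(signal, hop_size=256, frame_size=512):
    """Slice a 1-D signal into overlapping frames, one every `hop_size` samples."""
    n_hops = (len(signal) - frame_size) // hop_size + 1
    return np.array([signal[i * hop_size:i * hop_size + frame_size]
                     for i in range(n_hops)])


def unchop(frames, hop_size=256, frame_size=512):
    """Overlap-add frames back into a 1-D signal.

    Note: ignores the amplitude scaling introduced by overlapping frames.
    """
    out = np.zeros((len(frames) - 1) * hop_size + frame_size)
    for i, frame in enumerate(frames):
        out[i * hop_size:i * hop_size + frame_size] += frame
    return out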
def run(content_fname,
        style_fname,
        output_path,
        model,
        iterations=100,
        sr=16000,
        hop_size=512,
        frame_size=2048,
        alpha=1e-3):
    """WaveNet model: stylize activations of a pretrained encoder or decoder."""
    content, fs = librosa.load(content_fname, sr=sr)
    style, fs = librosa.load(style_fname, sr=sr)

    # Round down to a multiple of 512 samples so both signals chop evenly.
    n_samples = (min(content.shape[0], style.shape[0]) // 512) * 512
    content = utils.chop(content[:n_samples], hop_size, frame_size)
    style = utils.chop(style[:n_samples], hop_size, frame_size)

    if model == 'encoder':
        content_features, style_features = compute_wavenet_encoder_features(
            content=content, style=style)
        result = compute_wavenet_encoder_stylization(
            n_frames=content_features[0].shape[0],
            n_samples=frame_size,
            alpha=alpha,
            content_features=content_features,
            style_features=style_features,
            iterations=iterations)
    elif model == 'decoder':
        content_features, style_features = compute_wavenet_decoder_features(
            content=content, style=style)
        result = compute_wavenet_decoder_stylization(
            n_frames=content_features[0].shape[0],
            n_samples=frame_size,
            alpha=alpha,
            content_features=content_features,
            style_features=style_features,
            iterations=iterations)
    else:
        raise ValueError('Unsupported model type: {}.'.format(model))

    x = utils.unchop(result, hop_size, frame_size)
    librosa.output.write_wav('prelimiter.wav', x, sr)

    limited = utils.limiter(x)
    output_fname = '{}/{}+{}.wav'.format(output_path,
                                         content_fname.split('/')[-1],
                                         style_fname.split('/')[-1])
    librosa.output.write_wav(output_fname, limited, sr=sr)
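# Hypothetical invocation of the WaveNet variant (file paths and the output
# directory are placeholders, not files shipped with the repo):
if __name__ == '__main__':
    run(content_fname='content.wav',
        style_fname='style.wav',
        output_path='results',
        model='encoder')  # or model='decoder' to match decoder activations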