Example #1
 def process_sounds(self):
     '''
     Processes downloaded files below self.root after running download_files().
     DEPRECATED Don't use this for the pretrained VGGish!
     TODO: this should go to preprocessing if kept at all
     '''
     self.info_df = self.df[['gen', 'id']].copy()
     for path, dirs, files in os.walk(self.root):
         for file in files:
             if file.endswith('.mp3'):
                 y, sr = load(os.path.join(path, file))
                 if self.convert_to_wav:
                     write_wav(
                         os.path.join(path, file.replace('.mp3', '.wav')),
                         y, self.input_sr)
                 if self.make_mel_spec:
                     S = librosa.feature.melspectrogram(
                         y,
                         sr=self.sr,
                         n_mels=self.n_mels,
                         hop_length=self.hop_length)
                     log_S = librosa.amplitude_to_db(S, ref=np.max)
                     np.save(os.path.join(path, 'mel_spec.npy'), log_S)
                     if self.save_img:
                         scipy.misc.imsave(
                             os.path.join(path, 'mel_spec.jpg'), log_S)
                     if self.extract_chunks:
                         if log_S.shape[1] < self.len_chunks:
                              print(
                                  'recording {} has length {} which is shorter '
                                  'than required chunk length.'.format(
                                      file, log_S.shape[1]))
                              continue
                         self.spec_chunks(log_S, path=path)
     self.info_df.to_csv(os.path.join(self.root, 'info.csv'), sep='\t')
Example #2
def reconstruct(spectrogram):
    # remove the padding from the speech
    spectrogram = spectrogram[:feature_size, :feature_size].transpose()
    # combine the real and imaginary components into a complex spectrogram
    spectrogram = spectrogram[:257, :] + 1j * spectrogram[257:, :]
    # re-construct audio from spectrogram
    wav = istft(spectrogram)
    write_wav("target.wav", wav, sr=44100)
Example #3
    def __call__(self, n_samples, sample_length, cond, speaker):
        print('Generate', n_samples, 'of length', sample_length)
        samples = self.generate(n_samples, sample_length, cond,
                                speaker).cpu().numpy()
        for i in range(n_samples):
            print(self.filename)

            write_wav(self.filename, samples[i, :], sr=self.sample_rate)
Example #4
def test(direction=direction, began=began, model_dir=model_dir, test_dir=test_dir, sr=sr, n_features=n_features, frame_period=frame_period):
    
    outputs_dir = "./sample"
    
    if began:
        model = CycleBeGAN(num_features=n_features, mode="test")
        model.load(os.path.join(model_dir, "Cycle_BeGan"))
    else:
        model = CycleGAN(num_features=n_features, mode="test")
        model.load(os.path.join(model_dir, "CycleGan"))
    
    mcep = np.load(os.path.join("./", 'mcep.npz'))
    mcep_mean_A = mcep['A_mean']
    mcep_std_A = mcep['A_std']
    mcep_mean_B = mcep['B_mean']
    mcep_std_B = mcep['B_std']

    logf0s = np.load(os.path.join("./", 'logf0s.npz'))
    logf0s_mean_A = logf0s['A_mean']
    logf0s_std_A = logf0s['A_std']
    logf0s_mean_B = logf0s['B_mean']
    logf0s_std_B = logf0s['B_std']
    
    if not os.path.exists(outputs_dir) :
        os.mkdir(outputs_dir)
    
    file_list = librosa.util.find_files(test_dir,ext="wav")
    
    for file in file_list :
        wav,_ = load(file, sr=sr)
        wav = wav_padding(wav = wav, sr = sr, frame_period = frame_period, multiple = 4)
        f0, timeaxis, sp, ap = world_decompose(wav = wav, fs = sr, frame_period = frame_period)
        coded_sp = world_encode_spectral_envelop(sp = sp, fs = sr, dim = n_features)
        coded_sp_transposed = coded_sp.T
        
        if direction == "A2B" :
            f0_converted = pitch_conversion(f0 = f0, mean_log_src = logf0s_mean_A, std_log_src = logf0s_std_A, mean_log_target = logf0s_mean_B, std_log_target = logf0s_std_B)
            #f0_converted = f0
            coded_sp_norm = (coded_sp_transposed - mcep_mean_A) / mcep_std_A
            coded_sp_converted_norm = model.test(inputs = np.array([coded_sp_norm]), direction = direction)[0]
            coded_sp_converted = coded_sp_converted_norm * mcep_std_B + mcep_mean_B
        else : # B2A
            f0_converted = pitch_conversion(f0 = f0, mean_log_src = logf0s_mean_B, std_log_src = logf0s_std_B, mean_log_target = logf0s_mean_A, std_log_target = logf0s_std_A)
            #f0_converted = f0
            coded_sp_norm = (coded_sp_transposed - mcep_mean_B) / mcep_std_B
            coded_sp_converted_norm = model.test(inputs = np.array([coded_sp_norm]), direction = direction)[0]
            coded_sp_converted = coded_sp_converted_norm * mcep_std_A + mcep_mean_A
            
        coded_sp_converted = coded_sp_converted.T
        coded_sp_converted = np.ascontiguousarray(coded_sp_converted)
        decoded_sp_converted = world_decode_spectral_envelop(coded_sp = coded_sp_converted, fs = sr)
        wav_transformed = world_speech_synthesis(f0 = f0_converted, decoded_sp = decoded_sp_converted, ap = ap, fs = sr, frame_period = frame_period)
        write_wav(os.path.join(outputs_dir, os.path.basename(file)), wav_transformed, sr)


if __name__ == "__main__" :
    test(direction = direction)
    print("Done!")
Example #5
def separate(PATH_INPUT, PATH_OUTPUT, MODEL, SR=16000, FFT_SIZE=1024, H=512):
    
    if os.path.isdir(PATH_INPUT):
        # If the input is a directory, build a list of files
        filelist_mixdown = find_files(PATH_INPUT, ext="wav", case_sensitive=True)
    else:
        # If the input is a single file
        filelist_mixdown = [PATH_INPUT]
    print ('number of mixdown file', len(filelist_mixdown))
    
    # Create the output directory if it does not exist
    _, path_output_ext = os.path.splitext(PATH_OUTPUT)
    print ('path_output_ext',path_output_ext)
    if len(path_output_ext)==0  and  not os.path.exists(PATH_OUTPUT):
        os.mkdir(PATH_OUTPUT)
    
    # Load the model
    unet = train.UNet()
    chainer.serializers.load_npz( MODEL,unet)
    config.train = False
    config.enable_backprop = False
    
    # Load each mixture and try to separate the vocal (speech) part
    for fmixdown in filelist_mixdown:
        # If audioread raises an error, fall back to scipy
        try:
            y_mixdown, _ = load(fmixdown,  sr=SR, mono=True)
        except Exception:
            sr_mixdown, y_mixdown = read(fmixdown)
            if not sr_mixdown == SR:
                y_mixdown = resample(y_mixdown, sr_mixdown, SR)
        
        # Compute the short-time spectrum of the input and normalize it
        spec = stft(y_mixdown, n_fft=FFT_SIZE, hop_length=H, win_length=FFT_SIZE)
        mag = np.abs(spec)
        mag /= np.max(mag)
        phase = np.exp(1.j*np.angle(spec))
        print ('mag.shape', mag.shape)  
        start = 0
        end = 128 * (mag.shape[1] // 128)  # must not exceed the number of input frames; choose a value appropriate for the network definition
        # Estimate the mask used to separate the speech (vocal)
        mask = unet(mag[:, start:end][np.newaxis, np.newaxis, 1:, :]).data[0, 0, :, :]
        mask = np.vstack((np.zeros(mask.shape[1], dtype="float32"), mask))
        # Apply the mask to the input spectrum and resynthesize the waveform with the inverse STFT
        mag2 = mag[:, start:end] * mask
        phase2 = phase[:, start:end]
        y = istft(mag2*phase2, hop_length=H, win_length=FFT_SIZE)
        
        # Save the separated speech (vocal) to an output file
        if len(path_output_ext) == 0:
            # Output into the directory
            foutname, _ = os.path.splitext(os.path.basename(fmixdown))
            fname = os.path.join(PATH_OUTPUT, foutname + '.wav')
        else:
            # Output to the specified file
            fname = PATH_OUTPUT
        print ('saving... ', fname)
        write_wav(fname, y, SR, norm=True)
Example #6
 def epoch(self, epoch_index):
     samples = self.generate(self.n_samples, self.sample_length) \
                   .cpu().float().numpy()
     for i in range(self.n_samples):
         write_wav(os.path.join(self.samples_path,
                                self.pattern.format(epoch_index, i + 1)),
                   samples[i, :],
                   sr=self.sample_rate,
                   norm=True)
Example #7
    def process_single_file(self, file_name):
        mixture, _ = load(os.path.join(self.input_dir, file_name + '.wav'),
                          sr=self.samplerate_hz)
        speaker_signals = self.separate_single_mixture(mixture)

        write_wav(os.path.join(self.output_dir, 's1', file_name + '.wav'), \
                  speaker_signals[0, :], self.samplerate_hz, norm=True)
        write_wav(os.path.join(self.output_dir, 's2', file_name + '.wav'), \
                  speaker_signals[1, :], self.samplerate_hz, norm=True)
Example #8
def generate_autodrive(vae,
                       dataset,
                       n_files=1,
                       out=None,
                       preprocessing=None,
                       transformOptions=None,
                       start="random",
                       n_start=10,
                       n_loops=10,
                       projections=None):
    for j in range(n_files):
        check_dir('%s/autodrive' % out)
        #sequence_length = loaded_data['script_args'].sequence
        # get starting point
        device = next(vae.parameters()).device
        if start == "file":
            input_file = random.randrange(len(dataset))
            data_in, _ = dataset[input_file]
            data_in = vae.format_input_data(preprocessing(
                data_in[:n_start])).unsqueeze(0)
            z_in = vae.encode(data_in)[-1]['out_params'].mean
        elif start == "random":
            latent_in = vae.platent[-1]['dim']
            # draw random point
            z0 = torch.distributions.Normal(torch.zeros(1, latent_in),
                                            torch.ones(1, latent_in)).sample()
            # draw direction
            u = torch.distributions.Normal(torch.zeros(1, latent_in),
                                           torch.ones(1, latent_in)).sample()
            increments = torch.linspace(0, 1e-1, n_start).unsqueeze(0)
            z_in = (z0 + increments.t() @ u).unsqueeze(0).to(device=device)

        with torch.no_grad():
            for n in range(n_loops):
                prediction_out = vae.prediction_module({'z_enc': [z_in]})
                z_in = torch.cat([z_in, prediction_out['out']], 1)
            data_out = vae.decode(z_in)[0]["out_params"].mean
        data_out = data_out.squeeze().cpu()

        # plot things
        fig = plt.figure()
        if len(data_out.shape) == 1:
            plt.plot(data_out)
        else:
            plt.imshow(data_out, aspect="auto")
        fig.savefig('%s/autodrive/drive_%d.pdf' % (out, j), format="pdf")
        plt.close('all')

        signal_out = inverseTransform(
            preprocessing.invert(data_out.squeeze().cpu().detach().numpy()),
            'stft', {'transformParameters': transformOptions},
            iterations=10,
            method='griffin-lim')
        write_wav('%s/autodrive/drive_%d.wav' % (out, j),
                  signal_out,
                  transformOptions.get('resampleTo', 22050),
                  norm=True)
Example #9
def main(infile, outfile):
    fftsize = 1024
    hopsize = 256

    data, sr = load(infile, sr=None)

    spec = stft(data, fftsize, hopsize, 'hanning')
    output = istft(spec, hopsize, 'hanning')
    write_wav(outfile, output, sr)
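Note that stft, istft, and write_wav here are positional-argument helpers from the project's own DSP module, not librosa's keyword-based API. For comparison, a rough equivalent of the same analysis/resynthesis round trip using librosa itself (librosa < 0.8, where librosa.output.write_wav still exists) might look like:

import librosa
import librosa.output  # removed in librosa 0.8; use soundfile.write on newer versions

def roundtrip(infile, outfile, n_fft=1024, hop_length=256):
    # Load at the file's native sample rate, then analyze and resynthesize with a Hann window.
    data, sr = librosa.load(infile, sr=None)
    spec = librosa.stft(data, n_fft=n_fft, hop_length=hop_length, window='hann')
    output = librosa.istft(spec, hop_length=hop_length, window='hann')
    librosa.output.write_wav(outfile, output, sr)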
Example #10
def main(infile, outfile, dur):
    fftsize = 1024
    hopsize = 256

    data, sr = load(infile, sr=None, duration=dur)

    pv = pva(data, fftsize, hopsize, sr, 'hanning')
    y = pvs(pv, hopsize, sr, 'hanning')

    write_wav(outfile, y, sr)
Example #11
    def output_wav(self, folder, filename):
        '''
        Small function to output a Load or Slice to a wav file

        :param folder: (string)   | the folder to output to
        :param filename: (string) | filename to output as
        :return:
        '''
        audio = os.path.join(folder, filename)
        output.write_wav(audio, self.y, self.sr)
Example #12
def callOnFile(wav, commands, wavPath, melPath, scKwargs={}):
    '''Write <wav> to <wavPath>, call <commands>, then collect the results from <melPath>.'''
    if not VERBOSE_OUT:
        scKwargs['stdout'] = subprocess.DEVNULL
    if not VERBOSE_ERR:
        scKwargs['stderr'] = subprocess.DEVNULL
    write_wav(wavPath, wav[0], wav[1])
    ret = subprocess.call(commands, **scKwargs)
    assert ret == 0, f'return value: {ret} != 0'
    times, pitches = load_time_series(melPath, delimiter=r'\s+|,')
    return times, pitches
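A hedged usage sketch; the command line, temporary file names, and the pitch-tracker binary below are placeholders, not the tool the original project actually wraps:

# Hypothetical usage: run an external pitch tracker on an in-memory waveform.
samples, sr = load('query.wav', sr=None)
times, pitches = callOnFile(
    wav=(samples, sr),                                        # (waveform, sample rate) pair
    commands=['pitch_tracker', 'tmp.wav', '-o', 'tmp.csv'],   # placeholder command line
    wavPath='tmp.wav',                                        # waveform is written here first
    melPath='tmp.csv',                                        # the tracker's output is read from here
)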
Example #13
def wav_writer(samples,
               sample_rate,
               suffix,
               orig,
               newdir=None,
               subdir=None,
               verbose=True):
    '''
    Saves a wav in same place as original .wav file
    
    Inputs:
        samples: new samples to save
        sample_rate: sample rate at which to save the wav
        orig: original filename of .wav file
        suffix: suffix for the new filename
        newdir: a new directory to use instead of the original wav file's path
        subdir: name of a subdirectory to make in the original or new directory
        verbose: whether or not to print filename
        
    Returns:
        the new filename
    '''

    filesplit = os.path.split(orig)

    # Get the path in which to save the wav
    if newdir:
        base_path = newdir
    else:
        base_path = filesplit[0]  #Same path as the original file

    if subdir:
        base_path = os.path.join(base_path, subdir)

    # Make path if necessary
    try:
        os.mkdir(base_path)
    except FileExistsError:
        pass

    # Get the name by which to save the wav
    file_name = filesplit[1]
    base_name = f'{os.path.splitext(file_name)[0]}_{suffix}.wav'

    # Full path & filename by which wav should be saved
    file_path = os.path.join(base_path, base_name)

    try:
        write_wav(file_path, np.array(samples), sample_rate)
    except ParameterError:  # librosa.util.exceptions.ParameterError
        print(f'Skipping {file_path} due to ParameterError')

    if verbose: print(f'Saved files to {file_path}')

    return file_path
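A hedged usage sketch; the paths, suffix, and sample rate below are made up for illustration:

# Hypothetical usage: save a filtered copy of a recording into a "processed"
# subdirectory next to the original, e.g. /data/recordings/processed/bird01_filtered.wav
new_path = wav_writer(
    samples=filtered_samples,            # NumPy array of audio samples
    sample_rate=22050,
    suffix='filtered',
    orig='/data/recordings/bird01.wav',
    subdir='processed',
)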
Example #14
    def _get_batches_of_transformed_samples(self, index_array):
        print("Batch index:", self.batch_index)

        index_array.sort()
        # find max size in batch
        filtered_df = self.dataframe_data.loc[self.dataframe_data.index.isin(
            index_array)]
        bigfile_in_batch = filtered_df.loc[filtered_df[2].idxmax()]
        max_audiosize_in_batch = int(bigfile_in_batch[2])

        # when stretching slows the audio down, scale max_audiosize_in_batch by the stretch rate
        if self.audio_data_generator.stretch and (
                self.audio_data_generator.stretch < 1):
            max_audiosize_in_batch = int(
                max_audiosize_in_batch *
                (1 + self.audio_data_generator.stretch))

        # when a shift is applied, adjust max_audiosize_in_batch accordingly
        if self.audio_data_generator.shift:
            _, max_sr = load_audio(bigfile_in_batch[1])
            max_audiosize_in_batch = int(max_audiosize_in_batch +
                                         (self.audio_data_generator.shift *
                                          max_sr))

        batch_x = np.zeros((len(index_array), ) + (max_audiosize_in_batch, ),
                           dtype=backend.floatx())
        batch_y = [0] * len(index_array)

        for i, j in enumerate(index_array):
            current_audiofile = self.dataframe_data.iloc[j]
            y = current_audiofile[0]
            x, sr = load_audio(current_audiofile[1])

            if len(x) < max_audiosize_in_batch:
                x = np.pad(x,
                           (0, max(0, int(max_audiosize_in_batch - len(x)))),
                           "constant",
                           constant_values=(self.stuffing))

            x, sr = self.audio_data_generator.transform(x, sr)

            # optionally save augmented audio to disk for debugging purposes
            if self.save_to_dir:
                fname = '{prefix}_{index}_{hash}.wav'.format(
                    prefix=self.save_prefix,
                    index=j,
                    hash=np.random.randint(1e7))
                write_wav(os.path.join(self.save_to_dir, fname), x, sr)

            batch_x[i] = x
            batch_y[i] = y

        return batch_x, batch_y
Example #15
def main(infile, outfile, pitch, scale, transpose, dur_ratio):
    fftsize = 1024
    hopsize = 256

    data, sr = load(infile, sr=None)

    pv = ifd(data, fftsize, hopsize, sr, 'hanning')
    if transpose:
        pv = np.flip(pv, 1)
    output = addsyn(pv, 0, pitch, scale, int(hopsize * dur_ratio), sr)

    write_wav(outfile, output, sr)
Example #16
 def epoch(self, epoch_index):
     samples = self.generate(self.n_samples, self.sample_length) \
                   .cpu().float().numpy()
     for i in range(self.n_samples):
         if self.save_raw:
             samples.tofile('debug_seq_{}.csv'.format(epoch_index),
                            sep=',',
                            format='%10.5f')
         write_wav(os.path.join(self.samples_path,
                                self.pattern.format(epoch_index, i + 1)),
                   samples[i, :],
                   sr=self.sample_rate,
                   norm=True)
Example #17
    def guess(self):
        wav, _ = load(self.WAVE_OUTPUT_FILENAME, sr=self.sr)
        wav, _ = trim(wav, top_db=self.top_db)
        write_wav(self.WAVE_OUTPUT_FILENAME, wav, self.sr)
        print(">> save as", self.WAVE_OUTPUT_FILENAME)

        #dtw recognition
        x = self.getMfcc(wav, self.sr)
        res = self.recognition(x)
        print(res)

        self.audio_num = self.audio_num + 1
        self.WAVE_OUTPUT_FILENAME = "./saved/" + str(self.audio_num) + ".wav"
Example #18
def separate_whole_audio_data():
    unet = analyze.UNet()
    mag, phase, length = analyze.load_audio('audio/wav/fhana.wav')
    data = [[], []]
    start = time()
    for i in range(0, mag.shape[1], 1024):
        mask = analyze.compute_mask(unet, mag[:, i:i+1024])
        data[0].extend(analyze.save_audio(mag[:, i:i+1024]*mask, phase[:, i:i+1024]))
        data[1].extend(analyze.save_audio(mag[:, i:i+1024]*(1-mask), phase[:, i:i+1024]))
    from librosa.output import write_wav
    for i in range(2):
        write_wav('data{0}.wav'.format(i), np.array(data[i][:length]), 16000, norm=True)
    print(time() - start)
Example #19
def create_sin_wave_data(data_dir, num_files, seq_len, sample_rate=16000, save_raw=False):

    os.makedirs(data_dir, exist_ok=True)
    dataset = get_batch(num_files, seq_len)
    for i in range(num_files):
        if save_raw:
            dataset[i].tofile('sin_{}.csv'.format(i),
                            sep=',', format='%7.5f')
        write_wav(
            os.path.join(
                data_dir, 'sin_{}.wav'.format(i)
            ),
            dataset[i, :], sr=sample_rate, norm=False
        )
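get_batch is not shown above; a minimal stand-in that returns one random-frequency sine wave per row (illustrative only, not the original implementation) could be:

import numpy as np

def get_batch(num_files, seq_len, sample_rate=16000, freq_range=(100.0, 1000.0)):
    # One sine wave per file, each with a random frequency drawn from freq_range.
    freqs = np.random.uniform(*freq_range, size=num_files)
    t = np.arange(seq_len) / sample_rate
    return np.sin(2 * np.pi * freqs[:, None] * t[None, :]).astype(np.float32)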
Example #20
def save_sound(output_path, waveform, sample_rate, normalize=True):
    # save waveform to a .wav sound file
    # example:
    #   save_sound('../output/output.wav', waveform, sample_rate)

    # ensure that output_dir exists
    Path(output_path).parent.mkdir(parents=True, exist_ok=True)

    from librosa.output import write_wav
    write_wav(output_path, waveform, sr=sample_rate, norm=normalize)

    print(output_path, 'saved')

    return
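librosa.output (and with it write_wav) was removed in librosa 0.8.0. On newer versions, a drop-in replacement for the save above is soundfile.write; a sketch, assuming waveform is a floating-point NumPy array:

import soundfile as sf

# Equivalent save on librosa >= 0.8, where librosa.output no longer exists.
# soundfile does not normalize, so scale the waveform beforehand if needed.
sf.write(output_path, waveform, samplerate=sample_rate)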
Example #21
 def epoch(self, epoch_index):
     samples = self.generate(self.n_samples, self.sample_length) \
                   .cpu().float().numpy()
     print("__epoch__")
     print(self.trainer.stats)
     for i in range(self.n_samples):
         file_path = os.path.join(
             self.samples_path,
             sample_file_path(
                 epoch_index, self.trainer.iterations,
                 self.trainer.stats["training_loss"]["last"].tolist(), i))
         write_wav(file_path, samples[i, :], sr=self.sample_rate, norm=True)
         if self._upload is not None:
             self._upload(file_path)
Example #22
def SaveStereoAudio(fname, mag, phase, norm=True, save_path=None):
    y_l = istft(mag[0] * phase[0],
                hop_length=C.H,
                win_length=C.FFT_SIZE,
                window=C.WINDOW)
    y_r = istft(mag[1] * phase[1],
                hop_length=C.H,
                win_length=C.FFT_SIZE,
                window=C.WINDOW)
    stereo = np.array((y_l, y_r))
    if save_path is None:
        write_wav(C.PATH_MUSIC / fname, stereo, C.SR, norm=norm)
    else:
        write_wav(save_path / fname, stereo, C.SR, norm=norm)
Example #23
 def epoch(self, epoch_index):
     samples = self.generate(self.n_samples, self.sample_length) \
                   .cpu().float().numpy()
     print("__epoch__")
     print(self.trainer.stats)
     for i in range(self.n_samples):
         write_wav(os.path.join(
             self.samples_path,
             self.pattern.format(
                 epoch_index, self.trainer.iterations,
                 self.trainer.stats["training_loss"]["last"].tolist(),
                 i + 1)),
                   samples[i, :],
                   sr=self.sample_rate,
                   norm=True)
Example #24
def test():
    vis = Visualizer(env='svs')
    model = getattr(models, 'Unet')().eval()
    #    model.cuda()
    model.load_state_dict(
        t.load('G:/Unet_svs/check/epoch_219__0724_16_57_35.pth'))
    mix_wav, _ = load("C:/Users/lenovo/Music/c.mp3", sr=8192)
    mix_wav_mag, mix_wav_phase = magphase(
        stft(mix_wav, n_fft=1024, hop_length=768))
    START = 700
    END = START + 128

    mix_wav_mag = mix_wav_mag[:, START:END]
    mix_wav_phase = mix_wav_phase[:, START:END]

    print(mix_wav_mag.shape)

    gg = mix_wav_mag[1:]
    gg = t.from_numpy(gg)
    gg.unsqueeze_(0)
    gg.unsqueeze_(0)
    vis.img('a', gg)
    print(gg.shape)
    with t.no_grad():
        gg = Variable(gg)
    score = model(gg)
    predict = gg.data * score.data
    print(predict.shape)
    target_pred_mag = predict.view(512, 128).cpu().numpy()
    target_pred_mag = np.vstack((np.zeros((128)), target_pred_mag))
    vis.img('b', t.from_numpy(target_pred_mag))
    print(target_pred_mag.shape)
    write_wav(
        f'C:/Users/lenovo/Music/pred_vocal.wav',
        istft(
            target_pred_mag * mix_wav_phase
            #     (mix_wav_mag * target_pred_mag) * mix_wav_phase
            ,
            win_length=1024,
            hop_length=768),
        8192,
        norm=True)
    write_wav(f'C:/Users/lenovo/Music/pred_mix.wav',
              istft(mix_wav_mag * mix_wav_phase,
                    win_length=1024,
                    hop_length=768),
              8192,
              norm=True)
Example #25
def speedyspeech_tts(text_str, device_str):
    print('Loading model checkpoints')
    m = SpeedySpeech(device=device_str).load('models/speedyspeech.pth',
                                             device_str)
    m.eval()

    checkpoint = torch.load('models/melgan.pth', device_str)
    hp = HParam("mikuai/speedyspeech/melgan/config/default.yaml")
    melgan = Generator(hp.audio.n_mel_channels).to(device_str)
    melgan.load_state_dict(checkpoint["model_g"])
    melgan.eval(inference=False)

    print('Processing text')
    txt_processor = TextProcessor(HPText.graphemes,
                                  phonemize=HPText.use_phonemes)
    text = [text_str]

    phonemes, plen = txt_processor(text)
    # append more zeros - avoid cutoff at the end of the largest sequence
    phonemes = torch.cat((phonemes, torch.zeros(len(phonemes), 5).long()),
                         dim=-1)
    phonemes = phonemes.to('cpu')

    print('Synthesizing')
    # generate spectrograms
    with torch.no_grad():
        spec, durations = m((phonemes, plen))

    # invert to log(mel-spectrogram)
    spec = m.collate.norm.inverse(spec)

    # mask with pad value expected by MelGan
    msk = mask(spec.shape, durations.sum(dim=-1).long(), dim=1).to('cpu')
    spec = spec.masked_fill(~msk, -11.5129)

    # Append more pad frames to improve end of the longest sequence
    spec = torch.cat((spec.transpose(
        2, 1), -11.5129 * torch.ones(len(spec), HPStft.n_mel, 5).to('cpu')),
                     dim=-1)

    # generate audio
    with torch.no_grad():
        audio = melgan(spec).squeeze(1)

    print('Saving audio')
    # TODO: cut audios to proper length
    for i, a in enumerate(audio.detach().cpu().numpy()):
        write_wav(('output.wav'), a, HPStft.sample_rate, norm=False)
Example #26
def main(infile, outfile):
    fftsize = 1024
    hopsize = 256

    data, sr = load(infile, sr=None)

    spec = stft(data, fftsize, hopsize, 'hanning')

    fr = sr / hopsize
    twopi = np.pi * 2
    delay = 0.0055 + 5e-3 * np.cos(np.arange(spec.shape[1]) * twopi * 0.1 / fr)

    for i in range(spec.shape[1]):
        spec[:, i] = specomb(spec[:, i], 0.6, delay[i], 0.9, sr)
    output = istft(spec, hopsize, 'hanning')
    write_wav(outfile, output, sr)
Example #27
def prepare_wav(track_list, sub_dirname, args):

    dirpath = os.path.join(args.dst_dir, sub_dirname)
    if not os.path.exists(dirpath):
        os.makedirs(dirpath)

    for track in track_list:
        name = track.name
        rate = track.rate

        vocal = monauralize(track.sources['vocals'].audio)
        mix = monauralize(track.audio)

        path = os.path.join(dirpath, name)
        print(path)
        write_wav(path + '.wav', np.stack((vocal, mix)), rate)
Example #28
def make_tone(x, f1, f2):
    y = gen_tone(x, f1, f2)
    tmpfn = "tmp.wav"

    tone_fn = path.join('wavs', 'tone%i.wav' % i)
    write_wav(tone_fn, y, SR, norm=True)

    reverbed_fn = path.join('wavs', 'reverbed_tone%i.wav' % i)
    cmd = "sox {} {} gain -3 reverb".format(tone_fn, reverbed_fn)
    check_call(cmd, shell=True)

    combined_fn = path.join('wavs', 'combined_tone%i.wav' % i)
    cmd = "sox {} {} pad 1 0".format(tone_fn, tmpfn)
    check_call(cmd, shell=True)
    cmd = "sox -m {} {} {}".format(BASE_FN, tmpfn, combined_fn)
    check_call(cmd, shell=True)
Example #29
def generate(output_path=OUTPUT_PATH,
             summary_path=SUMMARY_PATH,
             hparams=HPARAMS):
    tf.logging.set_verbosity(tf.logging.INFO)
    assert os.path.exists(summary_path), 'Summary directory does not exist...'

    if not os.path.exists(output_path):
        os.mkdir(output_path)

    tf.logging.log(tf.logging.INFO, 'Build model...')
    model = wave_net.WaveNet(hparams)
    inputs = tf.placeholder(dtype=tf.int32, shape=[1, model.receptive_field])
    model.build(inputs)

    saver = tf.train.Saver(
        var_list=tf.get_collection(tf.GraphKeys.TRAINABLE_VARIABLES))

    tf.logging.log(tf.logging.INFO, 'Start session...')
    with tf.Session() as sess:
        ckpt = tf.train.latest_checkpoint(summary_path)
        saver.restore(sess, ckpt)

        num_samples = hparams.seconds_to_generate * 16000
        initial_input = np.random.randint(low=0,
                                          high=hparams.bin_size,
                                          size=[1, model.receptive_field])
        samples = np.zeros([1, num_samples])
        samples[0, 0:model.receptive_field] = initial_input

        start = time.time()

        for i in range(model.receptive_field, num_samples):
            tf.logging.log_every_n(tf.logging.INFO,
                                   'Generated sample %d/%d' % (i, num_samples),
                                   n=100)
            generated = sess.run(
                model.generated,
                feed_dict={inputs: samples[:, (i - model.receptive_field):i]})
            samples[0, i] = generated[0, -1].argmax(axis=-1)

        end = time.time()

    print('Generated %d in %.1f seconds...' %
          (hparams.seconds_to_generate, end - start))
    audio = dequantize(samples, hparams.bin_size).squeeze()
    print('Write file...')
    write_wav(os.path.join(output_path, 'audio.wav'), audio, 16000)
Example #30
def saveAudioBatch(data,
                   path,
                   basename,
                   sr=16000,
                   latents=None,
                   overwrite=False):
    from librosa.util.utils import ParameterError
    # outdata = resizeAudioTensor(data, orig_sr, target_sr)
    # taudio.save(path, outdata, sample_rate=target_sr)

    data = list(data)  #LW it was a map

    # if there are no (or mismatched) latents, zip with zeros so that looping over the enumeration still works
    if latents is not None and len(latents) == len(data):
        zdata = zip(data, latents)
        print(
            "saveAudioBatch: zipping audio with latents (and will write param files)"
        )
    else:
        zdata = zip(data, [0] * len(data))
        print(
            "saveAudioBatch: zipping audio with naughts for latents (and will not write param files)"
        )

    try:
        for i, (audio, params) in enumerate(zdata):
            #for i, audio in enumerate(data): #LW

            if type(audio) != np.ndarray:
                audio = np.array(audio, float)

            out_path = os.path.join(path, f'{basename}_{i}.wav')

            # also get path/file names for parameter pytorch and text files
            param_out_path = os.path.join(path, f'{basename}_{i}.pt')
            txt_param_out_path = os.path.join(path, f'{basename}_{i}.txt')

            if not os.path.exists(out_path) or overwrite:
                write_wav(out_path, audio.astype(float), sr)
                if latents is not None:
                    torch.save(params, param_out_path)
                    np.savetxt(txt_param_out_path, params.cpu().numpy())
            else:
                print(f"saveAudioBatch: File {out_path} exists. Skipping...")
                continue
    except ParameterError as pe:
        print(pe)
Example #31
# -*- coding: utf-8 -*-
from machineLearningHelper import getLearningArrays
from librosa.output import write_wav
import numpy as np

samplerate = 44100

instances, classifications = getLearningArrays(useToySounds=True)

screams = []
notScreams = []

numInstances = len(instances)

for i in range(numInstances):
    if classifications[i]:
        screams.extend(instances[i])
    else:
        notScreams.extend(instances[i])

screams = np.array(screams)
notScreams = np.array(notScreams)

write_wav('./test_sounds/concatenated_screams.wav', screams, 44100, norm=False)
write_wav('./test_sounds/concatenated_not_screams.wav', notScreams, 44100, norm=False)
Example #32
def SaveAudio(fname, mag, phase):
    y = istft(mag*phase, hop_length=C.H, win_length=C.FFT_SIZE)
    write_wav(fname, y, C.SR, norm=True)
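SaveAudio expects a magnitude array and a complex phase array whose element-wise product is the complex STFT. These are typically obtained with librosa.magphase; a minimal sketch of the loading side (the helper name is an assumption, while C.SR, C.FFT_SIZE, and C.H are the same config constants used above):

import numpy as np
import librosa

def LoadAudio(fname):
    # Split the STFT into magnitude and phase so that mag * phase recovers the
    # complex spectrogram that SaveAudio() above turns back into a waveform.
    y, _ = librosa.load(fname, sr=C.SR)
    spec = librosa.stft(y, n_fft=C.FFT_SIZE, hop_length=C.H)
    mag, phase = librosa.magphase(spec)
    return mag.astype(np.float32), phase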