Example #1
def __init__(
    self,
    n_fft=1024,
    hop_length=256,
    win_length=1024,
    n_bins=84,
    sampling_rate=22050,
):
    super().__init__()
    ##############################################
    # FFT Parameters                             #
    ##############################################
    window = torch.hann_window(win_length).float()
    # librosa_cqt_fn is defined elsewhere in the source; it returns a CQT
    # filter bank and the per-filter lengths (lengths is unused here).
    cqt_basis, lengths = librosa_cqt_fn(sampling_rate,
                                        n_bins=n_bins,
                                        filter_scale=0.5)
    cqt_basis = cqt_basis.astype(np.float32)
    cqt_basis = torch.from_numpy(cqt_basis).float()
    # Buffers move with the module across devices but are never trained.
    self.register_buffer("cqt_basis", cqt_basis)
    self.register_buffer("window", window)
    self.n_fft = n_fft
    self.n_bins = n_bins
    self.hop_length = hop_length
    self.win_length = win_length
    self.sampling_rate = sampling_rate
    self.spec_layer = Spectrogram.CQT1992v2(sr=sampling_rate,
                                            n_bins=n_bins,
                                            hop_length=hop_length,
                                            output_format='Magnitude',
                                            pad_mode='constant',
                                            device='cuda:0',
                                            verbose=False,
                                            trainable=False)
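
Only the constructor is shown above; the forward pass is not part of the example. A minimal sketch of what it might look like, assuming the module returns a log-magnitude CQT the way Examples #2 and #4 do:

def forward(self, audio):
    # audio: (batch, samples) waveform tensor
    cqt = self.spec_layer(audio)                    # (batch, n_bins, frames)
    return torch.log(torch.clamp(cqt, min=1e-5))    # log-compress, floor at 1e-5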
Example #2
def main():
    args = parse_args()
    save_type = args.save_type
    spec_layer = Spectrogram.CQT1992v2(sr=22050, n_bins=84, hop_length=256,
                                       pad_mode='constant', device='cuda:0',
                                       verbose=False, trainable=False,
                                       output_format='Magnitude')
    transformedSet = AudioDataset('input_audio.txt', 22050 * 4, sampling_rate=22050, augment=False)
    transformedLoader = DataLoader(transformedSet, batch_size=1)
    transformedVoc = []
    with open('input_audio.txt', 'r') as f:
        lines = f.readlines()
    lines = [s.strip() for s in lines]                      # remove newline characters
    lines = [track.replace('.wav', '') for track in lines]  # drop the .wav extension
    print(lines)
    if len(lines) != len(transformedLoader):
        print('Mismatch between the WAVs found and the entries in input_audio.txt')
        return

    for i, x_t in enumerate(transformedLoader):
        x_t = x_t.cuda()
        s_t = spec_layer(x_t).detach()
        s_t = torch.log(torch.clamp(s_t, min=1e-5))  # log-compress, floor at 1e-5
        transformedVoc.append(s_t)

    if save_type == 'torch':
        print('Saving WAVs as torch pt')
        for x in range(len(transformedVoc)):
            torch.save(transformedVoc[x], lines[x] + '.pt')
    if save_type == 'png':
        print('Saving WAVs as image via matplotlib')
        for x in range(len(transformedVoc)):
            save_spec_images(transformedVoc[x], lines[x])
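
save_spec_images is called above but not defined in the snippet. A plausible sketch with matplotlib; the name and signature come from the call site, everything else is an assumption:

import matplotlib.pyplot as plt

def save_spec_images(spec, name):
    # spec: (1, n_bins, frames) log-magnitude CQT tensor on the GPU
    spec = spec.squeeze(0).cpu().numpy()
    plt.figure(figsize=(10, 4))
    plt.imshow(spec, origin='lower', aspect='auto')
    plt.colorbar()
    plt.tight_layout()
    plt.savefig(name + '.png')
    plt.close()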
Example #3
def __init__(self):
    super(Model, self).__init__()
    # network_factor, n_bins, bins_per_octave, HOP_LENGTH and device are
    # module-level globals in the source.
    f_kernal = 128 // network_factor  # frequency-axis kernel size
    # Despite the name, this layer computes a CQT, not an STFT.
    self.STFT_layer = Spectrogram.CQT1992v2(sr=44100, fmin=27.5, n_bins=n_bins,
                                            bins_per_octave=bins_per_octave,
                                            pad_mode='constant',
                                            hop_length=HOP_LENGTH,
                                            center=True, device=device)
    self.freq_cnn1 = torch.nn.Conv2d(1, 4, (f_kernal, 3), stride=(8, 1), padding=1)
    self.freq_cnn2 = torch.nn.Conv2d(4, 8, (f_kernal, 3), stride=(8, 1), padding=1)
    shape = self.shape_inference(f_kernal)
    self.bilstm = torch.nn.LSTM(shape * 8, shape * 8, batch_first=True, bidirectional=True)
    self.pitch_classifier = torch.nn.Linear(shape * 8 * 2, 88)  # one output per piano key
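
shape_inference is used above but not shown. A sketch of one common way to implement it, assuming it pushes a dummy frame through the two convolutions to find the surviving frequency dimension (n_bins is the same module-level global):

def shape_inference(self, f_kernal):
    with torch.no_grad():
        dummy = torch.zeros(1, 1, n_bins, 3)    # (batch, channel, freq, time)
        out = self.freq_cnn2(self.freq_cnn1(dummy))
    return out.shape[2]                         # frequency bins left after striding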
Example #4
def main():
    args = parse_args()
    file_name = args.input_file
    save_path = args.save_path
    spec_layer = Spectrogram.CQT1992v2(sr=22050,
                                       n_bins=84,
                                       hop_length=256,
                                       pad_mode='constant',
                                       device='cuda:0',
                                       verbose=False,
                                       trainable=False,
                                       output_format='Magnitude')
    transformedSet = AudioConversionDataset(file_name,
                                            22050 * 4,
                                            sampling_rate=22050,
                                            augment=False)
    transformedLoader = DataLoader(transformedSet, batch_size=1)
    with open(file_name, 'r') as f:
        lines = f.readlines()
    lines = [s.strip() for s in lines]                      # remove newline characters
    lines = [track.replace('.wav', '') for track in lines]  # drop the .wav extension
    lines = [track.split("/")[-1] for track in lines]       # keep only the file name
    if len(lines) != len(transformedLoader):
        print('Mismatch between the WAVs found and the entries in ' + file_name)
        return

    normalisation_dict = {}  # per-file min/max, written out below
    for i, x in tqdm(enumerate(transformedLoader),
                     ascii=True,
                     desc='Making spectrogram representations'):
        x_t = x[0]
        fname = os.path.basename(x[1][0]).replace('.wav', '')
        x_t = x_t.cuda()
        s_t = spec_layer(x_t).detach()
        s_t = torch.log(torch.clamp(s_t, min=1e-5))  # log-compress, floor at 1e-5
        if args.save_type == 'pt':
            torch.save(s_t, save_path + fname + '.pt')
        else:
            save_image(s_t, save_path + fname + '.png', normalize=True)
            min_value = torch.min(s_t).item()
            max_value = torch.max(s_t).item()
            normalisation_dict[fname] = {"min": min_value, "max": max_value}
    with open(save_path + 'normalisation_values.json', 'w') as outfile:
        json.dump(normalisation_dict, outfile, indent=4)
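
Reading the artefacts back is not shown in the example. A minimal sketch under the same naming assumptions: the .pt files hold log-magnitude CQT tensors, and normalisation_values.json records each file's min/max so a [0, 1]-normalised PNG can be mapped back to log magnitudes:

s_t = torch.load(save_path + fname + '.pt')    # log-magnitude CQT
with open(save_path + 'normalisation_values.json') as f:
    norm = json.load(f)
lo, hi = norm[fname]['min'], norm[fname]['max']
# A [0, 1]-normalised image img can then be restored with:
# restored = img * (hi - lo) + lo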
Example #5
def __init__(
    self,
    hop_length=256,
    n_bins=84,
    sampling_rate=22050,
):
    super().__init__()
    ##############################################
    # FFT Parameters                             #
    ##############################################
    self.n_bins = n_bins
    self.hop_length = hop_length
    self.sampling_rate = sampling_rate
    self.spec_layer = Spectrogram.CQT1992v2(sr=sampling_rate,
                                            n_bins=n_bins,
                                            hop_length=hop_length,
                                            output_format='Magnitude',
                                            pad_mode='constant',
                                            device='cuda:0',
                                            verbose=False,
                                            trainable=False)
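
A usage sketch (not part of the example), assuming CUDA is available; the class name here is hypothetical since the snippet shows only __init__:

module = CQTExtractor()                          # hypothetical name for the class above
audio = torch.randn(8, 22050 * 4).cuda()         # batch of 4-second clips at 22050 Hz
cqt = module.spec_layer(audio)                   # (8, 84, frames), magnitude
log_cqt = torch.log(torch.clamp(cqt, min=1e-5))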
Example #6
        for n_mels in n_mels_ls:
            if n_mels < n_fft:  # skip settings with more mel bins than FFT points
                layer = Spectrogram.MelSpectrogram(n_fft=n_fft,
                                                   n_mels=n_mels,
                                                   hop_length=512,
                                                   verbose=False,
                                                   device=device)
                start = time.time()
                for i in tqdm.tqdm(dataset):
                    i = i.to(device)
                    layer(i)
                result[f'Mel-{n_fft}-n_bins{n_mels}'] = time.time() - start

    # CQT

    # 84*r bins at 12*r bins per octave always spans 7 octaves, so each step
    # raises the frequency resolution without changing the frequency range.
    for r in range(1, 11):
        layer = Spectrogram.CQT1992v2(sr=44100,
                                      n_bins=84 * r,
                                      bins_per_octave=12 * r,
                                      hop_length=512,
                                      verbose=False,
                                      device=device)
        start = time.time()
        for i in tqdm.tqdm(dataset):
            i = i.to(device)
            layer(i)
        result[f'CQT-r={r}'] = time.time() - start

    with open(Path(__file__).parent / 'Pytorch_result', 'wb') as f:
        pickle.dump(result, f)
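
Loading the benchmark results back, assuming the pickle written above:

import pickle
from pathlib import Path

with open(Path(__file__).parent / 'Pytorch_result', 'rb') as f:
    result = pickle.load(f)
for name, seconds in sorted(result.items()):
    print(f'{name}: {seconds:.3f}s')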
Example #7
elif args.device == "GPU":
    device = "cuda:0"
    print("using GPU")
elif args.device == "librosa":
    print("using librosa")

print(Path(__file__).parent / './y_list.npy')

y_list = np.load(Path(__file__).parent / './y_list.npy')

if args.device in ["CPU", "GPU"]:
    y_torch = torch.tensor(y_list, device=device).float()

    spec_layer = Spectrogram.CQT1992v2(sr=44100,
                                       n_bins=84,
                                       bins_per_octave=24,
                                       fmin=55,
                                       device=device)
    timing = []
    for e in range(20):
        t_start = time.time()
        spec = spec_layer(y_torch[:1000])
        spec = spec_layer(y_torch[1000:])
        time_used = time.time() - t_start
        #     print(time_used)
        timing.append(time_used)

    print("mean = ", np.mean(timing))
    print("std = ", np.std(timing))

    data = pd.DataFrame(timing, columns=['t_avg'])
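
The fragment ends after building the DataFrame. A plausible continuation, assuming the timings are written out for cross-device comparison (the file name is an assumption):

data.to_csv(f'cqt_timing_{args.device}.csv', index=False)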