def _transform_audio(self):
    waveform, rate = load(os.path.join(data_dir, self.file))
    # Downsample by a factor of 100; integer division keeps the rate an int,
    # which torchaudio's Resample expects.
    new_rate = rate // 100
    resampled = Resample(rate, new_rate)(waveform)
    self.stft = self._get_stft(resampled, new_rate)
    self.mfcc = self._get_mfcc(resampled, new_rate)

def __init__(self, input_sr, output_sr=None, melspec_buckets=80, hop_length=256,
             n_fft=1024, cut_silence=False):
    """
    The parameters are by default set up to do well on a 16kHz signal.
    A different frequency may require a different hop_length and n_fft
    (e.g. doubling the frequency --> doubling hop_length and doubling n_fft).
    """
    self.cut_silence = cut_silence
    self.sr = input_sr
    self.new_sr = output_sr
    self.hop_length = hop_length
    self.n_fft = n_fft
    self.mel_buckets = melspec_buckets
    # This needs heavy tweaking, depending on the data.
    self.vad = VoiceActivityDetection(sample_rate=input_sr)
    self.mu_encode = MuLawEncoding()
    self.mu_decode = MuLawDecoding()
    self.meter = pyln.Meter(input_sr)
    self.final_sr = input_sr
    if output_sr is not None and output_sr != input_sr:
        self.resample = Resample(orig_freq=input_sr, new_freq=output_sr)
        self.final_sr = output_sr
    else:
        # No-op fallback: callers can apply self.resample unconditionally.
        self.resample = lambda x: x

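# A minimal standalone sketch of the no-op fallback pattern above (the function
# name make_resampler is hypothetical; the enclosing class is not shown here):
# when no distinct output rate is requested, the "resampler" is an identity
# function, so downstream code never branches on whether resampling is needed.
import torch
from torchaudio.transforms import Resample

def make_resampler(input_sr, output_sr=None):
    if output_sr is not None and output_sr != input_sr:
        return Resample(orig_freq=input_sr, new_freq=output_sr), output_sr
    return (lambda x: x), input_sr

resample, final_sr = make_resampler(16000, 8000)
wave = torch.randn(1, 16000)           # one second of noise at 16 kHz
print(resample(wave).shape, final_sr)  # torch.Size([1, 8000]) 8000
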
def tokenize(self, audio_path):
    audio_input, sampling_rate = sf.read(audio_path)
    # Resample to the 16 kHz the tokenizer expects; new_freq is spelled out
    # here rather than relying on Resample's default of 16000.
    resampler = Resample(orig_freq=sampling_rate, new_freq=16000)
    audio_input = resampler(th.tensor(audio_input, dtype=th.float32))
    input_values = self.tokenizer(audio_input, return_tensors="pt").input_values
    return input_values

def from_path(self, path: str, return_two=False, return_mfcc=False):
    logger.info(f"Processing {path} to tensor of spectrogram")
    # This comes in -1, 1 normalized
    waveform, inp_freq = torchaudio.load(path)
    waveform = waveform.mean(dim=0, keepdim=True)  # collapse to mono
    waveform = Resample(inp_freq, self.sample_rate)(waveform)
    n_samples = waveform.shape[1]
    min_samples = self.min_samples * 2 if return_two else self.min_samples
    if n_samples < min_samples:
        raise AudioTooShortError(
            f"Input must be at least {self.seconds * 2 if return_two else self.seconds} seconds long"
        )
    first_idx = torch.randint(0, n_samples - min_samples, (1,))
    first_waveform = waveform[:, first_idx : (first_idx + self.min_samples)]  # type: ignore
    first_sgram = self.forward(first_waveform)
    if return_mfcc:
        first_mfcc = self.get_mfcc(first_waveform)
    if return_two:
        second_idx = torch.randint(first_idx.item() + self.min_samples, n_samples - self.min_samples, (1,))  # type: ignore
        second_waveform = waveform[:, second_idx : (second_idx + self.min_samples)]
        second_sgram = self.forward(second_waveform)
        if return_mfcc:
            second_mfcc = self.get_mfcc(second_waveform)
            return first_sgram, first_mfcc, second_sgram, second_mfcc
        return first_sgram, second_sgram
    if return_mfcc:
        return first_sgram, first_mfcc
    return first_sgram

class AudioFile:
    def __init__(self, filename: str, transcription: str,
                 pronunciation_dictionary: PronunciationDictionary,
                 fileobj: Optional[BinaryIO] = None,
                 wavobj: Optional[Tuple[Tensor, int]] = None,
                 offset: int = 0):
        self.filename = filename
        self.pronunciation_dictionary = pronunciation_dictionary
        self.offset = offset
        self.load_audio(fileobj, wavobj)
        self.transcription, self.words = pronunciation_dictionary.spell_sentence(
            transcription, return_words=True)
        self.tensor_transcription = torch.tensor(
            [self.pronunciation_dictionary.phonemic_mapping[x] for x in self.transcription])

    def load_audio(self, fileobj: Optional[BinaryIO] = None, wavobj=None):
        if fileobj is not None:
            self.wav, sr = torchaudio.load(fileobj)
        elif wavobj is not None:
            self.wav, sr = wavobj
        else:
            self.wav, sr = torchaudio.load(self.filename)
        # Collapse multi-channel audio to mono and normalize to 16 kHz.
        if self.wav.shape[0] != 1:
            self.wav = torch.mean(self.wav, dim=0).unsqueeze(0)
        if sr != 16000:
            self.wav = Resample(sr, 16000)(self.wav)

    def move_to_device(self, device: str):
        self.wav = self.wav.to(device)
        self.tensor_transcription = self.tensor_transcription.to(device)

def load(filename, sample_rate):
    y, source_rate = torchaudio.load(filename)
    if source_rate != sample_rate:
        resample = Resample(source_rate, sample_rate)
        y = resample(y)
    return y

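# A minimal usage sketch for the helper above; "speech.wav" is a placeholder path.
y = load("speech.wav", 16000)
print(y.shape)  # [channels, num_samples], resampled to 16 kHz if needed
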
def __init__(self, sr: int, n_steps: float, bins_per_octave: int = 12, p: float = 1.0):
    super().__init__(p=p)
    self.sr = sr
    self.n_steps = n_steps
    self.bins_per_octave = bins_per_octave
    # Default-constructed placeholders, presumably reconfigured when the
    # transform is applied with a concrete shift factor.
    self.resample = Resample()
    self.crop = Crop(0)

def predict(self, audio_path, features_path):
    audio_input, sampling_rate = sf.read(audio_path)
    # Resample to the model's 16 kHz input rate; new_freq is spelled out here
    # rather than relying on Resample's default of 16000.
    resampler = Resample(orig_freq=sampling_rate, new_freq=16000)
    audio_input = resampler(th.tensor(audio_input, dtype=th.float32))
    input_values = self.tokenizer(
        audio_input, return_tensors="pt").input_values.to(self.device)
    hidden_state = self.raw_model(input_values).last_hidden_state
    hidden_state = hidden_state.flatten()  # this shouldn't be here
    hidden_state = hidden_state.cpu().detach().numpy()
    np.save(features_path, hidden_state)

def preprocess(mp3):
    sample_rate = 16000
    root_dir = Path("/home/nlpmaster/ssd-1t/corpus/TaiBible/PKL")
    new_dir = Path("/home/nlpmaster/ssd-1t/corpus/TaiBible/PKL_wav")
    y, sr = torchaudio.load(str(root_dir.joinpath(mp3)))
    resample = Resample(orig_freq=sr, new_freq=sample_rate)
    resampled_y = resample(y)
    wavfile = new_dir.joinpath(mp3)
    wavfile.parent.mkdir(exist_ok=True)
    torchaudio.save(str(wavfile), resampled_y, sample_rate=sample_rate)
    return str(wavfile)

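# Hypothetical driver for preprocess() above; the relative mp3 paths are
# placeholders, not files from the original corpus.
mp3_files = ["Genesis/ch01.mp3", "Genesis/ch02.mp3"]
wav_paths = [preprocess(mp3) for mp3 in mp3_files]
print(wav_paths)
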
def main():
    wav, sr = sf.read(args.recording)
    target_sr = 16000
    if USE_TORCHAUDIO_RESAMPLING:
        resampling_transform = Resample(orig_freq=sr, new_freq=target_sr)
        inputs = resampling_transform(torch.Tensor([wav])).squeeze()
    else:
        # Fallback resampler (scipy.signal.resample-style, judging by the
        # num= keyword): resample by total output sample count.
        inputs = resample(wav, num=int(len(wav) * target_sr / sr))
    print(wavenet.transcribe(inputs))

def make_train_dataset(dataset_dir, speakers=None):
    """Make the training dataset for MVAE from the VCC2018 dataset.

    Args:
        dataset_dir (str): Path of the VCC2018 dataset.
        speakers (List[str]): Speakers to be used.

    Returns:
        List[Tuple[torch.Tensor, torch.Tensor]]: List of spectrogram and speaker label.
    """
    training_dir = os.path.join(dataset_dir, 'vcc2018_training')
    evaluation_dir = os.path.join(dataset_dir, 'vcc2018_evaluation')
    if speakers is None:
        speakers = [
            speaker for speaker in os.listdir(training_dir)
            if speaker.startswith('VCC2')
            and os.path.isdir(os.path.join(training_dir, speaker))
        ]
    resample = Resample(22050, 16000)
    create_spectrogram = Spectrogram(n_fft=N_FFT, hop_length=HOP_LEN)
    dataset = []
    with torch.no_grad():
        for c, speaker in enumerate(speakers):
            speaker_dir = os.path.join(training_dir, speaker)
            wav_files = [
                os.path.join(speaker_dir, wav_file)
                for wav_file in os.listdir(speaker_dir)
                if os.path.splitext(wav_file)[1] == '.wav'
            ]
            speaker_dir = os.path.join(evaluation_dir, speaker)
            wav_files.extend([
                os.path.join(speaker_dir, wav_file)
                for wav_file in os.listdir(speaker_dir)
                if os.path.splitext(wav_file)[1] == '.wav'
            ])
            spectrogram = []
            for wav_file in wav_files:
                sound, _ = torchaudio.load(wav_file)
                sound = resample(sound)
                spectrogram.append(create_spectrogram(sound).squeeze(0))
            spectrogram = torch.cat(spectrogram, dim=1)
            # Slice the concatenated spectrogram into windows of DATA_LEN frames
            # with 75% overlap, and attach a one-hot speaker label to each.
            hop_length = DATA_LEN // 4
            for n in range((spectrogram.size(1) - DATA_LEN) // hop_length + 1):
                start = n * hop_length
                data = spectrogram[:, start:start + DATA_LEN]
                label = torch.zeros(len(speakers))
                label[c] = 1
                dataset.append((data, label))
    return dataset

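# Example call, assuming the VCC2018 corpus is extracted locally; the path is a
# placeholder (VCC2SF1 and VCC2SM1 are real VCC2018 speaker IDs).
dataset = make_train_dataset('/data/vcc2018', speakers=['VCC2SF1', 'VCC2SM1'])
data, label = dataset[0]
print(data.shape, label)  # (n_fft // 2 + 1, DATA_LEN) spectrogram and a one-hot label
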
def _load(self, path, mfcc=True):
    try:
        waveform, ori_sr = torchaudio.load(path)
        waveform = waveform.mean(0, keepdim=True)  # collapse to mono
    except RuntimeError:
        raise Exception(f"Error loading {path}")
    _resample = Resample(ori_sr, self.sr)
    audio = _resample(waveform)
    if mfcc:
        audio = self._mfcc(audio)
    return audio

def main(meta_dir: str, pretrained_path: str, model_name: str = 'generator_mb'):
    # load model
    gen = load_model(model_name, pretrained_path).cuda()
    print(gen)
    print(f'Numb. Parameters : {sum(p.numel() for p in gen.parameters() if p.requires_grad)}')

    # make mel func
    mel_func = LogMelSpectrogram(
        settings.SAMPLE_RATE, settings.MEL_SIZE, settings.WIN_LENGTH, settings.WIN_LENGTH,
        settings.HOP_LENGTH, float(settings.MEL_MIN), float(settings.MEL_MAX)
    ).cuda()
    pqmf_func = PQMF().cuda()

    # get datasets
    _, valid_loader = get_datasets(
        meta_dir, batch_size=1, num_workers=1, crop_length=0, random_seed=1234
    )

    resample_func = Resample(22050, 16000).cuda()

    # score
    score_list = []
    for wav, _ in tqdm(valid_loader):
        wav = wav.cuda()

        # to mel
        mel = mel_func(wav)

        with torch.no_grad():
            pred_subbands = gen(mel)
            pred = pqmf_func.synthesis(pred_subbands)

        pred, wav = match_dim(pred, wav)

        # resample to 16 kHz, which wideband PESQ expects
        pred = resample_func(pred)
        wav = resample_func(wav)

        # to cpu
        wav = wav.cpu().numpy().squeeze()
        pred = pred.detach().cpu().numpy().squeeze()

        item_score = pesq(16000, wav, pred.clip(-1., 1.), 'wb')
        score_list.append(item_score)

    print(
        f'mean : {np.mean(score_list)}, std : {np.std(score_list)}, '
        f'min : {np.min(score_list)}, max : {np.max(score_list)}'
    )

def __init__(self, data_dir, meta_path, pre_load=True):
    self.data_dir = data_dir
    self.pre_load = pre_load
    with open(meta_path, 'r') as f:
        self.data = json.load(f)
    self.class_dict = self.data['labels']
    self.class_num = len(self.class_dict)
    self.meta_data = self.data['meta_data']
    # Probe the first file for its sample rate; all files are assumed to share it.
    _, origin_sr = torchaudio.load(
        path_join(self.data_dir, self.meta_data[0]['path']))
    self.resampler = Resample(origin_sr, SAMPLE_RATE)
    if self.pre_load:
        self.wavs = self._load_all()

def __init__(self, augs_list, cap=3, resample=True, osr=44100, nsr=16000, sec=5, stretch_p=0.5):
    self.augs_list = augs_list
    self.cap = cap
    self.resampler = Resample(orig_freq=osr, new_freq=nsr)
    self.sampler = SameSize(sec * nsr)
    self.wav_stretcher = Stretcher(p=stretch_p)

def spectrogram_from_audio(audio: Tensor, sample_rate: int, resample_rate: int,
                           mel_filters: int, seconds: int) -> Tensor:
    resampled_audio = Resample(orig_freq=sample_rate, new_freq=resample_rate)(audio)
    mono_audio = mean(resampled_audio, dim=0, keepdim=True)
    mel_transform = MelSpectrogram(sample_rate=resample_rate, n_mels=mel_filters)
    spectrogram = mel_transform(mono_audio)
    log_spectrogram = AmplitudeToDB()(spectrogram)
    # Pad or crop along the time axis to a fixed number of frames.
    original_length = log_spectrogram.shape[2]
    length = seconds * (resample_rate // mel_transform.hop_length)
    return pad(log_spectrogram, (0, length - original_length)) if original_length < length \
        else log_spectrogram[:, :, :length]

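# A minimal usage sketch; "clip.wav" and the parameter values are illustrative.
# With MelSpectrogram's default hop_length of 200 at 16 kHz, the output is
# padded or cropped to 5 * (16000 // 200) = 400 frames.
import torchaudio

audio, sr = torchaudio.load("clip.wav")
spec = spectrogram_from_audio(audio, sample_rate=sr, resample_rate=16000,
                              mel_filters=64, seconds=5)
print(spec.shape)  # torch.Size([1, 64, 400])
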
def preprocess_as_spec(path_wav: Path, id: ItemIdJSSS, dir_dataset: Path,
                       new_sr: Optional[int] = None) -> None:
    """Transform JSSS corpus contents into spectrogram Tensor.

    Before this preprocessing, corpus contents should be deployed.
    """
    waveform, _sr_orig = load_wav(path_wav)
    if new_sr is not None:
        waveform = Resample(_sr_orig, new_sr)(waveform)
    # :: [1, Length] -> [Length,]
    waveform: Tensor = waveform[0, :]
    # defaults: hop_length = win_length // 2, window_fn = torch.hann_window, power = 2
    spec: Tensor = Spectrogram(254)(waveform)
    path_spec = get_dataset_spec_path(dir_dataset, id)
    path_spec.parent.mkdir(parents=True, exist_ok=True)
    save(spec, path_spec)

def resample_wav(
    input_path: Path,
    output_path: Path,
    stereo_to_mono: bool = True,
    sampling_rate: int = 22050
):
    waveform, original_sampling_rate = torchaudio.load(input_path)
    waveform = Resample(original_sampling_rate, sampling_rate)(waveform)
    if stereo_to_mono and len(waveform.shape) == 2:
        # waveform.shape == (channels, time) - we have to trim to 1 channel
        # note: we could also take the mean of the 2 channels, but this is not
        # guaranteed to work:
        # https://dsp.stackexchange.com/questions/2484/converting-from-stereo-to-mono-by-averaging
        waveform = waveform[0].unsqueeze(0)
    torchaudio.save(str(output_path), waveform, sampling_rate)

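# Example invocation; the paths are placeholders and the output directory must exist.
from pathlib import Path

resample_wav(Path("raw/clip.wav"), Path("resampled/clip_22k.wav"))
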
def __init__(self, sample_rate, n_fft, top_db, max_perc):
    super().__init__()
    self.time_stretch = TimeStretch(hop_length=None, n_freq=n_fft // 2 + 1)
    self.stft = Spectrogram(n_fft=n_fft, power=None)
    self.com_norm = ComplexNorm(power=2.)
    self.fm = FrequencyMasking(50)
    self.tm = TimeMasking(50)
    self.mel_specgram = MelSpectrogram(sample_rate, n_fft=n_fft, f_max=8000)
    self.AtoDB = AmplitudeToDB(top_db=top_db)
    self.max_perc = max_perc
    self.sample_rate = sample_rate
    # One resampler per speed factor; int() keeps new_freq an integer, as
    # torchaudio's Resample expects.
    self.resamples = [
        Resample(sample_rate, int(sample_rate * factor))
        for factor in (0.6, 0.7, 0.8, 0.9, 1.0, 1.1, 1.2, 1.3, 1.4)
    ]

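# A hedged sketch of how such a resampler bank is typically used for speed
# perturbation (the selection logic below is an assumption, not part of the
# snippet above): resampling to factor * sr and playing back at sr changes
# the apparent speed and pitch.
import random
import torch
from torchaudio.transforms import Resample

sample_rate = 16000
resamples = [Resample(sample_rate, int(sample_rate * f))
             for f in (0.6, 0.7, 0.8, 0.9, 1.0, 1.1, 1.2, 1.3, 1.4)]
wave = torch.randn(1, sample_rate)
perturbed = random.choice(resamples)(wave)
print(perturbed.shape)  # time axis scaled by the chosen factor
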
def preprocess_as_wave(path_wav: Path, id: ItemIdJSSS, dir_dataset: Path,
                       new_sr: Optional[int] = None) -> None:
    """Transform JSSS corpus contents into waveform Tensor.

    Before this preprocessing, corpus contents should be deployed.
    """
    waveform, _sr_orig = load_wav(path_wav)
    if new_sr is not None:
        waveform = Resample(_sr_orig, new_sr)(waveform)
    # :: [1, Length] -> [Length,]
    waveform: Tensor = waveform[0, :]
    path_wave = get_dataset_wave_path(dir_dataset, id)
    path_wave.parent.mkdir(parents=True, exist_ok=True)
    save(waveform, path_wave)

def postprocess(feats, curr_sample_rate, normalize=True):
    if args.sample_rate != curr_sample_rate:
        feats = Resample(curr_sample_rate, args.sample_rate)(feats)
    if feats.dim() == 2:
        feats = feats.mean(-1)  # collapse multi-channel input to mono
    assert feats.dim() == 1, feats.dim()
    if normalize:
        with torch.no_grad():
            feats = F.layer_norm(feats, feats.shape)
    return feats

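# A sketch of a caller for postprocess() above, assuming args.sample_rate == 16000
# and a mono input file; "utterance.wav" and the soundfile import are assumptions
# mirroring the other snippets.
import soundfile as sf
import torch

wav, sr = sf.read("utterance.wav")
feats = postprocess(torch.from_numpy(wav).float(), sr)
print(feats.shape, float(feats.mean()), float(feats.std()))  # ~zero mean, ~unit std
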
def _load(self, line, mfcc=True, wav_name=None):
    if wav_name:
        waveform, ori_sr = torchaudio.load(wav_name)
        waveform = waveform.mean(0, keepdim=True)
    else:
        try:
            waveform, ori_sr = torchaudio.load(
                line.audio_fn, frame_offset=line.start_frame,
                num_frames=line.nframes)
            waveform = waveform.mean(0, keepdim=True)
        except RuntimeError:
            raise Exception(f"Error loading {line.audio_fn}")
    _resample = Resample(ori_sr, self.sr)
    audio = _resample(waveform)
    if mfcc:
        audio = self._mfcc(audio)
    return audio

def text_to_instance(self, data: Tuple[str, str]) -> Instance:  # type: ignore
    # pylint: disable=arguments-differ
    wav_file, text = data
    if callable(wav_file):
        y = wav_file()
    else:
        y, orig_freq = torchaudio.load(wav_file)
        if orig_freq != self._sample_rate:
            # Resample to the reader's configured rate (the original hard-coded
            # new_freq=16000, which disagreed with the check above).
            resample = Resample(orig_freq=orig_freq, new_freq=self._sample_rate)
            y = resample(y)
    source_array = torchaudio.compliance.kaldi.fbank(
        y, num_mel_bins=80, use_energy=True).detach()
    source_array, src_len = pad_and_stack(source_array,
                                          self.input_stack_rate,
                                          self.model_stack_rate,
                                          pad_mode=self._pad_mode)
    source_length_field = LabelField(src_len, skip_indexing=True)
    source_field = TensorField(source_array)
    if text is not None:
        target = self._target_tokenizer.tokenize(text)
        if self._target_add_start_end_token:
            target.insert(0, Token(START_SYMBOL))
            target.append(Token(END_SYMBOL))
        target_field = TextField(target, self._target_token_indexers)
        return Instance({
            "source_features": source_field,
            "target_tokens": target_field,
            "source_lengths": source_length_field
        })
    else:
        return Instance({
            "source_features": source_field,
            "source_lengths": source_length_field
        })

def __init__(self, root: str, training: bool = True, frequency: int = 16000,
             max_length: int = 280, transform=None, return_length: bool = False):
    self.data = []
    self.return_length = return_length
    if transform is None:
        self.transform = MFCC(frequency)
    else:
        self.transform = transform
    self.training = training
    self.filenames = []
    self.max_length = max_length
    # Source files are 16 kHz; only build a resampler if another rate is requested.
    if frequency != 16000:
        self.resampler = Resample(orig_freq=16000, new_freq=frequency)
    if training:
        df_labels = pd.read_csv(root + "train_label.csv")
        root = root + "Train/"
        self.labels = []
    else:
        root = root + "Public_Test/"
    for filename in os.listdir(root):
        if filename.endswith(".wav"):
            self.filenames.append(filename)
            input_audio, sample_rate = load_wav(root + filename)
            if frequency != 16000:
                input_audio = self.resampler(input_audio)
            self.data.append(input_audio)
            if training:
                self.labels.append(
                    df_labels.loc[df_labels["File"] == filename, "Label"].values.item())

def _audio_transform(self):
    """
    This function contains example transforms using both PyTorchVideo and
    TorchAudio in the same Callable.
    """
    args = self.args
    n_fft = int(
        float(args.audio_resampled_rate) / 1000 * args.audio_mel_window_size
    )
    hop_length = int(
        float(args.audio_resampled_rate) / 1000 * args.audio_mel_step_size
    )
    eps = 1e-10
    return ApplyTransformToKey(
        key="audio",
        transform=Compose(
            [
                Resample(
                    orig_freq=args.audio_raw_sample_rate,
                    new_freq=args.audio_resampled_rate,
                ),
                MelSpectrogram(
                    sample_rate=args.audio_resampled_rate,
                    n_fft=n_fft,
                    hop_length=hop_length,
                    n_mels=args.audio_num_mels,
                    center=False,
                ),
                Lambda(lambda x: x.clamp(min=eps)),
                Lambda(torch.log),
                UniformTemporalSubsample(args.audio_mel_num_subsample),
                Lambda(lambda x: x.transpose(1, 0)),  # (F, T) -> (T, F)
                Lambda(
                    lambda x: x.view(1, x.size(0), 1, x.size(1))
                ),  # (T, F) -> (1, T, 1, F)
                Normalize((args.audio_logmel_mean,), (args.audio_logmel_std,)),
            ]
        ),
    )

def download_vctk(destination, tmp_dir=None, device="cpu"):
    """Download dataset and perform resample to 16000 Hz.

    Arguments
    ---------
    destination : str
        Place to put final zipped dataset.
    tmp_dir : str
        Location to store temporary files. Will use `tempfile` if not provided.
    device : str
        Passed directly to pytorch's ``.to()`` method. Used for resampling.
    """
    dataset_name = "noisy-vctk-16k"
    if tmp_dir is None:
        tmp_dir = tempfile.gettempdir()
    final_dir = os.path.join(tmp_dir, dataset_name)

    if not os.path.isdir(tmp_dir):
        os.mkdir(tmp_dir)
    if not os.path.isdir(final_dir):
        os.mkdir(final_dir)

    prefix = "https://datashare.is.ed.ac.uk/bitstream/handle/10283/2791/"
    noisy_vctk_urls = [
        prefix + "clean_testset_wav.zip",
        prefix + "noisy_testset_wav.zip",
        prefix + "testset_txt.zip",
        prefix + "clean_trainset_28spk_wav.zip",
        prefix + "noisy_trainset_28spk_wav.zip",
        prefix + "trainset_28spk_txt.zip",
    ]

    zip_files = []
    for url in noisy_vctk_urls:
        filename = os.path.join(tmp_dir, url.split("/")[-1])
        zip_files.append(filename)
        if not os.path.isfile(filename):
            logger.info("Downloading " + url)
            with urllib.request.urlopen(url) as response:
                with open(filename, "wb") as tmp_file:
                    logger.info("... to " + tmp_file.name)
                    shutil.copyfileobj(response, tmp_file)

    # Unzip
    for zip_file in zip_files:
        logger.info("Unzipping " + zip_file)
        shutil.unpack_archive(zip_file, tmp_dir, "zip")
        os.remove(zip_file)

    # Move transcripts to final dir
    shutil.move(os.path.join(tmp_dir, "testset_txt"), final_dir)
    shutil.move(os.path.join(tmp_dir, "trainset_28spk_txt"), final_dir)

    # Downsample
    dirs = [
        "noisy_testset_wav",
        "clean_testset_wav",
        "noisy_trainset_28spk_wav",
        "clean_trainset_28spk_wav",
    ]

    downsampler = Resample(orig_freq=48000, new_freq=16000)

    for directory in dirs:
        logger.info("Resampling " + directory)
        dirname = os.path.join(tmp_dir, directory)

        # Make directory to store downsampled files
        dirname_16k = os.path.join(final_dir, directory + "_16k")
        if not os.path.isdir(dirname_16k):
            os.mkdir(dirname_16k)

        # Load files and downsample
        for filename in get_all_files(dirname, match_and=[".wav"]):
            signal, rate = torchaudio.load(filename)
            downsampled_signal = downsampler(signal.view(1, -1).to(device))

            # Save downsampled file
            torchaudio.save(
                os.path.join(dirname_16k, filename[-12:]),
                downsampled_signal[0].cpu(),
                sample_rate=16000,
                channels_first=False,
            )

            # Remove old file
            os.remove(filename)

        # Remove old directory
        os.rmdir(dirname)

    logger.info("Zipping " + final_dir)
    final_zip = shutil.make_archive(
        base_name=final_dir,
        format="zip",
        root_dir=os.path.dirname(final_dir),
        base_dir=os.path.basename(final_dir),
    )

    logger.info(f"Moving {final_zip} to {destination}")
    shutil.move(final_zip, os.path.join(destination, dataset_name + ".zip"))

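# Example call; the destination directory is a placeholder and must already
# exist. Note this downloads and re-zips several GB of audio.
download_vctk("/data/corpora")
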
async def __resample_file(self, array, original_sr, target_sr):
    resampling_transform = Resample(orig_freq=original_sr, new_freq=target_sr)
    sample = resampling_transform(torch.Tensor([array])).squeeze()
    return sample

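# The coroutine above just wraps a synchronous transform; a standalone
# equivalent of its body (the rates and input below are illustrative):
import torch
from torchaudio.transforms import Resample

array = [0.0, 0.1, -0.1, 0.2]
sample = Resample(orig_freq=8000, new_freq=16000)(torch.Tensor([array])).squeeze()
print(sample.shape)  # roughly len(array) * 16000 / 8000 samples
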
def decode_base64(encoded):
    # Name recovered from the call in pipeline(); the snippet began mid-function.
    return base64.urlsafe_b64decode(encoded[21:])  # skip the leading prefix (21 chars)


def create_tmp_file(binary_obj):
    with open('templates/tmp.webm', 'wb') as f:
        f.write(binary_obj)


def convert_tmp_file():
    cmd = 'ffmpeg -y -i templates/tmp.webm -vn templates/tmp.wav'
    subprocess.call(cmd.split())


def load_as_tensor(transform):
    wf, sampling_rate = torchaudio.load('templates/tmp.wav')
    wf = transform(wf)
    return wf


resample = Resample(48000, 16000)


def pipeline(encoded):
    create_tmp_file(decode_base64(encoded))
    convert_tmp_file()
    return load_as_tensor(resample)

# ============================================================

class Model:
    def __init__(self, classifier_config_path):
        clf_cfg = Hparam(classifier_config_path)
        cpc_cfg = Hparam(clf_cfg.model.cpc_config_path)
        self.device = clf_cfg.train.device
        speakers_bank = pickle.load(open('templates/mean_speakers_vecs_dict.pkl', 'rb'))
        self.speakers, self.mean_vecs = (list(speakers_bank.keys()),
                                         torch.stack(list(speakers_bank.values()), dim=0))

def process_utterance(in_dir, out_dir, spker, basename):
    wav_path = os.path.join(in_dir, 'wav48', spker, '{}.wav'.format(basename))
    tg_path = os.path.join(out_dir, 'TextGrid', spker, '{}.TextGrid'.format(basename))
    if not os.path.exists(tg_path):
        return None

    # Get alignments
    textgrid = tgt.io.read_textgrid(tg_path)
    phone, duration, start, end = get_alignment(
        textgrid.get_tier_by_name('phones'))
    text = '{' + '}{'.join(phone) + '}'  # '{A}{B}{$}{C}', $ represents silent phones
    text = text.replace('{$}', ' ')      # '{A}{B} {C}'
    text = text.replace('}{', ' ')       # '{A B} {C}'
    if start >= end:
        return None

    # Read and trim wav files
    sr, wav = read(wav_path)
    wav = torch.tensor(wav.astype(np.float32))
    if sr != hp.sampling_rate:
        wav = Resample(orig_freq=sr, new_freq=hp.sampling_rate)(wav)
    wav = wav[int(hp.sampling_rate * start):int(hp.sampling_rate * end)]

    # Compute fundamental frequency
    f0, _ = pw.dio(wav.numpy().astype(np.float64), hp.sampling_rate,
                   frame_period=hp.hop_length / hp.sampling_rate * 1000)
    f0 = f0[:sum(duration)]

    # Compute mel-scale spectrogram and energy
    mel_spectrogram, energy = Audio.tools.get_mel_from_wav(wav)
    mel_spectrogram = mel_spectrogram.cpu().numpy().astype(np.float32)[:, :sum(duration)]
    energy = energy.numpy().astype(np.float32)[:sum(duration)]
    if mel_spectrogram.shape[1] >= hp.max_seq_len:
        return None

    # if the shapes disagree, check the get_alignment function
    try:
        assert f0.shape[0] == energy.shape[0] == mel_spectrogram.shape[1]
    except AssertionError:
        print("duration problem: {}".format(wav_path))
        return None

    # Save alignment
    ali_filename = '{}-ali-{}.npy'.format(hp.dataset, basename)
    np.save(os.path.join(out_dir, 'alignment', ali_filename), duration, allow_pickle=False)

    # Save fundamental frequency
    f0_filename = '{}-f0-{}.npy'.format(hp.dataset, basename)
    np.save(os.path.join(out_dir, 'f0', f0_filename), f0, allow_pickle=False)

    # Save energy
    energy_filename = '{}-energy-{}.npy'.format(hp.dataset, basename)
    np.save(os.path.join(out_dir, 'energy', energy_filename), energy, allow_pickle=False)

    # Save spectrogram
    mel_filename = '{}-mel-{}.npy'.format(hp.dataset, basename)
    np.save(os.path.join(out_dir, 'mel', mel_filename), mel_spectrogram.T, allow_pickle=False)

    try:
        return ('|'.join([basename, text]), max(f0),
                min([f for f in f0 if f != 0]), max(energy), min(energy),
                mel_spectrogram.shape[1])
    except Exception:
        print(basename)
        return None

import numpy as np
import librosa
import math
import torch
import torchaudio
from torchaudio.transforms import Spectrogram, MelSpectrogram, AmplitudeToDB, ComplexNorm, Resample
from torchaudio.functional import lowpass_biquad, highpass_biquad
from tqdm import tqdm
import matplotlib.pyplot as plt
from pathlib import Path
from multiprocessing import Pool, cpu_count


def cov_pk(info):
    path, resample, rate = info
    waveform = torch.load(path)
    waveform = resample(waveform)
    ebird_code = path.parent.name
    torch.save(waveform,
               f'../../dataset/tensor_audio/{ebird_code}/re{rate}-{path.stem}.tensor')


NUM_WORKERS = cpu_count()
sr = 32_000
for i in [0.8, 0.9, 1.1, 1.2]:
    # int() keeps the target rate integral, as torchaudio's Resample expects.
    resample = Resample(sr, int(sr * i))
    for directory in tqdm(Path('../../dataset/tensor_audio').iterdir()):
        file_paths = list(directory.iterdir())
        # Build one (path, resample, rate) tuple per file: the original passed a
        # single bare 3-tuple, which Pool.map would have iterated element-wise.
        with Pool(5) as p:
            p.map(cov_pk, [(fp, resample, i) for fp in file_paths])