def process_row(row):
    """Perturb one manifest row's audio with noise at each requested SNR and
    write the peak-attenuated result into a per-SNR output directory.

    Args:
        row: dict with keys 'audio_filepath', 'snrs' (iterable of dB values),
            'noise_manifest', 'out_dir', 'input_manifest'.

    Relies on module-level globals ``sample_rate``, ``att_factor`` and ``rng``.
    Existing output files are skipped, so re-runs are idempotent.
    """
    audio_file = row['audio_filepath']
    global sample_rate
    data_orig = AudioSegment.from_file(audio_file, target_sr=sample_rate, offset=0)
    for snr in row['snrs']:
        # Pin the perturbation to exactly this SNR by making min == max.
        min_snr_db = snr
        max_snr_db = snr
        global att_factor
        perturber = NoisePerturbation(
            manifest_path=row['noise_manifest'],
            min_snr_db=min_snr_db,
            max_snr_db=max_snr_db,
            rng=rng,
        )
        out_dir = get_out_dir_name(
            row['out_dir'],
            os.path.splitext(os.path.basename(row['input_manifest']))[0],
            os.path.splitext(os.path.basename(row['noise_manifest']))[0],
            snr,
        )
        os.makedirs(out_dir, exist_ok=True)
        out_f = os.path.join(out_dir, os.path.basename(audio_file))
        if os.path.exists(out_f):
            continue  # already generated for this SNR
        # Deep-copy so each SNR perturbs the clean original, not a prior result.
        data = copy.deepcopy(data_orig)
        perturber.perturb(data)
        max_level = np.max(np.abs(data.samples))
        # BUGFIX: guard against an all-zero (silent) segment. Previously
        # att_factor / 0.0 produced inf, and the written samples became
        # inf/nan. Leave silent audio unscaled instead.
        norm_factor = att_factor / max_level if max_level > 0 else 1.0
        new_samples = norm_factor * data.samples
        sf.write(out_f, new_samples.transpose(), sample_rate)
def read_one_audiosegment(manifest, target_sr, rng, tarred_audio=False, audio_dataset=None):
    """Select one audio entry and load it as an AudioSegment.

    For tarred audio the next item of ``audio_dataset`` is used and resolved
    through the manifest's file-id mapping; otherwise a random manifest record
    is drawn via ``rng``. Missing offset/duration default to 0.
    """
    if tarred_audio:
        if audio_dataset is None:
            raise TypeError("Expected augmentation dataset but got None")
        audio_file, file_id = next(audio_dataset)
        entry = manifest[manifest.mapping[file_id]]
        offset = entry.offset if entry.offset is not None else 0
        duration = entry.duration if entry.duration is not None else 0
    else:
        record = rng.sample(manifest.data, 1)[0]
        audio_file = record.audio_file
        offset = record.offset if record.offset is not None else 0
        duration = record.duration if record.duration is not None else 0
    return AudioSegment.from_file(audio_file, target_sr=target_sr, offset=offset, duration=duration)
def __getitem__(self, index):
    """Return one training example for the sample at *index*.

    Two modes:
      * ``load_precomputed_mel`` False: load audio (optionally cropped to
        ``n_segments`` samples, optionally trimmed) and return
        ``(audio, audio_length)``.
      * ``load_precomputed_mel`` True: load the full audio plus a precomputed
        mel from ``mel_filepath``, then randomly crop (or zero-pad) both to
        ``n_segments`` audio samples / the matching number of mel frames, and
        return ``(audio, len(audio), mel)``.
    """
    sample = self.data[index]
    if not self.load_precomputed_mel:
        features = AudioSegment.segment_from_file(
            sample["audio_filepath"],
            # -1 signals "load the whole file" when no segment length is set.
            n_segments=self.n_segments if self.n_segments is not None else -1,
            trim=self.trim,
        )
        features = torch.tensor(features.samples)
        audio, audio_length = features, torch.tensor(
            features.shape[0]).long()
        return audio, audio_length
    else:
        features = self.featurizer.process(sample["audio_filepath"], trim=self.trim)
        audio, audio_length = features, torch.tensor(
            features.shape[0]).long()
        mel = torch.load(sample["mel_filepath"])
        # Number of mel frames that cover n_segments audio samples.
        frames = math.ceil(self.n_segments / self.hop_length)
        if len(audio) > self.n_segments:
            # Random synchronized crop of mel and audio.
            # NOTE(review): assumes mel.shape[1] >= frames + 2, otherwise
            # randint's upper bound is negative and raises — confirm upstream
            # duration filtering guarantees this.
            start = random.randint(0, mel.shape[1] - frames - 2)
            mel = mel[:, start:start + frames]
            audio = audio[start * self.hop_length:(start + frames) * self.hop_length]
        else:
            # Clip shorter than target: zero-pad mel frames and audio samples.
            mel = torch.nn.functional.pad(mel, (0, frames - mel.shape[1]))
            audio = torch.nn.functional.pad(
                audio, (0, self.n_segments - len(audio)))
        # NOTE(review): this branch returns a plain int (len(audio)) while the
        # other branch returns a torch tensor length — confirm the collate fn
        # accepts both.
        return audio, len(audio), mel
def perturb(self, data):
    """Simulate telephony-codec degradation on *data* in place.

    Peak-normalizes the samples, writes them to a temporary 16 kHz wav,
    transcodes through a randomly chosen codec from ``self._codecs`` using the
    external ``sox`` binary, and replaces ``data._samples`` with the decoded
    result truncated to the original sample count.

    NOTE(review): requires ``sox`` on PATH. ``shell=True`` is needed here for
    the sox pipe; the interpolated names are NamedTemporaryFile paths, not
    user input, but revisit if these paths ever become untrusted.
    NOTE(review): if a codec other than "amr-nb" or "g711" appears in
    self._codecs, ``transcoded_f`` is never bound and the from_file call below
    raises NameError — confirm the codec list is validated at construction.
    NOTE(review): an all-zero input gives max_level == 0 and a divide-by-zero
    in norm_factor — presumably inputs are real speech; verify.
    """
    att_factor = 0.8  # target peak level before transcoding
    max_level = np.max(np.abs(data._samples))
    norm_factor = att_factor / max_level
    norm_samples = norm_factor * data._samples
    # Temp files are cleaned up when these objects are garbage-collected.
    orig_f = NamedTemporaryFile(suffix=".wav")
    sf.write(orig_f.name, norm_samples.transpose(), 16000)
    codec_ind = random.randint(0, len(self._codecs) - 1)
    if self._codecs[codec_ind] == "amr-nb":
        transcoded_f = NamedTemporaryFile(suffix="_amr.wav")
        rates = list(range(0, 8))  # AMR-NB bitrate modes 0-7
        rate = rates[random.randint(0, len(rates) - 1)]
        # Encode to AMR-NB and immediately decode back to 16-bit 16 kHz wav.
        _ = subprocess.check_output(
            f"sox {orig_f.name} -V0 -C {rate} -t amr-nb - | sox -t amr-nb - -V0 -b 16 -r 16000 {transcoded_f.name}",
            shell=True,
        )
    elif self._codecs[codec_ind] == "g711":
        transcoded_f = NamedTemporaryFile(suffix="_g711.wav")
        # G.711 a-law at 8 kHz mono.
        _ = subprocess.check_output(
            f"sox {orig_f.name} -V0 -r 8000 -c 1 -e a-law {transcoded_f.name}",
            shell=True)
    new_data = AudioSegment.from_file(transcoded_f.name, target_sr=16000)
    # Keep the original sample count — codecs can change the length slightly.
    data._samples = new_data._samples[0:data._samples.shape[0]]
    return
def filter(manifest):
    """Filter out manifest samples that fail the configured thresholds.

    When ``args.audio_dir`` is set, the total duration of all readable audio
    files in it is accumulated first so the filtered output can be compared
    against the original corpus duration.

    Args:
        manifest: path to .json manifest
    """
    original_duration = 0
    if args.audio_dir:
        for audio in glob(f"{os.path.abspath(args.audio_dir)}/*"):
            try:
                # Unreadable/corrupt files are logged and skipped.
                audio_data = AudioSegment.from_file(audio)
                original_duration += len(audio_data._samples) / audio_data._sample_rate
            except Exception as e:
                logging.info(f"Skipping {audio} -- {e}")
    _apply_filters(
        manifest=manifest,
        manifest_out=manifest.replace(".json", "_filtered.json"),
        max_cer=args.max_cer,
        max_wer=args.max_wer,
        max_edge_cer=args.max_edge_cer,
        max_len_diff_ratio=args.max_len_diff_ratio,
        max_dur=args.max_duration,
        original_duration=original_duration,
    )
def process(self, file_path, offset=0, duration=0, trim=False, orig_sr=None):
    """Load an audio file and pass it through ``process_segment``.

    Resampling to ``self.sample_rate`` plus optional offset/duration/trim
    handling is delegated to ``AudioSegment.from_file``.
    """
    segment = AudioSegment.from_file(
        file_path,
        target_sr=self.sample_rate,
        int_values=self.int_values,
        offset=offset,
        duration=duration,
        trim=trim,
        orig_sr=orig_sr,
    )
    return self.process_segment(segment)
def __getitem__(self, index):
    """Return (audio, audio_length) for the element at *index*.

    ``segment_from_file`` picks a random n_segments window for clips longer
    than ``n_segments``; the result is then truncated so its length is an
    exact multiple of ``self.truncate_to``.
    """
    example = self.collection[index]
    segment = AudioSegment.segment_from_file(
        example.audio_file, n_segments=self.n_segments, trim=self.trim,
    )
    audio = torch.tensor(segment.samples)
    audio_length = torch.tensor(audio.shape[0]).long()
    # Drop trailing samples so the length divides evenly by truncate_to.
    remainder = audio_length % self.truncate_to
    if remainder != 0:
        audio_length -= remainder.long()
        audio = audio[:audio_length]
    return audio, audio_length
def process_audio(in_file: str, wav_file: str = None, cut_prefix: int = 0, sample_rate: int = 16000):
    """Convert an input audio file to .wav, dropping a leading prefix.

    Args:
        in_file: path to the .mp3 or .wav file for processing
        wav_file: path to the output .wav file
        cut_prefix: number of seconds to cut from the beginning of the audio file
        sample_rate: target sampling rate
    """
    try:
        # Loading with an offset drops the first cut_prefix seconds.
        segment = AudioSegment.from_file(in_file, target_sr=sample_rate, offset=cut_prefix)
        wav.write(wav_file, data=segment._samples, rate=sample_rate)
    except Exception as e:
        # Best-effort batch processing: report and move on.
        print(f'{in_file} skipped - {e}')