Пример #1
0
def process_row(row):
    """Add noise to one audio file at each requested SNR and write the results.

    Args:
        row: dict with keys 'audio_filepath', 'snrs', 'noise_manifest',
            'out_dir', 'input_manifest' (schema assumed from usage — confirm
            against the caller that builds these rows).

    Reads the module-level ``sample_rate``, ``att_factor`` and ``rng``; the
    original ``global`` declarations were no-ops (the names are only read,
    never assigned) and have been removed.
    """
    audio_file = row['audio_filepath']
    # Load the clean signal once; each SNR iteration perturbs a fresh copy.
    data_orig = AudioSegment.from_file(audio_file, target_sr=sample_rate, offset=0)
    for snr in row['snrs']:
        # Fixed SNR: min == max so NoisePerturbation applies exactly `snr`.
        perturber = NoisePerturbation(
            manifest_path=row['noise_manifest'], min_snr_db=snr, max_snr_db=snr, rng=rng
        )
        out_dir = get_out_dir_name(
            row['out_dir'],
            os.path.splitext(os.path.basename(row['input_manifest']))[0],
            os.path.splitext(os.path.basename(row['noise_manifest']))[0],
            snr,
        )
        os.makedirs(out_dir, exist_ok=True)
        out_f = os.path.join(out_dir, os.path.basename(audio_file))
        if os.path.exists(out_f):
            # Skip files already produced by a previous run.
            continue
        data = copy.deepcopy(data_orig)
        perturber.perturb(data)

        # Peak-normalize to att_factor so the noisy signal does not clip.
        max_level = np.max(np.abs(data.samples))
        norm_factor = att_factor / max_level
        new_samples = norm_factor * data.samples
        sf.write(out_f, new_samples.transpose(), sample_rate)
Пример #2
0
def read_one_audiosegment(manifest,
                          target_sr,
                          rng,
                          tarred_audio=False,
                          audio_dataset=None):
    """Select one audio entry and load it as an AudioSegment.

    Non-tarred: a random record is sampled from the manifest with `rng`.
    Tarred: the next sample is taken from `audio_dataset` and its offset
    and duration are looked up in the manifest.
    """

    def _zero_if_none(value):
        # Missing offset/duration fields default to 0.
        return 0 if value is None else value

    if not tarred_audio:
        record = rng.sample(manifest.data, 1)[0]
        audio_file = record.audio_file
        offset = _zero_if_none(record.offset)
        duration = _zero_if_none(record.duration)
    else:
        if audio_dataset is None:
            raise TypeError("Expected augmentation dataset but got None")
        audio_file, file_id = next(audio_dataset)
        entry = manifest[manifest.mapping[file_id]]
        offset = _zero_if_none(entry.offset)
        duration = _zero_if_none(entry.duration)

    return AudioSegment.from_file(
        audio_file, target_sr=target_sr, offset=offset, duration=duration
    )
Пример #3
0
    def __getitem__(self, index):
        # Fetch one example: either a raw audio segment, or an aligned
        # (audio, mel) pair when precomputed mels are loaded.
        sample = self.data[index]

        if not self.load_precomputed_mel:
            # Read an n_segments-long slice of the file (-1 = whole file).
            features = AudioSegment.segment_from_file(
                sample["audio_filepath"],
                n_segments=self.n_segments
                if self.n_segments is not None else -1,
                trim=self.trim,
            )
            features = torch.tensor(features.samples)
            audio, audio_length = features, torch.tensor(
                features.shape[0]).long()

            return audio, audio_length
        else:
            # Full waveform plus its precomputed mel spectrogram.
            features = self.featurizer.process(sample["audio_filepath"],
                                               trim=self.trim)
            audio, audio_length = features, torch.tensor(
                features.shape[0]).long()

            mel = torch.load(sample["mel_filepath"])
            # Number of mel frames corresponding to n_segments samples.
            frames = math.ceil(self.n_segments / self.hop_length)

            if len(audio) > self.n_segments:
                # Random crop: a frames-wide mel window and the hop-aligned
                # audio span that matches it.
                # NOTE(review): randint upper bound mel.shape[1] - frames - 2
                # goes negative when the mel is barely longer than `frames`
                # (raises ValueError) — confirm inputs always exceed
                # n_segments by at least 2 frames.
                start = random.randint(0, mel.shape[1] - frames - 2)
                mel = mel[:, start:start + frames]
                audio = audio[start * self.hop_length:(start + frames) *
                              self.hop_length]
            else:
                # Too short: right-pad both mel and audio to target size.
                mel = torch.nn.functional.pad(mel, (0, frames - mel.shape[1]))
                audio = torch.nn.functional.pad(
                    audio, (0, self.n_segments - len(audio)))

            # NOTE(review): this branch returns 3 items (audio, length, mel)
            # while the branch above returns 2 — callers must handle both.
            return audio, len(audio), mel
Пример #4
0
    def perturb(self, data):
        """Transcode `data` in place through a randomly chosen telephony codec.

        The signal is peak-normalized, written to a temp wav, piped through
        sox (AMR-NB at a random bitrate mode, or G.711 a-law at 8 kHz), then
        read back at 16 kHz and truncated to the original sample count.

        Args:
            data: audio segment whose `_samples` array is modified in place.

        Raises:
            ValueError: if the selected codec is not supported.  Previously an
                unknown codec crashed later with NameError on `transcoded_f`.
        """
        att_factor = 0.8
        # Peak-normalize so the transcoded signal does not clip.
        max_level = np.max(np.abs(data._samples))
        norm_factor = att_factor / max_level
        norm_samples = norm_factor * data._samples
        orig_f = NamedTemporaryFile(suffix=".wav")
        sf.write(orig_f.name, norm_samples.transpose(), 16000)

        codec_ind = random.randint(0, len(self._codecs) - 1)
        codec = self._codecs[codec_ind]
        if codec == "amr-nb":
            transcoded_f = NamedTemporaryFile(suffix="_amr.wav")
            # AMR-NB bitrate mode 0-7, chosen uniformly at random
            # (identical draw to the old rates-list indexing).
            rate = random.randint(0, 7)
            _ = subprocess.check_output(
                f"sox {orig_f.name} -V0 -C {rate} -t amr-nb - | sox -t amr-nb - -V0 -b 16 -r 16000 {transcoded_f.name}",
                shell=True,
            )
        elif codec == "g711":
            transcoded_f = NamedTemporaryFile(suffix="_g711.wav")
            _ = subprocess.check_output(
                f"sox {orig_f.name} -V0  -r 8000 -c 1 -e a-law {transcoded_f.name}",
                shell=True)
        else:
            raise ValueError(f"Unsupported codec: {codec}")

        new_data = AudioSegment.from_file(transcoded_f.name, target_sr=16000)
        # Keep the original length so downstream shapes are unchanged.
        data._samples = new_data._samples[0:data._samples.shape[0]]
        return
Пример #5
0
def filter(manifest):
    """Remove samples that violate the configured threshold values.

    Args:
        manifest: path to .json manifest
    """
    original_duration = 0
    if args.audio_dir:
        # Total duration of all audio in the directory — baseline for
        # reporting how much data the filters removed.
        pattern = f"{os.path.abspath(args.audio_dir)}/*"
        for audio in glob(pattern):
            try:
                segment = AudioSegment.from_file(audio)
            except Exception as e:
                logging.info(f"Skipping {audio} -- {e}")
            else:
                original_duration += len(segment._samples) / segment._sample_rate

    _apply_filters(
        manifest=manifest,
        manifest_out=manifest.replace(".json", "_filtered.json"),
        max_cer=args.max_cer,
        max_wer=args.max_wer,
        max_edge_cer=args.max_edge_cer,
        max_len_diff_ratio=args.max_len_diff_ratio,
        max_dur=args.max_duration,
        original_duration=original_duration,
    )
Пример #6
0
 def process(self, file_path, offset=0, duration=0, trim=False, orig_sr=None):
     """Load `file_path` as an AudioSegment and return the processed segment.

     The file is resampled to self.sample_rate; offset/duration/trim and
     orig_sr are forwarded to AudioSegment.from_file unchanged.
     """
     load_kwargs = dict(
         target_sr=self.sample_rate,
         int_values=self.int_values,
         offset=offset,
         duration=duration,
         trim=trim,
         orig_sr=orig_sr,
     )
     segment = AudioSegment.from_file(file_path, **load_kwargs)
     return self.process_segment(segment)
Пример #7
0
    def __getitem__(self, index):
        """Return (audio, audio_length) for the element at `index`.

        An n_segments-long clip is read from the file (randomly positioned
        when the audio is longer), then the tail is dropped so the length is
        an exact multiple of self.truncate_to.
        """
        entry = self.collection[index]
        segment = AudioSegment.segment_from_file(entry.audio_file, n_segments=self.n_segments, trim=self.trim,)
        audio = torch.tensor(segment.samples)
        audio_length = torch.tensor(audio.shape[0]).long()

        # Trim so audio_length is a multiple of truncate_to.
        remainder = audio_length % self.truncate_to
        if remainder != 0:
            audio_length = audio_length - remainder.long()
            audio = audio[:audio_length]

        return audio, audio_length
Пример #8
0
def process_audio(in_file: str,
                  wav_file: str = None,
                  cut_prefix: int = 0,
                  sample_rate: int = 16000):
    """Process audio file: .mp3 to .wav conversion and cut a few seconds from the beginning of the audio

    Args:
        in_file: path to the .mp3 or .wav file for processing
        wav_file: path to the output .wav file
        cut_prefix: number of seconds to cut from the beginning of the audio file
        sample_rate: target sampling rate
    """
    try:
        # Resample to sample_rate and skip the first cut_prefix seconds.
        audio = AudioSegment.from_file(in_file,
                                       target_sr=sample_rate,
                                       offset=cut_prefix)
        # NOTE(review): writes the private _samples buffer directly; other
        # call sites use the public .samples attribute — confirm equivalence.
        wav.write(wav_file, data=audio._samples, rate=sample_rate)
    except Exception as e:
        # Best-effort: report and skip files that fail to load or write.
        print(f'{in_file} skipped - {e}')